首页 技术 正文
技术 2022年11月14日
0 收藏 534 点赞 4,208 浏览 1811 个字

本方法采用简单的单线程计算每组行和列乘加运算

代码如下:

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda_runtime.h>__global__ void matrixMulKernel(float *C, float *A, float *B, int width, int height){
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int ty = blockIdx.y * blockDim.y + threadIdx.y;
if(tx >= width || ty >= height)
return; float sum = ;
for(int i=; i<width; ++i){
sum += A[ty * width + i] * B[i * width + tx];
} C[ty * width + tx] = sum;
}void constantInit(float *data, int size, float val){
for (int i = ; i < size; ++i){
data[i] = val;
}
}void matrixMul(){
unsigned int width = ;
unsigned int height = ;
unsigned int size = width * height * sizeof(float);
float *h_A = (float*)malloc(size);
float *h_B = (float*)malloc(size);
float *h_C = (float*)malloc(size);
// Initialize host memory
const float valB = 0.01f;
constantInit(h_A, width*height, 1.0f);
constantInit(h_B, width*height, valB); float *d_A, *d_B, *d_C;
cudaMalloc((void**)&d_A, size);
cudaMalloc((void**)&d_B, size);
cudaMalloc((void**)&d_C, size); //copy host memory to device
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); //config dims
dim3 block(, );
dim3 grid(width / block.x, height / block.y); // Excute the kernel
matrixMulKernel<<<grid, block>>>(d_C, d_A, d_B, width, height); // Copy the memory from device to host
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); printf("Checking computed result for correctness: ");
bool correct = true;
// test relative error by the formula
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
double eps = .e- ; // machine zero for (int i = ; i < width*height; i++){
double abs_err = fabs(h_C[i] - (width * valB));
double dot_length = width;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err/abs_val/dot_length ;
if (rel_err > eps)
{
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], (float)(width*height), eps);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); // Free
free(h_A);
free(h_B);
free(h_C);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);}int main(){
matrixMul();
}
相关推荐
python开发_常用的python模块及安装方法
adodb:我们领导推荐的数据库连接组件bsddb3:BerkeleyDB的连接组件Cheetah-1.0:我比较喜欢这个版本的cheeta…
日期:2022-11-24 点赞:878 阅读:9,111
Educational Codeforces Round 11 C. Hard Process 二分
C. Hard Process题目连接:http://www.codeforces.com/contest/660/problem/CDes…
日期:2022-11-24 点赞:807 阅读:5,584
下载Ubuntn 17.04 内核源代码
zengkefu@server1:/usr/src$ uname -aLinux server1 4.10.0-19-generic #21…
日期:2022-11-24 点赞:569 阅读:6,431
可用Active Desktop Calendar V7.86 注册码序列号
可用Active Desktop Calendar V7.86 注册码序列号Name: www.greendown.cn Code: &nb…
日期:2022-11-24 点赞:733 阅读:6,203
Android调用系统相机、自定义相机、处理大图片
Android调用系统相机和自定义相机实例本博文主要是介绍了android上使用相机进行拍照并显示的两种方式,并且由于涉及到要把拍到的照片显…
日期:2022-11-24 点赞:512 阅读:7,838
Struts的使用
一、Struts2的获取  Struts的官方网站为:http://struts.apache.org/  下载完Struts2的jar包,…
日期:2022-11-24 点赞:671 阅读:4,922