cuda编程-矩阵乘法（1）

开发者自豪技术 2022年11月14日

0 收藏 534 点赞 4,208 浏览 1811 个字

本方法采用简单的单线程计算每组行和列乘加运算

代码如下：

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda_runtime.h>__global__ void matrixMulKernel(float *C, float *A, float *B, int width, int height){
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int ty = blockIdx.y * blockDim.y + threadIdx.y;
  if(tx >= width || ty >= height)
    return;  float sum = ;
  for(int i=; i<width; ++i){
    sum += A[ty * width + i] * B[i * width + tx];
  }  C[ty * width + tx] = sum;
}void constantInit(float *data, int size, float val){
    for (int i = ; i < size; ++i){
        data[i] = val;
    }
}void matrixMul(){
  unsigned int width = ;
  unsigned int height = ;
  unsigned int size = width * height * sizeof(float);
  float *h_A = (float*)malloc(size);
  float *h_B = (float*)malloc(size);
  float *h_C = (float*)malloc(size);
  // Initialize host memory
  const float valB = 0.01f;
  constantInit(h_A, width*height, 1.0f);
  constantInit(h_B, width*height, valB);  float *d_A, *d_B, *d_C;
  cudaMalloc((void**)&d_A, size);
  cudaMalloc((void**)&d_B, size);
  cudaMalloc((void**)&d_C, size);  //copy host memory to device
  cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);  //config dims
  dim3 block(, );
  dim3 grid(width / block.x, height / block.y);  // Excute the kernel
  matrixMulKernel<<<grid, block>>>(d_C, d_A, d_B, width, height);  // Copy the memory from device to host
  cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);  printf("Checking computed result for correctness: ");
  bool correct = true;
  // test relative error by the formula
  //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
  double eps = .e- ; // machine zero  for (int i = ; i < width*height; i++){
      double abs_err = fabs(h_C[i] - (width * valB));
      double dot_length = width;
      double abs_val = fabs(h_C[i]);
      double rel_err = abs_err/abs_val/dot_length ;
      if (rel_err > eps)
      {
          printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], (float)(width*height), eps);
          correct = false;
      }
  }
  printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");  // Free
  free(h_A);
  free(h_B);
  free(h_C);
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);}int main(){
  matrixMul();
}