CUDA produces incorrect matrix multiplication output

Tags: cuda, nvidia, matrix-multiplication, gpu

What I am trying to do is multiply matrix A and matrix B, and then get the index of the maximum value in each column of the product matrix. Unfortunately, only the first 128*128 values of the matrix multiplication are correct, while the others are just garbage. I do not quite understand how this works. I request you to kindly guide me with this.

#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>

#define blockD 32
const int wA = 128;
const int hA = 4096;    
const int wB = 4096;
const int hB = wA;

int main(void){

    void MatrixMultiplication(float *, float *, float *, float *);

    int size_A = wA * hA * sizeof(float);
    int size_B = wB * hB * sizeof(float);
    int size_C = wB * hA * sizeof(float);
    int size_max = 2 * wB * sizeof(float);
    float *M, *N, *P, *C;   

    // allocate memory on the CPU
    M = (float*)malloc(size_A);
    N = (float*)malloc(size_B);
    P = (float*)malloc(size_max);
    C = (float*)malloc(size_C);

    // initialize the matrices
    for (int y=0; y < hA; y++) {
        for (int x=0; x < wA; x++){
            M[y*wA + x] = 32; //x + y*wA; 
       }
    }

    for (int y=0; y<hB; y++) {
        for (int x=0; x<wB; x++){
            N[y*wB + x] = 21; //x + y*wB; 
       }
    }


    MatrixMultiplication(M, N, P, C);

    //Write
    FILE *f1;
    int i,j;
    f1 = fopen("C.txt","w");
    for(i = hA - 2; i < hA; i++){
        for(j = 0; j < wB; j++){
            fprintf(f1, "%d\t", int(C[i*wB + j]));
        }
        fprintf(f1, "\n");
    }
    fclose(f1);

    // free the memory allocated on the CPU
    free( M );
    free( N );
    free( P ); 
    free( C );
    cudaDeviceReset();
    return 0;
}


__device__ void MaxFunction(float* Pd, float* max)
{
 int x = (threadIdx.x + blockIdx.x * blockDim.x);  
 int y = (threadIdx.y + blockIdx.y * blockDim.y); 

 int k = 0;

 int temp = 0; int temp_idx = 0;
 for (k = 0; k < wB; ++k) {
            if(Pd[x*wB + k] > temp){
                temp = Pd[x*wB + k];
                temp_idx = x*wB + k;
            }
  }
  max[y*2 + 0] = temp;
  max[y*2 + 1] = temp_idx;
}


__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
  // declare cache in the shared memory
  __shared__ float Mds[blockD][blockD];
  __shared__ float Nds[blockD][blockD];

  float Pvalue = 0;
  // Loop over the Md and Nd block dimension required to compute the Pd element
  for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x); 
                            m < ((wA * blockD * blockIdx.y)+wA-1); 
                                        m += blockD, n += (blockD*hB)){

    // collaboratively loading of Md and Nd blocks into shared memory    
    Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
    Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
    __syncthreads();

    // keep track of the running sum    
    for (int k = 0; k < blockD; k++)
      Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
    __syncthreads();
  }

  // write back to the global memory
  int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
  Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
  __syncthreads();

  MaxFunction(Pd, max);

}

void MatrixMultiplication(float *M, float *N, float *P, float *C) {

    int size_A = wA * hA * sizeof(float);
    int size_B = wB * hB * sizeof(float);
    int size_C = wB * hA * sizeof(float);
    int size_max = 2 * wB * sizeof(float);
    float *Md, *Nd, *Pd, *max; 

    // allocate memory on the GPU
    cudaMalloc((void**)&Md, size_A);
    cudaMalloc((void**)&Nd, size_B);
    cudaMalloc((void**)&Pd, size_C);
    cudaMalloc((void**)&max, size_max);

    // transfer M and N to device memory
    cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);

    // kernel invocation code
    dim3 dimBlock(blockD, blockD);
    dim3 dimGrid(wA/blockD, hB/blockD);

    //Execute Kernel
    MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);

    // transfer P from device    
    cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
    cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);

    // free the memory allocated on the GPU
    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
    cudaFree(max);
}
It seems you have more than one problem in your code. One of them is this:

dim3 dimGrid(wA/blockD, hB/blockD);
when you should have this:

dim3 dimGrid(wB/blockD, hA/blockD);
Ultimately, you need one thread in the grid for each output point. Your formulation gives a grid of 4 blocks by 4 blocks (wA/blockD = hB/blockD = 128/32 = 4), while you need a grid of 128 blocks by 128 blocks (wB/blockD = hA/blockD = 4096/32 = 128).

The other problem I found with your code is these lines in the kernel:

int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
These do not index properly through the output array. Rather than try to sort it out using your scheme, I used the following instead:

Pd[(threadIdx.x + (blockIdx.x * blockDim.x)) + ((threadIdx.y + (blockIdx.y * blockDim.y))*(gridDim.x*blockDim.x))] = Pvalue;
When I make those two changes to your code, I get what I believe to be correct results throughout the array. It took about 32 seconds on my machine to run it. (Note that I have not tried to fix your original max-finding code -- see below for a better approach.)
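For reference, here is a minimal sketch of how the two fixes fit together, using the same names and constants as the question's code (and leaving the kernel's max parameter and MaxFunction call untouched, as noted above):

// host side: launch one 32x32 thread block per 32x32 tile of the hA x wB output
dim3 dimBlock(blockD, blockD);
dim3 dimGrid(wB/blockD, hA/blockD);  // 4096/32 = 128 blocks in x and in y
MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, max);

// kernel side: write-back indexed by global column and row;
// the row pitch gridDim.x*blockDim.x equals wB with the grid above
Pd[(threadIdx.x + (blockIdx.x * blockDim.x)) +
   ((threadIdx.y + (blockIdx.y * blockDim.y)) * (gridDim.x * blockDim.x))] = Pvalue;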

Based on your previous question, you seem to be concerned about speed. If you want to do fast matrix multiplication, you should use CUBLAS. The following code shows how to use CUBLAS to multiply two ordinary C-style matrices (which don't have to be square). I've also included a column-max-finding kernel that should be fast when the number of columns is large (say, over about 500 columns; there are 4096 columns in this example). For small numbers of columns, there may be faster ways to perform this function, but a small number of columns also suggests that the overall problem size may be small, so speed (of this piece of code) won't really be an issue.

Here's the code:

#include <stdio.h>
#include <cublas_v2.h>
#define VERBOSE 1
#define nTPB 64
#define ROW_A 4
#define COL_A 4
#define ROW_B COL_A
#define COL_B 4
#define ROW_C ROW_A
#define COL_C COL_B
#define SIZ_A (ROW_A*COL_A)
#define SIZ_B (ROW_B*COL_B)
#define SIZ_C (ROW_C*COL_C)



// error check macros
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
    do { \
        cublasStatus_t __err = fn; \
        if (__err != CUBLAS_STATUS_SUCCESS) { \
            fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
                (int)(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__global__ void col_max(float *mat, float *max, unsigned int *midx, unsigned int rows, unsigned int cols){
  int idx = threadIdx.x + blockDim.x*blockIdx.x;
  if (idx < cols){
    float tempmax = mat[idx];
    unsigned int tempmidx = 0;
    for (int i = 1; i< rows; i++)
      if (mat[idx + (i*cols)] > tempmax){
        tempmax = mat[idx + (i*cols)];
        tempmidx = i;}
    max[idx] = tempmax;
    midx[idx] = tempmidx;
  }
}

int main(){

  float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C, *h_max, *d_max;
  unsigned int *h_idx, *d_idx;

  h_A = (float *)malloc(SIZ_A*sizeof(float));
  if (h_A==0) {printf("malloc fail\n"); return -1;}
  h_B = (float *)malloc(SIZ_B*sizeof(float));
  if (h_B==0) {printf("malloc fail\n"); return -1;}
  h_C = (float *)malloc(SIZ_C*sizeof(float));
  if (h_C==0) {printf("malloc fail\n"); return -1;}
  h_max = (float *)malloc(COL_C*sizeof(float));
  if (h_max==0) {printf("malloc fail\n"); return -1;}
  h_idx = (unsigned int*)malloc(COL_C*sizeof(unsigned int));

  if (h_idx==0) {printf("malloc fail\n"); return -1;}

  cudaMalloc((void **)&d_A, SIZ_A*sizeof(float));
  cudaMalloc((void **)&d_B, SIZ_B*sizeof(float));
  cudaMalloc((void **)&d_C, SIZ_C*sizeof(float));
  cudaMalloc((void **)&d_max, COL_C*sizeof(float));
  cudaMalloc((void **)&d_idx, COL_C*sizeof(unsigned int));
  cudaCheckErrors("cuda malloc fail");

  // initialize data
  for (int i=0; i< SIZ_A; i++) h_A[i] = (float)(i+1);
  for (int i=0; i< SIZ_B; i++) h_B[i] = (float)(i+2);

  cudaMemcpy(d_A, h_A, SIZ_A*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, h_B, SIZ_B*sizeof(float), cudaMemcpyHostToDevice);
  cudaCheckErrors("cuda memcpy 1 fail");
  const float alpha = 1.0f;
  const float beta  = 0.0f;
  cublasHandle_t handle;
  cublasCheckErrors(cublasCreate(&handle));
  // C = A*B
  // due to cublas expecting column-major storage, parameters
  // are scrambled
  cublasCheckErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, COL_B, ROW_A, COL_A, &alpha, d_B, COL_B, d_A, COL_A, &beta, d_C, COL_C));
  cudaMemcpy(h_C, d_C, SIZ_C*sizeof(float), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cuda memcpy 2 fail");
  col_max<<<(COL_C + nTPB - 1)/nTPB, nTPB>>>(d_C, d_max, d_idx, ROW_C, COL_C);
  cudaCheckErrors("kernel launch fail");
  cudaMemcpy(h_max, d_max, COL_C*sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(h_idx, d_idx, COL_C*sizeof(unsigned int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cuda memcpy 3 fail/kernel fail");

  if (VERBOSE){
    printf("A: \n");
    for (int i=0; i< ROW_A; i++){
      for (int j=0; j< COL_A; j++)
        printf("%7.5G", h_A[j+(i*COL_A)]);
      printf("\n");}
    printf("B: \n");
    for (int i=0; i< ROW_B; i++){
      for (int j=0; j< COL_B; j++)
        printf("%7.5G", h_B[j+(i*COL_B)]);
      printf("\n");}
    printf("C = A*B: \n");
    for (int i=0; i< ROW_C; i++){
      for (int j=0; j< COL_C; j++)
        printf("%7.5G", h_C[j+(i*COL_C)]);
      printf("\n");}
    printf("COLUMN MAX:\n");
    for (int i=0; i< COL_C; i++)
      printf("%7.5G", h_max[i]);
    printf("\nCOLUMN MAX IDX:\n");
    for (int i=0; i< COL_C; i++)
      printf("%7d", h_idx[i]);
  }
  printf("\n finished!\n");
  return 0;
}
Here's a sample output:

$ cuda-memcheck ./t221
========= CUDA-MEMCHECK
A:
      1      2      3      4
      5      6      7      8
      9     10     11     12
     13     14     15     16
B:
      2      3      4      5
      6      7      8      9
     10     11     12     13
     14     15     16     17
C = A*B:
    100    110    120    130
    228    254    280    306
    356    398    440    482
    484    542    600    658
COLUMN MAX:
    484    542    600    658
COLUMN MAX IDX:
      3      3      3      3
 finished!
========= ERROR SUMMARY: 0 errors
$
When I extend my code to handle the same sizes you indicated (A = 4096x128, B = 128x4096), it takes about 1 second on my machine. So it's much faster than your code. However, when I take your kernel and comment out your call to MaxFunction, it also takes only about 1 second to compute the matrix multiplication result. So if you wanted to keep your matrix multiplication code (i.e. not use CUBLAS), you could break the code into two kernels, using your multiply routine in the first kernel and my max-finding routine (col_max) in the second, and probably also get a pretty fast result.
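A minimal sketch of that two-kernel structure, assuming the question's MatrixMulKernel with the max parameter and MaxFunction call removed, the col_max kernel and cudaCheckErrors macro from the listing above, and the question's size constants (Pd holds the hA x wB product; d_max and d_idx hold wB elements each):

// kernel 1: shared-memory matrix multiply only (MaxFunction call removed)
dim3 dimBlock(blockD, blockD);
dim3 dimGrid(wB/blockD, hA/blockD);
MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd);
cudaCheckErrors("multiply kernel fail");

// kernel 2: one thread per column scans that column for its max value and row index
col_max<<<(wB + nTPB - 1)/nTPB, nTPB>>>(Pd, d_max, d_idx, hA, wB);
cudaCheckErrors("col_max kernel fail");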


As @talonmies pointed out, if you are running on a Windows machine, be sure you are aware of the ramifications of the Windows TDR mechanism (search for it in the upper-right search box if needed).

This is exactly the same code and question as in your previous question. Please do not re-post the same question.

I agree that it is the same code. But I could not find the answer.

That is not an excuse for posting a duplicate question. The key to getting help is to edit the existing question to make it easier to answer. Right now, your code seems to have two separate problems - the matrix multiplication and the reduction. Pick one problem. Improve the code - for example, I don't see any CUDA API error checking. Are you even sure the code actually runs to completion? Use the tools provided - the debugger, cuda-memcheck. Improve the question with what you find - this is not a free debugging service where we do the work for you. Help us to help you...

It is worth pointing out that the matrix multiplication code posted in the original question actually works correctly. I suspect it is being run on a slow device and is triggering the display driver watchdog timer. There is really no question here, but thanks for posting a reasonable answer anyway...

I have now edited my answer with my fixes so that the OP's code posted in this question generates (what I believe to be) correct matrix multiplication results. I am quite convinced that the OP's code in this question does not generate correct matrix multiplication results.

The matrix multiplication code works at every size from 128 to 4096 that I tried, as long as the matrices are square (wA = wB = hB) and round multiples of the block size (so 32). It is easy to verify - every entry should be wA*32*21. This comes up again and again: the SDK matrix multiplication code gets misused, and then questions/complaints get posted about why it doesn't work...

Thank you both very much. The CUBLAS solution takes about 842 ms. When I compare it with Matlab: without GPU arrays, Matlab takes ~900 ms; with gpuArrays, it takes ~700 ms. But with the code above, when I increase nTPB to 128, the timing is ~680 ms. Note: matrix sizes -> A(4000,128) and B(128,19800). GPU: nVidia GeForce 410M.
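As a side note, the verification described in the comments is a one-liner on the host. A minimal sketch using the question's C array and constants (this check is illustrative, not part of the original post):

// with M filled with 32 and N filled with 21, every product entry should be wA*32*21
const float expected = (float)(wA * 32 * 21);  // 128 * 32 * 21 = 86016
for (int i = 0; i < hA * wB; i++)
    if (C[i] != expected) { printf("mismatch at %d: %f\n", i, C[i]); break; }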