CUDA 矩阵乘法输出错误的结果

我试图做的是将矩阵 A 和矩阵 B 相乘，然后从乘积矩阵中得到每列最大值的索引。但不幸的是，只有矩阵乘法的前 128*128 个值是正确的，而其他值只是垃圾。我不太明白这是怎么回事，请求各位指导。代码如下：
#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>
#define blockD 32
const int wA = 128;
const int hA = 4096;
const int wB = 4096;
const int hB = wA;
main(void){
void MatrixMultiplication(float *, float *, float *, float *);
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *M, *N, *P, *C;
// allocate memory on the CPU
M = (float*)malloc(size_A);
N = (float*)malloc(size_B);
P = (float*)malloc(size_max);
C = (float*)malloc(size_C);
// initialize the matrices
for (int y=0; y < hA; y++) {
for (int x=0; x < wA; x++){
M[y*wA + x] = 32; //x + y*wA;
}
}
for (int y=0; y<hB; y++) {
for (int x=0; x<wB; x++){
N[y*wB + x] = 21; //x + y*wB;
}
}
MatrixMultiplication(M, N, P, C);
//Write
FILE *f1;
int i,j;
f1 = fopen("C.txt","w");
for(i = hA - 2 ; i < hA; i ++){
for(j = 0; j < wB; j++){
fprintf(f1,"%d\t",int(C[i*wB + j]));
}
fprintf(f1,"\n");
}
fclose(f1);
// free the memory allocated on the CPU
free( M );
free( N );
free( P );
free( C );
cudaDeviceReset();
return 0;
}
__device__ void MaxFunction(float* Pd, float* max)
{
int x = (threadIdx.x + blockIdx.x * blockDim.x);
int y = (threadIdx.y + blockIdx.y * blockDim.y);
int k = 0;
int temp = 0; int temp_idx = 0;
for (k = 0; k < wB; ++k) {
if(Pd[x*wB + k] > temp){
temp = Pd[x*wB + k];
temp_idx = x*wB + k;
}
}
max[y*2 + 0] = temp;
max[y*2 + 1] = temp_idx;
}
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
// declare cache in the shared memory
__shared__ float Mds[blockD][blockD];
__shared__ float Nds[blockD][blockD];
float Pvalue = 0;
// Loop over the Md and Nd block dimension required to compute the Pd element
for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x);
m < ((wA * blockD * blockIdx.y)+wA-1);
m += blockD, n += (blockD*hB)){
// collaboratively loading of Md and Nd blocks into shared memory
Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
__syncthreads();
// keep track of the running sum
for (int k = 0; k < blockD; k++)
Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
__syncthreads();
}
// write back to the global memory
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
__syncthreads();
MaxFunction(Pd, max);
}
void MatrixMultiplication(float *M, float *N, float *P, float *C) {
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *Md, *Nd, *Pd, *max;
// allocate memory on the GPU
cudaMalloc((void**)&Md, size_A);
cudaMalloc((void**)&Nd, size_B);
cudaMalloc((void**)&Pd, size_C);
cudaMalloc((void**)&max, size_max);
// transfer M and N to device memory
cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);
// kernel invocation code
dim3 dimBlock(blockD, blockD);
dim3 dimGrid(wA/blockD, hB/blockD);
//Execute Kernel
MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);
// transfer P from device
cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);
// free the memory allocated on the GPU
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
cudaFree(max);
}
（此处原页面重复了一份被机器翻译弄乱的上述代码清单，已省略；完整代码请见上方代码清单。）

您似乎有不止一个问题。其中一个问题是：
dim3 dimGrid(wA/blockD, hB/blockD);
你应该有这个:
dim3 dimGrid(wB/blockD, hA/blockD);
最终,你需要在网格中为每个输出点设置一个线程。你的公式是4块乘以4块的网格,而你需要128块乘以128块的网格
我发现您的代码存在的另一个问题是内核中的以下行:
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
它们没有通过输出数组正确建立索引。我没有尝试使用您的方案对其进行排序,而是使用以下方法:
Pd[(threadIdx.x + (blockIdx.x * blockDim.x)) + ((threadIdx.y + (blockIdx.y * blockDim.y))*(gridDim.x*blockDim.x))] = Pvalue;
当我对您的代码进行上述两项更改时,我在整个阵列中得到了我认为是正确的结果。在我的机器上运行它花费了大约32秒。(注意,我没有尝试修复您最初的max finding代码——请参阅下文以获得更好的方法。)
基于上一个问题，您似乎很关心速度。如果您想进行快速矩阵乘法，您应该使用 CUBLAS。下面的代码演示了如何使用 cublas 来乘法两个普通的 C 风格矩阵（它们不必是方阵）。我还包括了一个按列求最大值的内核，当列数较大时（例如超过 500 列左右；示例中有 4096 列）它相当高效。对于少量列，可能有更快的方法来执行此功能，但少量列也表明总体问题规模可能较小，因此这段代码较慢这一点将不是真正的问题。
代码如下:
#include <stdio.h>
#include <cublas_v2.h>
#define VERBOSE 1
#define nTPB 64
#define ROW_A 4
#define COL_A 4
#define ROW_B COL_A
#define COL_B 4
#define ROW_C ROW_A
#define COL_C COL_B
#define SIZ_A (ROW_A*COL_A)
#define SIZ_B (ROW_B*COL_B)
#define SIZ_C (ROW_C*COL_C)
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void col_max(float *mat, float *max, unsigned int *midx, unsigned int rows, unsigned int cols){
int idx = threadIdx.x + blockDim.x*blockIdx.x;
if (idx < cols){
float tempmax = mat[idx];
unsigned int tempmidx = 0;
for (int i = 1; i< rows; i++)
if (mat[idx + (i*cols)] > tempmax){
tempmax = mat[idx + (i*cols)];
tempmidx = i;}
max[idx] = tempmax;
midx[idx] = tempmidx;
}
}
int main(){
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C, *h_max, *d_max;
unsigned int *h_idx, *d_idx;
h_A = (float *)malloc(SIZ_A*sizeof(float));
if (h_A==0) {printf("malloc fail\n"); return -1;}
h_B = (float *)malloc(SIZ_B*sizeof(float));
if (h_B==0) {printf("malloc fail\n"); return -1;}
h_C = (float *)malloc(SIZ_C*sizeof(float));
if (h_C==0) {printf("malloc fail\n"); return -1;}
h_max = (float *)malloc(COL_C*sizeof(float));
if (h_max==0) {printf("malloc fail\n"); return -1;}
h_idx = (unsigned int*)malloc(COL_C*sizeof(unsigned int));
if (h_idx==0) {printf("malloc fail\n"); return -1;}
cudaMalloc((void **)&d_A, SIZ_A*sizeof(float));
cudaMalloc((void **)&d_B, SIZ_B*sizeof(float));
cudaMalloc((void **)&d_C, SIZ_C*sizeof(float));
cudaMalloc((void **)&d_max, COL_C*sizeof(float));
cudaMalloc((void **)&d_idx, COL_C*sizeof(unsigned int));
cudaCheckErrors("cuda malloc fail");
// initialize data
for (int i=0; i< SIZ_A; i++) h_A[i] = (float)(i+1);
for (int i=0; i< SIZ_B; i++) h_B[i] = (float)(i+2);
cudaMemcpy(d_A, h_A, SIZ_A*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, SIZ_B*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy 1 fail");
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));
// C = A*B
// due to cublas expecting column-major storage, parameters
// are scrambled
cublasCheckErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, COL_B, ROW_A, COL_A, &alpha, d_B, COL_B, d_A, COL_A, &beta, d_C, COL_C));
cudaMemcpy(h_C, d_C, SIZ_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 2 fail");
col_max<<<(COL_C + nTPB - 1)/nTPB, nTPB>>>(d_C, d_max, d_idx, ROW_C, COL_C);
cudaCheckErrors("kernel launch fail");
cudaMemcpy(h_max, d_max, COL_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_idx, d_idx, COL_C*sizeof(unsigned int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 3 fail/kernel fail");
if (VERBOSE){
printf("A: \n");
for (int i=0; i< ROW_A; i++){
for (int j=0; j< COL_A; j++)
printf("%7.5G", h_A[j+(i*COL_A)]);
printf("\n");}
printf("B: \n");
for (int i=0; i< ROW_B; i++){
for (int j=0; j< COL_B; j++)
printf("%7.5G", h_B[j+(i*COL_B)]);
printf("\n");}
printf("C = A*B: \n");
for (int i=0; i< ROW_C; i++){
for (int j=0; j< COL_C; j++)
printf("%7.5G", h_C[j+(i*COL_C)]);
printf("\n");}
printf("COLUMN MAX:\n");
for (int i=0; i< COL_C; i++)
printf("%7.5G", h_max[i]);
printf("\nCOLUMN MAX IDX:\n");
for (int i=0; i< COL_C; i++)
printf("%7d", h_idx[i]);
}
printf("\n finished!\n");
return 0;
}
下面是示例输出:
$ cuda-memcheck ./t221
========= CUDA-MEMCHECK
A:
1 2 3 4
5 6 7 8
9 10 11 12
13 14 15 16
B:
2 3 4 5
6 7 8 9
10 11 12 13
14 15 16 17
C = A*B:
100 110 120 130
228 254 280 306
356 398 440 482
484 542 600 658
COLUMN MAX:
484 542 600 658
COLUMN MAX IDX:
3 3 3 3
finished!
========= ERROR SUMMARY: 0 errors
$
当我扩展代码以处理您指示的相同大小时,(A=4096x128,B=128x4096)在我的机器上大约花了1秒。因此它比你的代码快得多。但是,当我在内核中取出你的代码并注释掉你对MaxFunction
的调用时,计算矩阵乘法结果也只需要1秒左右。因此,如果你想保留矩阵乘法代码(即不使用cublas)您可以将代码分成两个内核,在第一个内核中使用乘法例程,在第二个内核中使用我的max finding例程(col\u max
),也可能会得到非常快的结果
正如 @Talonmies 所指出的，如果您在 Windows 计算机上运行，请确保您了解 Windows TDR 的影响（如果需要，请在右上角的搜索框中搜索）。

评论：

—— 这与您先前问题中的代码和问题完全相同。请不要再重新发布相同的问题。

—— 我同意这是相同的代码，但我找不到答案。

—— 这不是发布重复问题的借口。获得帮助的关键是编辑现有问题以使其更易于回答。现在，您的代码似乎有两个独立的问题——矩阵乘法和归约。选择一个问题；改进代码——例如，我看不到任何 CUDA API 错误检查，你甚至确定代码实际运行到完成了吗？使用提供的工具——调试器、cuda-memcheck；再用你的发现来完善问题。这里不是免费的调试服务，没人会替您做这些工作。帮助我们帮助您……

—— 值得指出的是，原始问题中发布的矩阵乘法代码实际上工作正常。我怀疑它是在低速设备上运行的，并且会触发显示驱动程序的看门狗计时器。这里实际上没有问题，但无论如何，感谢您发布了一个合理的答案。

—— 我现在已编辑我的答案，加入了针对本问题中 OP 代码的修复，以生成（我认为）正确的矩阵乘法结果。我非常确信本问题中 OP 的原始代码不会生成正确的矩阵乘法结果。

—— 只要矩阵是方阵且尺寸是块大小（32）的整数倍（wA=wB=hB），矩阵乘法代码在我试过的 128 到 4096 的每种大小下都能工作。很容易验证：每个条目都应该是 wA*32*21。这种情况反复出现——SDK 的矩阵乘法代码被误用，然后就有人发帖抱怨它为什么不能工作……

—— 非常感谢你们。CUBLAS 解决方案需要约 842ms。当我将其与 Matlab 进行比较时：不用 GPU 阵列，Matlab 需要约 900ms；使用 gpuArray，Matlab 需要约 700ms。但对于上面的代码，当我将 nTPB 增加到 128 时，计时约为 680ms。注：矩阵大小为 A(4000,128) 和 B(128,19800)，GPU：nVidia GeForce 410m。
$ cuda-memcheck ./t221
========= CUDA-MEMCHECK
A:
1 2 3 4
5 6 7 8
9 10 11 12
13 14 15 16
B:
2 3 4 5
6 7 8 9
10 11 12 13
14 15 16 17
C = A*B:
100 110 120 130
228 254 280 306
356 398 440 482
484 542 600 658
COLUMN MAX:
484 542 600 658
COLUMN MAX IDX:
3 3 3 3
finished!
========= ERROR SUMMARY: 0 errors
$