Matrix Cuda矩阵加法

Matrix Cuda矩阵加法,matrix,parallel-processing,cuda,gpu,Matrix,Parallel Processing,Cuda,Gpu,我已经编写了以下代码来求cuda中两个4x4矩阵的和 #include<stdio.h> #include<stdlib.h> #include<math.h> __global__ void Matrix_add(double* a, double* b, double* c,int n) { int row = blockIdx.x * blockDim.x + threadIdx.x; int col = blockIdx.y * bloc

我已经编写了以下代码来求cuda中两个4x4矩阵的和

#include<stdio.h>
#include<stdlib.h>
#include<math.h>

__global__ void Matrix_add(double* a, double* b, double* c,int n)
{
   int row = blockIdx.x * blockDim.x + threadIdx.x;
   int col = blockIdx.y * blockDim.y + threadIdx.y;
   int index = row * n + col;
   if(col<n && row <n)
      c[index] = a[index] + b[index];
}
int main()
{

int n=4;

double **h_a;
double **h_b;
double **h_c;
double *d_a, *d_b, *d_c;

int size = n*n*sizeof(double);

h_a = (double **) malloc(n*sizeof(double*));
h_b = (double **) malloc(n*sizeof(double*));
h_c = (double **) malloc(n*sizeof(double*));

cudaMalloc((void**)&d_a,size);
cudaMalloc((void**)&d_b,size); 
cudaMalloc((void**)&d_c,size);

int t=0;
for (t=0;t<n;t++)
{   
        h_a[t]= (double *)malloc(n*sizeof(double));
        h_b[t]= (double *)malloc(n*sizeof(double));
        h_c[t]= (double *)malloc(n*sizeof(double));
}   

int i=0,j=0;

for(i=0;i<n;i++)
{
for(j=0;j<n;j++)
     {
         h_a[i][j]=sin(i)*sin(i);
         h_b[i][j]=cos(i)*cos(i);
     }
}

cudaMemcpy(d_a,h_a+n,size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,h_b+n,size,cudaMemcpyHostToDevice);

dim3 dimBlock(4,4);
dim3 dimGrid(1,1);
Matrix_add<<<dimGrid, dimBlock>>>(d_a,d_b,d_c,n);
cudaMemcpy(h_c+n,d_c,size,cudaMemcpyDeviceToHost);

for(i=0;i<n;i++)
{
  for( j=0;j<n;j++)
        {
            printf("%f",h_c[i][j]);
            printf("\t");
        }
  printf("\n");
}

for(i=0;i<n;i++)
  {
        free(h_a[i]);
        free(h_b[i]);
        free(h_c[i]);
}
free(h_a);
free(h_b);
free(h_c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;

}
#包括
#包括
#包括
__全局无效矩阵加(双*a,双*b,双*c,整数n)
{
int row=blockIdx.x*blockDim.x+threadIdx.x;
int col=blockIdx.y*blockDim.y+threadIdx.y;
int index=行*n+col;
如果(col您的主机阵列(h_a、h_b、h_c)在内存中不连续,那么您最初的cudaMemcpy()调用将把垃圾读入GPU内存(在您的情况下显然是零)


原因是主机数组实际上不是平面的,而是用指针数组表示的。我想在C中伪造二维数组?在任何情况下,您都需要更加小心使用cudaMemcpy()s并逐行复制主机数组,或在主机上使用平面表示。

回答得好。正确的解决方案是使用平面表示。单独分配的行数组杂乱无章,容易出错,效率低下。您将习惯于
h[i*row\u stride+j]
(或将其隐藏在宏后面),您的簿记代码将大幅缩减。