CUDA: how do I use 3D matrices with CULA on the GPU?


In the CPU version of some code, I have many calls that look like the following:

for(int i = 0; i < N; i++){
    dgemm(A[i], B[i], C[i], Size[i][0], Size[i][1], Size[i][2], Size[i][3], 'N', 'T');
}
However, I would like to store my B matrices on the GPU ahead of time, at the start of the run, since they never change, but I do not know how to go about it. Alternatively, how should I store my arrays so that this is possible?

I have seen various examples online of working with 3D matrices in CUDA, but they do not seem to fit well with making calls into the CULA functions.
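As an aside on that last point: since CULA's routines take flat device pointers, one common way to keep a whole batch of fixed matrices on the GPU is a single contiguous allocation plus per-matrix offsets. Below is a minimal sketch, assuming every matrix is numcoeff x numcoeff as in the later snippets, and assuming your CULA build exposes the device-pointer interface (e.g. culaDeviceDgemm; check the exact signature in your CULA headers):

    // Sketch only: pack all N matrices B[i] into one contiguous device
    // buffer B_d, copied once up front, then hand each call an offset.
    double *B_d;
    size_t slice = (size_t)numcoeff * numcoeff;      // elements per matrix
    cudaMalloc((void **)&B_d, N * slice * sizeof(double));
    for (int i = 0; i < N; i++) {
        cudaMemcpy(B_d + i * slice, B[i], slice * sizeof(double),
                   cudaMemcpyHostToDevice);
    }
    // later, per matrix (signature assumed; verify against your headers):
    // culaDeviceDgemm('N', 'T', m, n, k, 1.0, A_d, lda,
    //                 B_d + i * slice, ldb, 0.0, C_d, ldc);

This layout also avoids any pointer-to-pointer indirection on the device.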

Going by the example in the answer below, this is what I came up with:

extern "C" void copyFNFVecs_(double **FNFVecs, int numpulsars, int numcoeff){


  cudaError_t err;
 err = cudaMalloc( (void ***)&GlobalFVecs_d, numpulsars*sizeof(double*) );
 checkCudaError(err);

    for(int i =0; i < numpulsars;i++){
         err = cudaMalloc( (void **) &(GlobalFVecs_d[i]), numcoeff*numcoeff*sizeof(double) );
         checkCudaError(err);    
       //  err = cudaMemcpy( GlobalFVecs_d[i], FNFVecs[i], sizeof(double)*numcoeff*numcoeff, cudaMemcpyHostToDevice );
        // checkCudaError(err); 
        }

}
However, that seems to be exactly what was done in the other example.

I realise that this is not the same, so I now have code that compiles, with:

double **GlobalFVecs_d;
double **GlobalFPVecs_d;

extern "C" void copyFNFVecs_(double **FNFVecs, int numpulsars, int numcoeff){

    cudaError_t err;
    GlobalFPVecs_d = (double **)malloc(numpulsars * sizeof(double*));
    err = cudaMalloc( (void ***)&GlobalFVecs_d, numpulsars*sizeof(double*) );
    checkCudaError(err);

    for(int i = 0; i < numpulsars; i++){
        err = cudaMalloc( (void **)&(GlobalFPVecs_d[i]), numcoeff*numcoeff*sizeof(double) );
        checkCudaError(err);
        err = cudaMemcpy( GlobalFPVecs_d[i], FNFVecs[i], sizeof(double)*numcoeff*numcoeff, cudaMemcpyHostToDevice );
        checkCudaError(err);
    }

    err = cudaMemcpy( GlobalFVecs_d, GlobalFPVecs_d, sizeof(double*)*numpulsars, cudaMemcpyHostToDevice );
    checkCudaError(err);
}
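Note that after this function runs, GlobalFVecs_d is a device-resident array of device pointers, so it can only be dereferenced from device code; GlobalFPVecs_d is the host-side copy of the same pointers. A minimal sketch of consuming the device array inside a kernel (scaleAll is a hypothetical kernel, not part of the original code):

    // Hypothetical kernel: the double** indirection happens on the device,
    // where both levels of pointer are valid.
    __global__ void scaleAll(double **mats, int nmats, int ncoeff)
    {
        int j = blockIdx.x * blockDim.x + threadIdx.x;   // element index
        if (j < ncoeff * ncoeff)
            for (int i = 0; i < nmats; i++)
                mats[i][j] *= 2.0;                       // device-side dereference
    }

    // launched with the device pointer array itself, e.g.:
    // scaleAll<<<(numcoeff*numcoeff + 255)/256, 256>>>(GlobalFVecs_d, numpulsars, numcoeff);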
However, if I now try to access it with:

 dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
 dim3 dimGrid; // ((G + dimBlock.x - 1) / dimBlock.x, (N + dimBlock.y - 1) / dimBlock.y);
 dimGrid.x = (numcoeff + dimBlock.x - 1)/dimBlock.x;
 dimGrid.y = (numcoeff + dimBlock.y - 1)/dimBlock.y;

 for(int i = 0; i < numpulsars; i++){
    CopyPPFNF<<<dimGrid, dimBlock>>>(PPFMVec_d, GlobalFVecs_d[i], numpulsars, numcoeff, i);
 }
it segfaults here. Is this not the way to get at the data?
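A likely cause, going by the snippets above: GlobalFVecs_d was allocated with cudaMalloc, so indexing it as GlobalFVecs_d[i] in the host-side launch loop dereferences a device pointer on the host. A minimal sketch of a fix, under that assumption, is to index the host-side copy of the pointer array instead:

    for(int i = 0; i < numpulsars; i++){
        // GlobalFPVecs_d lives in host memory and holds the same device
        // pointers, so it may be indexed on the host; the resulting device
        // pointer is then passed to the kernel as before.
        CopyPPFNF<<<dimGrid, dimBlock>>>(PPFMVec_d, GlobalFPVecs_d[i], numpulsars, numcoeff, i);
    }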

  • Allocate memory on the device with cudaMalloc()
  • Copy it from host to device with cudaMemcpy()
  • Pass the device pointer in the kernel parameter list
  • Finally, use it from inside the kernel through the parameter you passed! For example:

    // Kernel definition, see also section 4.2.3 of the Nvidia CUDA Programming Guide
    __global__ void vecAdd(float* A, float* B, float* C)
    {
        // threadIdx.x is a built-in variable provided by CUDA at runtime
        int i = threadIdx.x;
        A[i] = 0;
        B[i] = i;
        C[i] = A[i] + B[i];
    }

    #include <stdio.h>
    #define SIZE 10

    int main()
    {
        int N = SIZE;
        float A[SIZE], B[SIZE], C[SIZE];
        float *devPtrA;
        float *devPtrB;
        float *devPtrC;
        int memsize = SIZE * sizeof(float);

        cudaMalloc((void**)&devPtrA, memsize);
        cudaMalloc((void**)&devPtrB, memsize);
        cudaMalloc((void**)&devPtrC, memsize);
        cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
        cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);
        // __global__ functions are called: Func<<< Dg, Db, Ns >>>(parameter);
        vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
        cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);

        for (int i = 0; i < SIZE; i++)
            printf("C[%d]=%f\n", i, C[i]);

        cudaFree(devPtrA);
        cudaFree(devPtrB);
        cudaFree(devPtrC);
    }

Hi, thanks for the reply. I will have to post the code in a follow-up answer, hold on:
    double **GlobalFVecs_d;
    double **GlobalFPVecs_d;

    extern "C" void copyFNFVecs_(double **FNFVecs, int numpulsars, int numcoeff){

        cudaError_t err;
        GlobalFPVecs_d = (double **)malloc(numpulsars * sizeof(double*));
        err = cudaMalloc( (void ***)&GlobalFVecs_d, numpulsars*sizeof(double*) );
        checkCudaError(err);

        for(int i = 0; i < numpulsars; i++){
            err = cudaMalloc( (void **)&(GlobalFPVecs_d[i]), numcoeff*numcoeff*sizeof(double) );
            checkCudaError(err);
            err = cudaMemcpy( GlobalFPVecs_d[i], FNFVecs[i], sizeof(double)*numcoeff*numcoeff, cudaMemcpyHostToDevice );
            checkCudaError(err);
        }

        err = cudaMemcpy( GlobalFVecs_d, GlobalFPVecs_d, sizeof(double*)*numpulsars, cudaMemcpyHostToDevice );
        checkCudaError(err);
    }

    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid; // ((G + dimBlock.x - 1) / dimBlock.x, (N + dimBlock.y - 1) / dimBlock.y);
    dimGrid.x = (numcoeff + dimBlock.x - 1)/dimBlock.x;
    dimGrid.y = (numcoeff + dimBlock.y - 1)/dimBlock.y;

    for(int i = 0; i < numpulsars; i++){
        CopyPPFNF<<<dimGrid, dimBlock>>>(PPFMVec_d, GlobalFVecs_d[i], numpulsars, numcoeff, i);
    }
    