CUDA:如何在 GPU 上配合 CULA 使用 3D 矩阵?
在代码的 CPU 版本中,我有许多类似下面的调用:
// One dgemm call per matrix triple: A[i]*op(B[i]) -> C[i], with per-index
// dimensions taken from Size[i][0..3]; 'N','T' are the transpose flags
// (B is used transposed).
// NOTE(review): the exact meaning of the four Size entries depends on the
// dgemm wrapper's signature — confirm against its declaration.
for(int i =0;i<N;i++){
dgemm(A[i], B[i],C[i], Size[i][0], Size[i][1], Size[i][2], Size[i][3], 'N','T');
}
然而,我想在程序开始时提前把所有 B 存储到 GPU 上,因为它们不会改变;但我不知道该怎么做,或者应该如何组织我的数组才能做到这一点。
我在网上看到了关于在CUDA中使用3D矩阵的各种情况,但它们似乎不太适用于对CULA函数进行函数调用
根据下面答案中的示例,我得出以下结论:
// First attempt at copying an array of matrices to the GPU.
// NOTE(review): this version is broken — GlobalFVecs_d itself lives in
// device memory (allocated by the cudaMalloc just below), so taking
// &(GlobalFVecs_d[i]) on the host dereferences a device pointer.
extern "C" void copyFNFVecs_(double **FNFVecs, int numpulsars, int numcoeff){
cudaError_t err;
// Allocate the device-side array of numpulsars matrix pointers.
err = cudaMalloc( (void ***)&GlobalFVecs_d, numpulsars*sizeof(double*) );
checkCudaError(err);
for(int i =0; i < numpulsars;i++){
// BUG: host-side write through device memory (GlobalFVecs_d[i]).
err = cudaMalloc( (void **) &(GlobalFVecs_d[i]), numcoeff*numcoeff*sizeof(double) );
checkCudaError(err);
// err = cudaMemcpy( GlobalFVecs_d[i], FNFVecs[i], sizeof(double)*numcoeff*numcoeff, cudaMemcpyHostToDevice );
// checkCudaError(err);
}
}
然而,这似乎正是另一个例子中的情况
我意识到这是不一样的,所以我现在有了可编译的代码,包括:
// Device-resident array of pointers, one per pulsar matrix.
double **GlobalFVecs_d;
// Host-resident shadow copy of the same pointers, used to fill GlobalFVecs_d.
double **GlobalFPVecs_d;
// Copies numpulsars matrices of numcoeff*numcoeff doubles to the GPU.
// Pattern: allocate and fill each matrix through the HOST pointer array
// (GlobalFPVecs_d), then copy the pointer table itself to the device
// (GlobalFVecs_d) in one final cudaMemcpy.
extern "C" void copyFNFVecs_(double **FNFVecs, int numpulsars, int numcoeff){
cudaError_t err;
GlobalFPVecs_d = (double **)malloc(numpulsars * sizeof(double*));
err = cudaMalloc( (void ***)&GlobalFVecs_d, numpulsars*sizeof(double*) );
checkCudaError(err);
for(int i =0; i < numpulsars;i++){
// Allocate and fill matrix i via the host-side pointer slot.
err = cudaMalloc( (void **) &(GlobalFPVecs_d[i]), numcoeff*numcoeff*sizeof(double) );
checkCudaError(err);
err = cudaMemcpy( GlobalFPVecs_d[i], FNFVecs[i], sizeof(double)*numcoeff*numcoeff, cudaMemcpyHostToDevice );
checkCudaError(err);
}
// Publish the pointer table to the device.
err = cudaMemcpy( GlobalFVecs_d, GlobalFPVecs_d, sizeof(double*)*numpulsars, cudaMemcpyHostToDevice );
checkCudaError(err);
}
double **GlobalFVecs_d;
double **GlobalFPVecs_d;
extern "C" void copyFNFVecs_(double **FNFVecs, int numpulsars, int numcoeff){
cudaError_t err;
GlobalFPVecs_d = (double **)malloc(numpulsars * sizeof(double*));
err = cudaMalloc( (void ***)&GlobalFVecs_d, numpulsars*sizeof(double*) );
checkCudaError(err);
for(int i =0; i < numpulsars;i++){
err = cudaMalloc( (void **) &(GlobalFPVecs_d[i]), numcoeff*numcoeff*sizeof(double) );
checkCudaError(err);
err = cudaMemcpy( GlobalFPVecs_d[i], FNFVecs[i], sizeof(double)*numcoeff*numcoeff, cudaMemcpyHostToDevice );
checkCudaError(err);
}
err = cudaMemcpy( GlobalFVecs_d, GlobalFPVecs_d, sizeof(double*)*numpulsars, cudaMemcpyHostToDevice );
checkCudaError(err);
}
但是,如果我现在尝试使用以下工具访问它:
// 2-D launch configuration: one thread per (x, y) entry of a
// numcoeff x numcoeff matrix.
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid;//((G + dimBlock.x - 1) / dimBlock.x,(N + dimBlock.y - 1) / dimBlock.y);
// Ceiling division so the grid covers numcoeff even when it is not a
// multiple of BLOCK_SIZE.
dimGrid.x=(numcoeff + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (numcoeff + dimBlock.y - 1)/dimBlock.y;
for(int i =0; i < numpulsars; i++){
// NOTE(review): GlobalFVecs_d[i] is read on the HOST here, but
// GlobalFVecs_d was allocated with cudaMalloc — indexing device memory
// from host code is the likely cause of the reported segfault; pass the
// host shadow copy (GlobalFPVecs_d[i]) instead.
CopyPPFNF<<<dimGrid, dimBlock>>>(PPFMVec_d, GlobalFVecs_d[i], numpulsars, numcoeff, i);
}
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid; //((G + dimBlock.x - 1) / dimBlock.x,(N + dimBlock.y - 1) / dimBlock.y);
dimGrid.x = (numcoeff + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (numcoeff + dimBlock.y - 1)/dimBlock.y;
for(int i = 0; i < numpulsars; i++){
    CopyPPFNF<<<dimGrid, dimBlock>>>(PPFMVec_d, GlobalFVecs_d[i], numpulsars, numcoeff, i);
}
这里发生了段错误(segfault)。这不正是访问数据的标准方法吗:先用 cudaMalloc() 在设备上分配内存,再用 cudaMemcpy() 把数据从主机复制到设备?
// Kernel definition, see also section 4.2.3 of Nvidia Cuda Programming Guide
__global__ void vecAdd(float* A, float* B, float* C)
{
    // threadIdx.x is a built-in variable provided by CUDA at runtime
    int i = threadIdx.x;
    A[i] = 0;
    B[i] = i;
    C[i] = A[i] + B[i];
}

#include <stdio.h>
#define SIZE 10
int main()
{
    int N = SIZE;
    float A[SIZE], B[SIZE], C[SIZE];
    float *devPtrA;
    float *devPtrB;
    float *devPtrC;
    int memsize = SIZE * sizeof(float);

    cudaMalloc((void**)&devPtrA, memsize);
    cudaMalloc((void**)&devPtrB, memsize);
    cudaMalloc((void**)&devPtrC, memsize);
    cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
    cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);
    // __global__ functions are called: Func<<< Dg, Db, Ns >>>(parameter);
    vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
    cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);

    for (int i = 0; i < SIZE; i++)
        printf("C[%d]=%f\n", i, C[i]);
}
你好,感谢您的回复;我会在后续回答中贴出代码,请稍候。
// Device-resident array of pointers, one per pulsar matrix.
double **GlobalFVecs_d;
// Host-resident shadow copy of the same pointers, used to fill GlobalFVecs_d.
double **GlobalFPVecs_d;
// Copies numpulsars matrices of numcoeff*numcoeff doubles to the GPU.
// Pattern: allocate and fill each matrix through the HOST pointer array
// (GlobalFPVecs_d), then copy the pointer table itself to the device
// (GlobalFVecs_d) in one final cudaMemcpy.
extern "C" void copyFNFVecs_(double **FNFVecs, int numpulsars, int numcoeff){
cudaError_t err;
GlobalFPVecs_d = (double **)malloc(numpulsars * sizeof(double*));
err = cudaMalloc( (void ***)&GlobalFVecs_d, numpulsars*sizeof(double*) );
checkCudaError(err);
for(int i =0; i < numpulsars;i++){
// Allocate and fill matrix i via the host-side pointer slot.
err = cudaMalloc( (void **) &(GlobalFPVecs_d[i]), numcoeff*numcoeff*sizeof(double) );
checkCudaError(err);
err = cudaMemcpy( GlobalFPVecs_d[i], FNFVecs[i], sizeof(double)*numcoeff*numcoeff, cudaMemcpyHostToDevice );
checkCudaError(err);
}
// Publish the pointer table to the device.
err = cudaMemcpy( GlobalFVecs_d, GlobalFPVecs_d, sizeof(double*)*numpulsars, cudaMemcpyHostToDevice );
checkCudaError(err);
}
// 2-D launch configuration: one thread per (x, y) entry of a
// numcoeff x numcoeff matrix.
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid;//((G + dimBlock.x - 1) / dimBlock.x,(N + dimBlock.y - 1) / dimBlock.y);
// Ceiling division so the grid covers numcoeff even when it is not a
// multiple of BLOCK_SIZE.
dimGrid.x=(numcoeff + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (numcoeff + dimBlock.y - 1)/dimBlock.y;
for(int i =0; i < numpulsars; i++){
// NOTE(review): GlobalFVecs_d[i] is read on the HOST here, but
// GlobalFVecs_d was allocated with cudaMalloc — indexing device memory
// from host code segfaults; pass the host shadow copy
// (GlobalFPVecs_d[i]) instead.
CopyPPFNF<<<dimGrid, dimBlock>>>(PPFMVec_d, GlobalFVecs_d[i], numpulsars, numcoeff, i);
}
#include <stdio.h>
#define SIZE 10

// Kernel definition, see also section 4.2.3 of the Nvidia Cuda Programming
// Guide.  Launched as <<<1, N>>>: a single block with one thread per element,
// so threadIdx.x alone is a valid array index (no blockIdx term needed).
__global__ void vecAdd(float* A, float* B, float* C)
{
    // threadIdx.x is a built-in variable provided by CUDA at runtime.
    int i = threadIdx.x;
    // The demo initializes the inputs on the device itself, so the host
    // copies of A and B do not need meaningful contents before the copy-in.
    A[i] = 0;
    B[i] = i;
    C[i] = A[i] + B[i];
}

int main()
{
    int N = SIZE;
    float A[SIZE], B[SIZE], C[SIZE];
    float *devPtrA;
    float *devPtrB;
    float *devPtrC;
    int memsize = SIZE * sizeof(float);

    cudaMalloc((void**)&devPtrA, memsize);
    cudaMalloc((void**)&devPtrB, memsize);
    cudaMalloc((void**)&devPtrC, memsize);
    cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
    cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);
    // __global__ functions are called: Func<<< Dg, Db, Ns >>>(parameter);
    vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
    // A blocking device-to-host cudaMemcpy also waits for the kernel to
    // finish, so no explicit cudaDeviceSynchronize is needed here.
    cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);

    for (int i = 0; i < SIZE; i++)
        printf("C[%d]=%f\n",i,C[i]);

    // BUG FIX: the original freed devPtrA three times and leaked
    // devPtrB and devPtrC (copy-paste error).
    cudaFree(devPtrA);
    cudaFree(devPtrB);
    cudaFree(devPtrC);
}