Cuda 如何从主机访问设备2D数组全局变量

Cuda 如何从主机访问设备2D数组全局变量,cuda,Cuda,我想在main方法中使用“printf”打印d_t全局2D数组变量。但我收到一条警告说: 无法在主机函数中直接读取设备变量“d_t” 如何将全局2D数组变量从设备复制到主机,然后打印每行的第一列 __device__ double *d_t; __device__ size_t d_gridPitch; __global__ void kernelFunc() { int i = blockIdx.x * blockDim.x + threadIdx.x; double*

我想在main方法中使用“printf”打印d_t全局2D数组变量。但我收到一条警告说:

无法在主机函数中直接读取设备变量“d_t”

如何将全局2D数组变量从设备复制到主机,然后打印每行的第一列

// Device-global pointer to the base of the 2D array; kernelFunc reads it to locate each row.
__device__ double *d_t;

// Device-global row pitch; kernelFunc applies it as a byte offset (via char*) between rows.
__device__ size_t d_gridPitch;


// Adds 40000 to the first column of one row of the array pointed to by d_t.
//
// Each thread derives a row index from its 1D global thread index and steps
// to that row using the byte pitch stored in d_gridPitch.
// NOTE: there is no bounds guard, so the launch must not create more threads
// than the array has rows.
__global__ void kernelFunc()
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;

    // The pitch is a byte count, so do the row arithmetic on a char pointer.
    char* rowBytes = reinterpret_cast<char*>(d_t) + row * d_gridPitch;
    double* firstColumn = reinterpret_cast<double*>(rowBytes);

    *firstColumn = *firstColumn + 40000;
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Allocates a pitched size x size array of doubles on the device, zeroes it,
// publishes its base pointer and pitch through the d_t / d_gridPitch device
// symbols, runs kernelFunc with one thread per row, then copies the array
// back to the host and prints the first column of every row.
//
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    const int size = 16;
    size_t d_pitchLoc = 0;
    double *d_tLoc = 0;
    int exitCode = 1;

    // Host copy of the array, rows packed back to back (no padding), so the
    // host pitch is exactly size * sizeof(double).
    double h_t[size * size];

    if (!cudaOk(cudaMallocPitch((void**)&d_tLoc, &d_pitchLoc, size * sizeof(double), size),
                "cudaMallocPitch")) {
        return exitCode;
    }

    if (cudaOk(cudaMemset2D(d_tLoc, d_pitchLoc, 0, size * sizeof(double), size),
               "cudaMemset2D") &&
        // Copy the whole size_t: sizeof(int) would transfer only part of the pitch.
        cudaOk(cudaMemcpyToSymbol(d_gridPitch, &d_pitchLoc, sizeof(d_pitchLoc)),
               "cudaMemcpyToSymbol(d_gridPitch)") &&
        cudaOk(cudaMemcpyToSymbol(d_t, &d_tLoc, sizeof(d_tLoc)),
               "cudaMemcpyToSymbol(d_t)")) {

        kernelFunc<<<1, size>>>();

        // A launch returns no status itself: query the launch error, then
        // synchronize so that execution errors are reported here as well.
        if (cudaOk(cudaGetLastError(), "kernelFunc launch") &&
            cudaOk(cudaDeviceSynchronize(), "kernelFunc execution") &&
            // __device__ symbols cannot be dereferenced in host code, so the
            // data is brought back through the host-side pointer and pitch;
            // cudaMemcpy2D converts between the device and host pitches.
            cudaOk(cudaMemcpy2D(h_t, size * sizeof(double), d_tLoc, d_pitchLoc,
                                size * sizeof(double), size, cudaMemcpyDeviceToHost),
                   "cudaMemcpy2D")) {

            for (int i = 0; i < size; i++) {
                // Element [i][0] of the packed host copy.
                printf("%.0f, ", h_t[i * size]);
            }
            exitCode = 0;
        }
    }

    cudaFree(d_tLoc);
    cudaDeviceReset();

    return exitCode;
}
__device__ double *d_t;
__device__ size_t d_gridPitch;
__global__ void kernelFunc()
{
int i=blockIdx.x*blockDim.x+threadIdx.x;
double*rowt=(double*)((char*)d_t+i*d_gridPitch);
rowt[0]=rowt[0]+40000;
}
int main()
{
int size=16;
size_t d_pitchLoc;
double *d_tLoc;
cudaMallocPitch((void**)&d_tLoc, &d_pitchLoc, size * sizeof(double), size);
cudaMemset2D(d_tLoc, d_pitchLoc, 0, size * sizeof(double), size);
cudaMemcpyToSymbol(d_gridPitch, &d_pitchLoc, sizeof(int));
cudaMemcpyToSymbol(d_t,&d_tLoc,sizeof(d_tLoc));
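kernelFunc<<<1,size>>>();
for(int i=0; i< size; i++){
double* rowt = (double*)((char *)d_t + i * d_gridPitch);
printf("%.0f, ",rowt[0]);
}
cudaDeviceReset();
return 0;
}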
如评论中所述,cudaMemcpy2D 这个API正是为这项任务而设计的。您必须分配或静态定义一个主机内存缓冲区(或容器)来存放设备数据,然后把该主机缓冲区的间距(pitch)传给
cudaMemcpy2D
调用。API会自动处理设备间距与主机间距之间的转换,无需调用方进一步干预

如果将打印循环替换为以下内容:

// Host staging buffer: `size` rows packed back to back (no padding), so its
// pitch is exactly size * sizeof(double).
double* h_t = new double[size * size];
// cudaMemcpy2D converts between the device pitch (d_pitchLoc) and the packed
// host pitch while copying.
cudaError_t copyStatus = cudaMemcpy2D(h_t, size * sizeof(double), d_tLoc, d_pitchLoc,
        size * sizeof(double), size, cudaMemcpyDeviceToHost);
if (copyStatus != cudaSuccess) {
    std::cerr << "cudaMemcpy2D failed: " << cudaGetErrorString(copyStatus) << std::endl;
} else {
    // Print column 0 of every row.
    for(int i=0; i< size; i++){
        std::cout << h_t[i * size] << std::endl;
    }
}
// Release the staging buffer so the snippet does not leak it.
delete[] h_t;
double* h_t = new double[size * size];
cudaMemcpy2D(h_t, size * sizeof(double), d_tLoc, d_pitchLoc,
size * sizeof(double), size, cudaMemcpyDeviceToHost);
for(int i=0, j=0; i< size; i++){
std::cout << h_t[i * size + j] << std::endl;
}
cudaMemcpy2D
函数用于向带间距的分配(pitched allocation,即使用
cudaMallocPitch
创建的分配)复制数据,或从这类分配中复制数据。请参阅
cudaMemcpy2D
的API文档。如果搜索CUDA标签,您会发现许多演示正确用法的问题和答案。