C++ 管理2D CUDA阵列_C++_C_Arrays_Cuda_2d

C++ 管理2D CUDA阵列

c++ c arrays cuda

C++ 管理2D CUDA阵列,c++,c,arrays,cuda,2d,C++,C,Arrays,Cuda,2d,我试图将一个2d数组传递给内核，以便每个线程都可以访问index=threadIdx.x+（blockIdx.x*blockDim.x），但我在弄清楚如何执行此操作以及如何将数据复制回来时遇到了困难 size_t pitch; cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks); cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int

我试图将一个2d数组传递给内核，以便每个线程都可以访问index=threadIdx.x+（blockIdx.x*blockDim.x），但我在弄清楚如何执行此操作以及如何将数据复制回来时遇到了困难

// Host-side sequence from the question (partly pseudo-code): allocate a pitched
// 2D device buffer, clear it, launch the kernel, copy the result back and check it.
size_t pitch;
// Width argument is block_size * sizeof(int) bytes, height argument is num_blocks;
// the row stride chosen by the runtime is written to pitch.
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
// NOTE(review): the height argument here is num_blocks * sizeof(int), whereas the
// allocation above used a height of num_blocks.
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks * sizeof(int));
kernel<<<grid_size, block_size>>>(d_array, pitch);
// NOTE(review): the width argument here is block_size, without the sizeof(int)
// factor used in the two calls above, and the device pitch is also passed as the
// pitch of h_array — confirm how h_array is allocated.
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size, num_blocks, cudaMemcpyDeviceToHost);
// Pseudo-code: the expected outcome is that every element reads back as 1.
for (num_blocks)
  for(block_size)
    h_array[block][thread] should be 1

// Writes 1 into one element of a pitched 2D int array: block blockIdx.x selects
// the row and thread threadIdx.x selects the element within that row.
//   array : base pointer of the 2D buffer
//   pitch : distance between consecutive rows, used as a byte offset
// There is no bounds check, so the launch must not cover more rows (blocks) or
// more elements per row (threads) than the buffer holds.
__global__ void kernel(int *array, int pitch) {
  // pitch is applied to a char* so the row offset is in bytes. Widen both
  // factors to size_t first: blockIdx.x * pitch alone is a 32-bit product and
  // would wrap once the offset exceeds 4 GiB.
  const size_t row_offset = (size_t)blockIdx.x * (size_t)pitch;
  int *row = (int*)((char*)array + row_offset);
  row[threadIdx.x] = 1;
}

size\u t节距；
cudaMallocPitch（&d_数组和螺距，块大小*大小（int），块数）；
cudaMemset2D（d_数组，音高，0，块大小*sizeof（int），num_块*sizeof（int））；
内核（d_数组，基音）；
cudaMemcpy2D（h_数组、音高、d_数组、音高、块大小、块数、cudaMemcpyDeviceToHost）；
用于（num_块）
用于（块大小）
h_数组[块][线程]应为1
__全局无效内核（int*数组，int间距）{
int*行=（int*）（（char*）数组+blockIdx.x*节距）；
行[threadIdx.x]=1；
返回；
}

这里我做错了什么？

您的 cudaMemset2D 所设置的内存范围比您之前用 cudaMallocPitch 分配的空间更大，而您的 cudaMemcpy2D 只复制了该内存的一小部分。

应按以下方式使用该函数：

// Allocate num_blocks rows, each block_size * sizeof(int) bytes wide; the row
// stride actually used by the runtime is returned in pitch (bytes).
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
// The height is a row count, not a byte count: multiplying it by sizeof(int)
// would ask for more rows than were allocated.
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks);
kernel<<<grid_size, block_size>>>(d_array, pitch);
// The width is in bytes, so it needs the sizeof(int) factor. The second argument
// is the row stride of the host buffer: assuming h_array is a tightly packed
// block_size x num_blocks int array (TODO confirm against its allocation), that
// is block_size * sizeof(int) rather than the device pitch.
cudaMemcpy2D(h_array, block_size * sizeof(int), d_array, pitch, block_size * sizeof(int), num_blocks, cudaMemcpyDeviceToHost);

cudaMallocPitch（&d_数组和螺距，块大小*sizeof（int），num_块）；
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks) // * sizeof(int)

下面是一段完整的代码，它通过了基本测试，并修复了 @hidrargyro 提到的错误：
$ cat t236.cu
#include <stdio.h>
#include <stdlib.h>

// Queries the last CUDA runtime error with cudaGetLastError(). If it is not
// cudaSuccess, prints msg, the CUDA error string and the file/line where the
// macro was used to stderr, then terminates the process with exit(1).
// The do { ... } while (0) wrapper makes the expansion behave as one statement.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t last_status_ = cudaGetLastError(); \
        if (cudaSuccess != last_status_) { \
            const char *reason_ = cudaGetErrorString(last_status_); \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, reason_, \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


// Sets one element of a pitched 2D int array to 1. Block blockIdx.x picks the
// row, thread threadIdx.x picks the element inside that row.
//   array : base pointer of the 2D buffer
//   pitch : row stride, applied as a byte offset
// No bounds check is performed: the grid must not have more blocks than the
// buffer has rows, nor the block more threads than a row has elements.
__global__ void kernel(int *array, int pitch) {
  // The stride is added to a char* because it is a byte count. Both factors are
  // widened to size_t before multiplying; blockIdx.x * pitch by itself is a
  // 32-bit product that would wrap for offsets beyond 4 GiB.
  const size_t row_offset = (size_t)blockIdx.x * (size_t)pitch;
  int *row = (int*)((char*)array + row_offset);
  row[threadIdx.x] = 1;
}

// Host driver: allocates a pitched 2D device buffer with one row per block,
// launches the kernel so every thread writes 1 into its own element, copies the
// buffer into a tightly packed host array and verifies every element.
// Returns 0 after printing "success", or 1 on host allocation failure or on the
// first mismatching element. CUDA errors end the process via cudaCheckErrors.
int main(){

  int *d_array = NULL, *h_array = NULL;
  int block_size = 256;            // threads per block, also ints per row
  int num_blocks = 256;            // blocks in the grid, also number of rows
  int grid_size = num_blocks;
  size_t row_bytes = block_size * sizeof(int);   // packed row width in bytes

  h_array = (int *)malloc(row_bytes * num_blocks);
  if (h_array == NULL) {printf("malloc fail\n"); return 1;}

  // The device buffer is allocated exactly once, here. Allocating d_array with
  // cudaMalloc beforehand would only leak that first allocation, because
  // cudaMallocPitch overwrites the pointer.
  size_t pitch;
  cudaMallocPitch(&d_array, &pitch, row_bytes, num_blocks);
  cudaCheckErrors("cudaMallocPitch fail");
  cudaMemset2D(d_array, pitch, 0, row_bytes, num_blocks);
  cudaCheckErrors("cudaMemset2D fail");

  kernel<<<grid_size, block_size>>>(d_array, pitch);
  cudaCheckErrors("kernel launch fail");   // bad launch configuration shows up here
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");          // faults during execution show up here

  // The host array is not pitched, so its row stride is row_bytes; only the
  // device side uses the pitch returned by cudaMallocPitch.
  cudaMemcpy2D(h_array, row_bytes, d_array, pitch, row_bytes, num_blocks, cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy 2D fail");

  // Stop at the first wrong element, but still fall through to the cleanup.
  int status = 0;
  for (int i = 0; i < num_blocks && status == 0; i++)
    for (int j = 0; j < block_size; j++)
      if (h_array[i*block_size+j] != 1) {
        printf("mismatch at i=%d, j=%d, should be 1, was %d\n", i,j,h_array[i*block_size+j]);
        status = 1;
        break;
      }
  if (status == 0) printf("success\n");

  cudaFree(d_array);
  cudaCheckErrors("cudaFree fail");
  free(h_array);
  return status;
}

$ nvcc -arch=sm_20 -o t236 t236.cu
$ ./t236
success
$

$cat t236.cu
#包括
#定义cudaCheckErrors（msg）\
做{\
cudaError\u t\u err=cudaGetLastError（）\
如果（_err！=cudaSuccess）{\
fprintf（标准，“致命错误：%s（%s位于%s:%d）\n”\
msg，cudaGetErrorString（_err）\
__文件（行）\
fprintf（stderr，“***失败-中止\n”）\
出口（1）\
} \
}而（0）
__全局无效内核（int*数组，int间距）{
int*行=（int*）（（char*）数组+blockIdx.x*节距）；
行[threadIdx.x]=1；
返回；
}
int main（）{
int*d_数组，*h_数组；
int block_size=256；
int num_块=256；
int grid_size=num_块；
h_数组=（int*）malloc（block_size*num_blocks*sizeof（int））；
如果（h_数组==0）{printf（“malloc fail\n”）；返回1；}
cudamaloc（（void**）和d_数组，块大小*块数*大小（int））；
cudaCheckErrors（“Cudamaloc失败”）；
大小和间距；
cudaMallocPitch（&d_数组和螺距，块大小*大小（int），块数）；
cudaCheckErrors（“cudamallocitch失败”）；
cudaMemset2D（d_数组、音高、0、块大小*sizeof（int）、num_块）；
CUDACHECKERRS（“cudaMemset2D失败”）；
内核（d_数组，基音）；
cudaDeviceSynchronize（）；
cudaCheckErrors（“内核失败”）；
cudaMemcpy2D（h_数组、块大小*大小（int）、d_数组、音高、块大小*大小（int）、num_块、cudamemcpydevicetoost）；
CUDACHECKERS（“cudaMemcpy 2D失败”）；
对于（int i=0；i

为什么要将数组强制转换为（char*）？这会导致错误的指针运算，这两个问题中对此都有说明。

@LarryPel：不，不会。pitch 是以字节为单位的，要正确进行指针运算，就需要一个指向字节大小类型的指针。

请提供一个完整的、可编译的程序。不要让我们去猜“block_size”是什么、“grid_size”是什么等等。此外，如果您做了错误检查，您的 cudaMemset2D 调用（至少）会报错。另外，您的 h_array 很可能不是按 pitch 分配的，因此在 cudaMemcpy2D 中为它传入的 pitch 参数不正确。您或许应该为 h_array 的 pitch 传入 block_size*sizeof(int)（或类似的值）。

这就像 CUDA_SAFE_CALL() 一样，我一直很好奇那个 while(0) 的奇怪循环。