Memory management 如何在CUDA中的CPU/GPU之间分配内存和复制2D阵列，而不使其扁平化？_Memory Management_Multidimensional Array_Cuda

Memory management 如何在CUDA中的CPU/GPU之间分配内存和复制2D阵列，而不使其扁平化？

memory-management cuda

Memory management 如何在CUDA中的CPU/GPU之间分配内存和复制2D阵列，而不使其扁平化？,memory-management,multidimensional-array,cuda,Memory Management,Multidimensional Array,Cuda,所以我想分配2D阵列，并在CUDA的CPU和GPU之间复制它们，但我是一个完全的初学者，其他在线材料对我来说很难理解或不完整。重要的是，我能够在内核代码中以2D数组的形式访问它们，如下所示请注意高度！=对于数组的宽度，如果可能的话，这会让我更加困惑，因为我总是很难选择网格大小我考虑过把它们压平，但我真的想让它这样工作这就是我自己研究的结果 __global__ void myKernel(int *firstArray, int *secondArray, int rows, int co

所以我想分配2D阵列，并在CUDA的CPU和GPU之间复制它们，但我是一个完全的初学者，其他在线材料对我来说很难理解或不完整。重要的是，我能够在内核代码中以2D数组的形式访问它们，如下所示

请注意，数组的高度 != 宽度；这一点（如果可行的话）让我更加困惑，因为我总是很难选择网格大小。

我考虑过把它们压平，但我真的想让它这样工作

这就是我自己研究的结果

// Question code, kept exactly as posted. The intent is for each thread to write
// one element of two rows x columns arrays using 2D subscripts.
// NOTE(review): this does not compile as written; see the notes below.
__global__ void myKernel(int *firstArray, int *secondArray, int rows, int columns) {
    // Global thread coordinates: x is used as the row, y as the column.
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int column = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that land outside the array bounds do nothing.
    if (row >= rows || column >= columns)
        return;

    // Do something with the arrays like you would on a CPU, like:
    // NOTE(review): firstArray is a plain int*, so firstArray[row] is an int and
    // cannot take a second subscript.
    firstArray[row][column] = row * 2;
    // NOTE(review): the bracket after `row` is never closed in the posted code.
    secondArray[row[column] = row * 3;  
}


// Question code, kept exactly as posted: host-side skeleton that is meant to
// allocate two 2D arrays, copy them to the GPU, run myKernel and copy back.
// The allocation and copy steps are left as open questions by the author.
int main() {
    int rows = 300, columns = 200;
    // NOTE(review): both host arrays are automatic (stack) variables sized by
    // non-constant ints; rows * columns ints each.
    int h_firstArray[rows][columns], h_secondArray[rows][columns];
    // NOTE(review): these declare 2D arrays of int* on the host, not pointers
    // to device memory.
    int *d_firstArray[rows][columns], *d_secondArray[rows][columns];

    // populate h_ arrays (Can do this bit myself)

    // Allocate memory on device, no idea how to do for 2D arrays.
    // Do memcopies to GPU, no idea how to do for 2D arrays.

    // NOTE(review): a single block of rows x columns = 300 x 200 threads is
    // requested here; a CUDA thread block is limited to 1024 threads in total.
    dim3 block(rows,columns);
    dim3 grid (1,1);
    myKernel<<<grid,block>>>(d_firstArray, d_secondArray, rows, columns);

    // Do memcopies back to host, no idea how to do for 2D arrays.

    // NOTE(review): nothing above allocates device memory, so these calls free
    // host array addresses rather than cudaMalloc results.
    cudaFree(d_firstArray);
    cudaFree(d_secondArray);

    return 0;
}

\uuuu全局\uuuuu无效myKernel（int*firstArray、int*secondArray、int行、int列）{
int row=blockIdx.x*blockDim.x+threadIdx.x；
int column=blockIdx.y*blockDim.y+threadIdx.y；
如果（行>=行| |列>=列）
返回；
//像在CPU上一样对阵列执行某些操作，例如：
firstArray[行][列]=行*2；
第二个数组[行[列]=行*3；
}
int main（）{
int行=300，列=200；
int h_firstArray[行][列]，h_secondArray[行][列]；
int*d_firstArray[行][列]，*d_secondArray[行][列]；
//填充h_u数组（我自己可以做这一点）
//在设备上分配内存，不知道如何处理2D阵列。
//做memcopies到GPU，不知道如何做二维阵列。
dim3块（行、列）；
dim3网格（1,1）；
myKernel（d_firstArray、d_secondArray、行、列）；
//不要将memcopies复制回主机，不知道如何处理二维阵列。
cudaFree（d_firstArray）；
cudaFree（d_二次数组）；
返回0；
}

编辑：有人问我，在我试图解决的问题中，数组宽度是否在编译时已知。你可以假设是的，因为我目前主要对这种特殊情况感兴趣。

在一般情况下（直到运行时才知道数组维度），在CUDA设备代码中处理双下标访问需要一个指针数组，就像在主机代码中一样。C和C++将每个下标作为一次指针解引用来处理，以便到达“2D数组”中的最终位置。一般情况下设备代码中的双指针/双下标访问，已在另一个答案中介绍过（原文此处的链接在转载时丢失）。这种做法有几个缺点，那个答案中已有说明，因此我在此不再重复。

但是，如果数组宽度在编译时已知（数组高度可以是动态的，即在运行时确定），那么我们可以利用编译器和语言类型机制来避免大多数缺点。您的代码演示了CUDA和/或C/C++使用的其他几种不正确模式：

在C或C++函数中，不能用简单的单指针类型（如 `int *firstArray`）来传递需要双下标访问的对象。

通过基于堆栈的机制分配大型主机阵列：

int h_firstArray[rows][columns], h_secondArray[rows][columns];

在C和C++中经常出现问题。这些是基于堆栈的变量，如果足够大，通常会超出堆栈大小限制。

CUDA threadblock的总线程数限制为1024个。因此，此类threadblock维度：

dim3 block(rows,columns);

除非 `rows` 和 `columns` 非常小（二者的乘积必须小于或等于1024），否则不起作用。

在CUDA中为设备数组声明指针变量时，创建指针数组几乎从来都不正确：

int *d_firstArray[rows][columns], *d_secondArray[rows][columns];

我们也不会在主机上分配空间，然后“重新分配”这些指针以供设备使用

下面是一个处理了上述各项的工作示例，演示了在编译时已知数组宽度的情况下使用上述方法：

$ cat t50.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Row width (number of columns) of the 2D arrays. It is a compile-time constant
// so that it can be used as the array bound in the row type below.
const int array_width = 200;

// One row of the 2D array: array_width ints. A `my_arr *` can be indexed as
// ptr[row][column] because the row size is part of the pointed-to type.
typedef int my_arr[array_width];

// Writes row*2 into every element of firstArray and row*3 into every element of
// secondArray, one element per thread.
//
// Launch layout: 2D grid of 2D blocks; the x dimension spans columns and the y
// dimension spans rows, so consecutive threadIdx.x values address consecutive
// ints within a row. The grid may overshoot the array; surplus threads do nothing.
//
//   firstArray, secondArray  pointers to rows of array_width ints
//   rows, columns            logical extent of the arrays to fill
__global__ void myKernel(my_arr *firstArray, my_arr *secondArray, int rows, int columns) {
    const int column = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Only threads that map onto a real element perform the stores.
    if (row < rows && column < columns) {
        firstArray[row][column] = 2 * row;
        secondArray[row][column] = 3 * row;
    }
}


// Prints a diagnostic naming the failed step when `err` is not cudaSuccess.
// Returns true when the call succeeded.
static bool cudaOk(cudaError_t err, const char *what) {
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "cuda error: %s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Checks that every element of `first` equals row*2 and every element of
// `second` equals row*3. Prints the first mismatch found and returns false.
static bool resultsMatch(my_arr *first, my_arr *second, int rows, int columns) {
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < columns; j++) {
            if (first[i][j] != i*2) {
                printf("first mismatch at %d,%d, was: %d, should be: %d\n", i,j,first[i][j], i*2);
                return false;
            }
            if (second[i][j] != i*3) {
                printf("second mismatch at %d,%d, was: %d, should be: %d\n", i,j,second[i][j], i*3);
                return false;
            }
        }
    }
    return true;
}

// Allocates two rows x array_width int arrays on host and device, runs myKernel
// over them, copies the results back and validates them.
// Returns 0 on success and -1 on any allocation, CUDA, or validation failure.
// All host and device buffers are released on every path.
int main() {
    const int rows = 300, columns = array_width;
    // Widen before multiplying so the byte count cannot overflow int.
    const size_t dsize = (size_t)rows * (size_t)columns * sizeof(int);

    my_arr *h_firstArray = (my_arr *)malloc(dsize);
    my_arr *h_secondArray = (my_arr *)malloc(dsize);
    my_arr *d_firstArray = NULL, *d_secondArray = NULL;
    int status = -1;

    // Single-pass block: any failure breaks out to the shared cleanup below.
    do {
        if (h_firstArray == NULL || h_secondArray == NULL) {
            fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)dsize);
            break;
        }

        // populate h_ arrays
        memset(h_firstArray, 0, dsize);
        memset(h_secondArray, 0, dsize);

        // Allocate memory on device
        if (!cudaOk(cudaMalloc(&d_firstArray, dsize), "cudaMalloc(d_firstArray)")) break;
        if (!cudaOk(cudaMalloc(&d_secondArray, dsize), "cudaMalloc(d_secondArray)")) break;

        // Do memcopies to GPU
        if (!cudaOk(cudaMemcpy(d_firstArray, h_firstArray, dsize, cudaMemcpyHostToDevice),
                    "copy of firstArray to device")) break;
        if (!cudaOk(cudaMemcpy(d_secondArray, h_secondArray, dsize, cudaMemcpyHostToDevice),
                    "copy of secondArray to device")) break;

        // 32x32 = 1024 threads per block; the grid is rounded up so that it
        // covers every column (x) and row (y).
        dim3 block(32,32);
        dim3 grid ((columns+block.x-1)/block.x,(rows+block.y-1)/block.y);
        myKernel<<<grid,block>>>(d_firstArray, d_secondArray, rows, columns);
        // Launch-configuration errors are reported here, right after the launch.
        if (!cudaOk(cudaGetLastError(), "myKernel launch")) break;

        // Do memcopies back to host. These blocking copies run after the kernel
        // on the default stream, so kernel execution errors surface in their
        // return codes.
        if (!cudaOk(cudaMemcpy(h_firstArray, d_firstArray, dsize, cudaMemcpyDeviceToHost),
                    "copy of firstArray to host")) break;
        if (!cudaOk(cudaMemcpy(h_secondArray, d_secondArray, dsize, cudaMemcpyDeviceToHost),
                    "copy of secondArray to host")) break;

        // validate
        if (!resultsMatch(h_firstArray, h_secondArray, rows, columns)) break;

        printf("success!\n");
        status = 0;
    } while (0);

    // cudaFree and free both accept null pointers, so partial setups are safe.
    cudaFree(d_firstArray);
    cudaFree(d_secondArray);
    free(h_firstArray);
    free(h_secondArray);

    return status;
}
$ nvcc -arch=sm_61 -o t50 t50.cu
$ cuda-memcheck ./t50
========= CUDA-MEMCHECK
success!
========= ERROR SUMMARY: 0 errors
$

$cat t50.cu
#包括
常数int数组_宽度=200；
typedef int my_arr[数组宽度]；
__全局无效myKernel（my_arr*firstArray，my_arr*secondArray，int行，int列）{
int column=blockIdx.x*blockDim.x+threadIdx.x；
int row=blockIdx.y*blockDim.y+threadIdx.y；
如果（行>=行| |列>=列）
返回；
//像在CPU上一样对阵列执行某些操作，例如：
firstArray[行][列]=行*2；
第二个数组[行][列]=行*3；
}
int main（）{
int行=300，列=数组的宽度；
my_arr*h_firstArray，*h_secondArray；
my_arr*d_firstArray，*d_secondArray；
大小=行*列*大小（int）；
h_firstArray=（my_arr*）malloc（dsize）；
h_secondArray=（my_arr*）malloc（dsize）；
//填充h_u数组
memset（h_firstArray，0，dsize）；
memset（h_secondArray，0，dsize）；
//在设备上分配内存
Cudamaloc（和d_firstArray，dsize）；
Cudamaloc（和d_secondArray，dsize）；
//是否将内存复制到GPU
cudaMemcpy（d_firstArray、h_firstArray、dsize、cudaMemcpyHostToDevice）；
cudaMemcpy（d_secondArray，h_secondArray，dsize，cudaMemcpyHostToDevice）；
dim3区块（32,32）；
dim3网格（（列+block.x-1）/block.x，（行+block.y-1）/block.y）；
myKernel（d_firstArray、d_secondArray、行、列）；
//是否将副本复制回主机
cudaMemcpy（h_firstArray、d_firstArray、dsize、cudaMemcpyDeviceToHost）；
cudaMemcpy（h_secondArray，d_secondArray，dsize，cudaMemcpyDeviceToHost）；
//证实
如果（cudaGetLastError（）！=cudaSuccess）{printf（“cuda error\n”）；返回-1；}
对于（int i=0；i


我已经颠倒了内核索引的含义