CUDA内核和2D阵列-它是如何工作的？_C_Cuda_Nvidia

CUDA内核和2D阵列-它是如何工作的？

c cuda

CUDA内核和2D阵列-它是如何工作的？,c,cuda,nvidia,C,Cuda,Nvidia,我正在做一个图像旋转的方法。它需要两个矩阵和一个旋转度。它按度数旋转原始矩阵，并将其存储到旋转后的矩阵中。我有以下“正常”代码（用于CPU-取自此站点-），它正常工作 static void RotateImage(unsigned char original[RAW_HEIGHT][RAW_WIDTH] , unsigned char rotated[RAW_HEIGHT][RAW_WIDTH] , int degrees) { double centerX = RAW_WIDTH/

我正在做一个图像旋转的方法。它需要两个矩阵和一个旋转度。它按度数旋转原始矩阵，并将其存储到旋转后的矩阵中。我有以下“正常”代码（用于CPU-取自此站点-），它正常工作

/*
 * RotateImage - CPU reference: fill `rotated` with `original` turned by
 * `degrees`.
 *
 * For every destination element the offset from the pivot is converted to a
 * direction/magnitude pair (calculateDirection / calculateMagnitude), the
 * direction is reduced by `degrees`, and the pair is converted back
 * (calculateX / calculateY) to find the source element. Destination elements
 * whose source falls outside the array are left untouched.
 *
 * original : source image, RAW_HEIGHT rows of RAW_WIDTH bytes.
 * rotated  : destination image, same shape.
 * degrees  : rotation angle; presumably in degrees, as the name suggests --
 *            the unit is defined by calculateX/calculateY (not shown here).
 *
 * NOTE(review): the first index runs over RAW_HEIGHT but is centred with
 * RAW_WIDTH/2 (and the second index the other way round). For a non-square
 * image the pivot is therefore not the middle of the array -- confirm this is
 * intended.
 */
static void RotateImage(unsigned char original[RAW_HEIGHT][RAW_WIDTH] , unsigned char rotated[RAW_HEIGHT][RAW_WIDTH] , int degrees)
{
    const double centerX = RAW_WIDTH/2;
    const double centerY = RAW_HEIGHT/2;

    for (int row = 0; row < RAW_HEIGHT; ++row)
    {
        /* Offset along the first index is constant for the whole row. */
        const double offsetX = row - centerX;

        for (int col = 0; col < RAW_WIDTH; ++col)
        {
            const double offsetY = col - centerY;

            /* Polar form of the destination offset, turned back by `degrees`. */
            double angle = calculateDirection(offsetX, offsetY);
            const double radius = calculateMagnitude(offsetX, offsetY);
            angle -= degrees;

            const int srcRow = (int)(centerX + calculateX(angle, radius));
            const int srcCol = (int)(centerY + calculateY(angle, radius));

            /* Skip destinations whose source lies outside the image. */
            if (srcRow < 0 || srcRow >= RAW_HEIGHT || srcCol < 0 || srcCol >= RAW_WIDTH)
            {
                continue;
            }

            rotated[row][col] = original[srcRow][srcCol];
        }
    }
}

static void RotateImage（无符号字符原始[原始高度][原始宽度]，无符号字符旋转[原始高度][原始宽度]，整数度）
{
双中心X=原始宽度/2；
双中心=原始高度/2；
对于（int x=0；x<原始高度；x++）
{
对于（int y=0；y=0和&origX=0和&origY


我想将此代码转换为CUDA代码。以下是我的版本：
/*
 * Image dimensions in array elements (bytes).
 *
 * RAW_WIDTH is the product 1600*3 (presumably 1600 pixels x 3 bytes per
 * pixel -- TODO confirm). The expansion is parenthesised so the macro behaves
 * as a single value inside any expression: without the parentheses
 * `n / RAW_WIDTH` would expand to `n / 1600 * 3`.
 */
#define RAW_WIDTH (1600*3)
#define RAW_HEIGHT 1200

/* Device buffers for the source and the rotated image; the host code
 * allocates them with cudaMalloc (RAW_HEIGHT*RAW_WIDTH bytes each). */
unsigned char *dev_original_image;
unsigned char *dev_rotated_image;

/*
 * rotatePicture -- first CUDA port of the CPU RotateImage routine (the
 * version the question reports as NOT matching the CPU output).
 *
 * Each thread of a 2D launch computes one destination element: it converts
 * its (x, y) offset from a fixed centre to an angle/magnitude pair, subtracts
 * `degrees` from the angle, converts back to a source position and copies
 * that source byte.
 *
 * original : device pointer to the source image bytes.
 * rotated  : device pointer to the destination image bytes.
 * degrees  : rotation angle in degrees (converted to radians below).
 */
__global__ void rotatePicture(unsigned char *original, unsigned char *rotated, int degrees)
{
    // Global 2D coordinates of this thread within the whole launch.
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;
    // Flattened destination index. The row stride is the number of threads
    // launched along x, not RAW_WIDTH.
    // NOTE(review): this matches the image layout only when the launch is
    // exactly RAW_WIDTH threads wide (e.g. 450 blocks x 8 threads = 3600,
    // while RAW_WIDTH = 1600*3 = 4800). There is also no check that (x, y)
    // lies inside the image before the write at the end.
    int offset_rotated = x + y * blockDim.x * gridDim.x;

    // Hard-coded rotation centre; equals RAW_WIDTH/2 and RAW_HEIGHT/2 for the
    // 4800 x 1200 dimensions defined for this kernel.
    double centerX = 2400.0;
    double centerY = 600.0;

    // Polar form of the offset from the centre: angle in degrees, length in
    // elements.
    double dir = (atan2(y-centerY,x-centerX))*180/3.14159265;
    double mag = sqrt((x-centerX)*(x-centerX) + (y-centerY)*(y-centerY));

    // Turn the destination angle back by the requested rotation.
    dir = dir - degrees;

    // Back to Cartesian (degrees -> radians first); the cast truncates toward
    // zero.
    int origX = (int)(centerX + cos((dir*3.14159265/180)) * mag);
    int origY = (int)(centerY + sin((dir*3.14159265/180)) * mag);
    // Flattened source index, using the same launch-width stride as above.
    int offset_original = origX + origY * blockDim.x * gridDim.x;

    // Only the flattened offset is range-checked.
    // NOTE(review): an origX outside [0, width) can still yield an in-range
    // offset (it lands in a neighbouring row), and `> 0` rejects offset 0,
    // i.e. the very first source element.
    if(offset_original > 0 && offset_original < RAW_HEIGHT*RAW_WIDTH)
        *(rotated + offset_rotated) = *(original + offset_original);
}

#定义原始宽度1600*3
#定义原始高度1200
无符号字符*dev_原始图像；
无符号字符*dev_旋转图像；
__全局_uu; void rotatePicture（无符号字符*原始，无符号字符*旋转，整数度）
{
int x=threadIdx.x+blockDim.x*blockIdx.x；
int y=线程IDX.y+块DIM.y*块IDX.y；
int offset_rotated=x+y*blockDim.x*gridDim.x；
双中心X=2400.0；
双中心=600.0；
双方向=（atan2（y-centerY，x-centerX））*180/3.14159265；
双磁位=sqrt（（x-centerX）*（x-centerX）+（y-centerY）*（y-centerY））；
dir=dir-度；
int origX=（int）（centerX+cos（（dir*3.14159265/180））*mag）；
国际货币基金组织=（国际货币基金组织）（centerY+sin（（dir*3.14159265/180））*mag）；
int offset_original=origX+origY*blockDim.x*gridDim.x；
如果（偏移原始>0和偏移原始<原始高度*原始宽度）
*（旋转+偏移\旋转）=*（原始+偏移\原始）；
}

但它并没有给我和CPU部分相同的结果。
我认为问题出在传递给CUDA内核的参数上。我将它们作为2D数组传递，这样可以吗？有人能给我解释一下吗？
以下是我的内核配置和调用：
// Launch geometry: BlockPerGrid is the grid size in blocks, ThreadsPerGrid the
// block size in threads (the third dim3 component defaults to 1).
// NOTE(review): this covers 450*8 = 3600 threads in x and 400*4 = 1600 in y,
// whereas the buffers below hold RAW_WIDTH x RAW_HEIGHT = 4800 x 1200 bytes.
dim3 BlockPerGrid(450, 400);
dim3 ThreadsPerGrid(8, 4);

// One byte per image element; both device buffers have this size.
const size_t imageBytes = sizeof(unsigned char)*RAW_HEIGHT*RAW_WIDTH;

// Source image: allocate on the device and upload the host copy.
cudaMalloc((void**)&dev_original_image, imageBytes);
cudaMemcpy(dev_original_image, raw_image2D, imageBytes, cudaMemcpyHostToDevice);

// Destination image: uploaded as well, so elements the kernel does not write
// keep their host-side contents.
cudaMalloc((void**)&dev_rotated_image, imageBytes);
cudaMemcpy(dev_rotated_image, raw_image2D_rotated, imageBytes, cudaMemcpyHostToDevice);

// NOTE(review): return codes of the calls above and of the launch are not
// checked.
rotatePicture<<<BlockPerGrid, ThreadsPerGrid>>>(dev_original_image, dev_rotated_image, deg);

dim3块状网格（450400,1）；
dim3螺纹棒（8,4,1）；
cudamaloc（（void**）和dev_原始图像，sizeof（无符号字符）*原始高度*原始宽度）；
cudamaloc（（void**）和dev_旋转图像，sizeof（无符号字符）*原始高度*原始宽度）；
cudaMemcpy（dev_原始图像，raw_图像2d，sizeof（无符号字符）*raw_高度*raw_宽度，cudaMemcpyHostToDevice）；
cudaMemcpy（dev_rotated_image，raw_image2D_rotated，sizeof（unsigned char）*raw_HEIGHT*raw_WIDTH，cudaMemcpyHostToDevice）；
旋转图像（dev_原始图像，dev_旋转图像，度）；

谢谢你的建议
注意：我修改了我的代码，效果更好了，但仍然不正确。下面给出解决方案，供其他可能遇到同样问题的人参考。
以下是我正确的内核：
/*
 * rotatePicture -- rotate a RAW_WIDTH x RAW_HEIGHT byte image about its
 * centre, one thread per destination element.
 *
 * Launch layout: 2D grid of 2D blocks; thread (x, y) handles column x of
 * row y. The grid may be larger than the image -- surplus threads return
 * without touching memory -- but it must cover RAW_WIDTH x RAW_HEIGHT for the
 * whole image to be processed. No shared memory is used.
 *
 * original : device pointer to the source image; assumed tightly packed,
 *            RAW_WIDTH bytes per row, RAW_HEIGHT rows.
 * rotated  : device pointer to the destination image, same layout. Elements
 *            whose source position falls outside the image are left
 *            unchanged.
 * degrees  : rotation angle in degrees.
 */
__global__ void rotatePicture(unsigned char *original, unsigned char *rotated, int degrees)
{
    // Parenthesised so the arithmetic below is correct however the macros
    // are spelled.
    const int width  = (RAW_WIDTH);
    const int height = (RAW_HEIGHT);

    const int x = threadIdx.x + blockDim.x * blockIdx.x;
    const int y = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads outside the image must not write: the launch grid is allowed
    // to over-cover the image.
    if (x >= width || y >= height)
        return;

    // Rotation centre; integer halving, like the CPU reference
    // (2400 / 600 for the 4800 x 1200 image).
    const double centerX = width / 2;
    const double centerY = height / 2;

    const double dx = x - centerX;
    const double dy = y - centerY;

    // Polar form of the destination offset: angle in degrees, length in
    // elements.
    double dir = (atan2(dx, dy)) * 180 / 3.14159265;
    const double mag = sqrt(dx * dx + dy * dy);

    // Turn the destination angle back by the requested rotation.
    dir = dir - degrees;

    // Back to Cartesian (degrees -> radians first); the cast truncates
    // toward zero.
    const double rad = dir * 3.14159265 / 180;
    const int origX = (int)(centerX + sin(rad) * mag);
    const int origY = (int)(centerY + cos(rad) * mag);

    // Row stride is the image width, not the launch width, so indexing stays
    // correct for any grid that covers the image. The lower bounds are
    // inclusive so that source column 0 and row 0 are copied too, as in the
    // CPU reference.
    if (origX >= 0 && origX < width && origY >= 0 && origY < height)
        rotated[x + y * width] = original[origX + origY * width];
}

因此，它的功能与上面的CPU版本相同，但使用的是GPU资源。请享用！
您在所有CUDA调用中都做了什么？RAW_HEIGHT和RAW_WIDTH的尺寸是多少？您对dev_original_image和dev_rotated_image的定义是什么，它们只是unsigned char*还是其他类型？代码不应该能编译：dev_original_image和dev_rotated_image只有一级间接寻址，而内核的参数有两级间接寻址。@Robert Crovella 我添加了您指定的那些行。@sgar91 它可以编译，但无法像我想象的那样工作。您可能会感兴趣。我给出的第一个代码示例显示了如何将多维数组传递给CUDA内核。@Robert，你能告诉我是否需要将cudaMallocPitch()与cudaMemcpy2D()一起使用，还是这种方法就可以？在哪里可以学习cudaMallocPitch()/cudaMemcpy2D()？谢谢
// Corrected launch geometry: 600*8 = 4800 threads in x and 300*4 = 1200 in y,
// i.e. exactly one thread per element of the 4800 x 1200 (RAW_WIDTH x
// RAW_HEIGHT) image. The third dim3 component defaults to 1.
dim3 BlockPerGrid(600, 300);
dim3 ThreadsPerGrid(8, 4);