C++ CUDA并行编程_C++_Cuda - Fatal编程技术网

C++ CUDA并行编程

c++ cuda

C++ CUDA并行编程,c++,cuda,C++,Cuda,所以我已经看了好几遍了，但似乎还没弄明白。发生的事情是我试图从GPU内存复制到CPU内存的变量总是显示为空根据我的理解，我应该有一个或多个变量，并创建这些变量的副本，我会将其与一些数据一起发送到GPU进行计算，一旦计算完成，返回并将GPU中变量的内容插入CPU中的变量但是每次我这样做，我的变量'd_result'总是空的。如果有人知道如何解决这个问题，我们将不胜感激我的CUDA功能： __global__ void gpu_histogram_equalization(unsigned c

所以我已经看了好几遍了，但似乎还没弄明白。发生的事情是我试图从GPU内存复制到CPU内存的变量总是显示为空

根据我的理解，我应该有一个或多个变量，并创建这些变量的副本，我会将其与一些数据一起发送到GPU进行计算，一旦计算完成，返回并将GPU中变量的内容插入CPU中的变量

但是每次我这样做，我的变量'd_result'总是空的。如果有人知道如何解决这个问题，我们将不胜感激

我的CUDA功能：

__global__ void gpu_histogram_equalization(unsigned char * img_out, unsigned char * img_in,
                            int * hist_in, int img_size, int nbr_bin){

    int *lut = (int *)malloc(sizeof(int)*nbr_bin);
    int i, cdf, min, d;
    /* Construct the LUT by calculating the CDF */
    cdf = 0;
    min = 0;
    i = threadIdx.x;
    while(min == 0){
        min = hist_in[i++];
    }
    d = img_size - min;
    if(i < nbr_bin){
        cdf += hist_in[i];
        //lut[i] = (cdf - min)*(nbr_bin - 1)/d;
        lut[i] = (int)(((float)cdf - min)*255/d + 0.5);
        if(lut[i] < 0){
            lut[i] = 0;
        }
    }

    /* Get the result image */
    if(i < img_size){
        if(lut[img_in[i]] > 255){
            img_out[i] = 255;
        }
        else{
            img_out[i] = (unsigned char)lut[img_in[i]];
        }

    }
}

\uuuuu全局\uuuuuu无效gpu直方图\u均衡（无符号字符*img\u输出，无符号字符*img\u输入，
int*历史记录，int img\U尺寸，int nbr\U bin）{
int*lut=（int*）malloc（sizeof（int）*nbr_-bin）；
int i，cdf，min，d；
/*通过计算CDF构建LUT*/
cdf=0；
最小值=0；
i=threadIdx.x；
while（min==0）{
min=历史单位，单位为[i++]；
}
d=img_尺寸-最小值；
如果（i255）{
img_out[i]=255；
}
否则{
img_out[i]=（无符号字符）lut[img_in[i]；
}
}
}

然后是调用它的函数：

PGM_IMG gpu_contrast_enhancement_g(PGM_IMG img_in)
{
    PGM_IMG result;
    int hist[256];
    unsigned char * d_result;

    result.w = img_in.w;
    result.h = img_in.h;
    result.img = (unsigned char *)malloc(result.w * result.h * sizeof(unsigned char));

    cudaMalloc(&d_result, result.w * result.h * sizeof(unsigned char));

    cudaMemcpy(d_result, result.img, result.w * result.h * sizeof(unsigned char), cudaMemcpyHostToDevice);
    histogram(hist, img_in.img, img_in.h * img_in.w, 256);
    gpu_histogram_equalization<<<1,result.w * result.h * sizeof(unsigned char)>>>(d_result,img_in.img,hist,result.w*result.h, 256);

    cudaMemcpy(result.img, d_result, result.w * result.h * sizeof(unsigned char), cudaMemcpyDeviceToHost);
    cudaFree(d_result);

    return result;
}

PGM\U IMG gpu\U对比度增强（PGM\U IMG IMG\U in）
{
PGM_-IMG结果；
国际历史[256]；
无符号字符*d_结果；
结果w=img_in.w；
结果h=img_in.h；
result.img=（无符号字符*）malloc（result.w*result.h*sizeof（无符号字符））；
cudamaloc（&d_result，result.w*result.h*sizeof（unsigned char））；
cudaMemcpy（d_result，result.img，result.w*result.h*sizeof（unsigned char），cudaMemcpyHostToDevice）；
直方图（hist，img_in.img，img_in.h*img_in.w，256）；
gpu直方图均衡化（d_结果，img_in.img，hist，result.w*result.h，256）；
cudaMemcpy（result.img、d_result、result.w*result.h*sizeof（unsigned char）、cudaMemcpyDeviceToHost）；
cudaFree（d_结果）；
返回结果；
}

让我们看看这一行：

gpu_histogram_equalization<<<1,result.w*result.h*sizeof(unsigned char)>>>
        (d_result,img_in.img,hist,result.w*result.h, 256);

gpu\u直方图\u均衡化
（d_result，img_in.img，hist，result.w*result.h，256）；

以下是您遇到的一些问题：

img_in.img-这是主机内存

hist-这是主机内存

发生的情况是，您的内核由于无效的内存访问而崩溃

请阅读有关错误检查的内容。

如果您需要调试方面的帮助，您必须提供其他人可以编译和运行的最短、完整的示例，因为您提供的代码不够。此外，每个CUDAAPI调用都会返回一个状态，您应该检查它们是否存在运行时错误。如果使用cuda memcheck运行代码会发生什么？它是否报告了任何问题？另外，请编辑您的问题标题，使其描述您的问题。标题对于搜索来说很重要，而且你的标题完全没有说明你的问题实际上是什么事实上，如果你看一下你引用的代码中的内核参数，你会发现对于任何非平凡的图像大小，块尺寸都会非常大，这意味着内核甚至从未启动过。