使用CUDA C/C++；_Cuda - Fatal编程技术网

使用CUDA C/C++；

cuda

使用CUDA C/C++；,cuda,Cuda,我正在尝试使用CUDA C/C++编程应用高斯图像模糊。CPU部分工作正常，产生了良好的结果。但是，在GPU的情况下，它只产生一个黑色的图像。我不确定问题出在哪里。这是我的全部代码我如何解决这个问题 #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <iostream> #include <iomanip> #include <fstream&

我正在尝试使用CUDA C/C++编程应用高斯图像模糊。CPU部分工作正常，产生了良好的结果。但是，在GPU的情况下，它只产生一个黑色的图像。我不确定问题出在哪里。这是我的全部代码

我如何解决这个问题

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <iostream>
#include <iomanip>
#include <fstream>

#define IMW 1600
#define IMH 1600
#define CHANNEL_NUM 3
#define IMAGE_BUFFER_SIZE (IMW*IMH*CHANNEL_NUM)
#define BLOCKX 16
#define BLOCKY BLOCKX
#define BLUR_DEGREE 3
#define BLUR_SIZE 1
unsigned int width, height;

int hmask[3][3] = { 1, 2, 1,
2, 4, 2,
1, 2, 1
};

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL



__global__ void blurKernel(unsigned char * in, unsigned char * out, int w, int h) 
{ 
    int Col = blockIdx.x * blockDim.x + threadIdx.x; 
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    if (Col < w && Row < h) 
    { 
        int pixVal = 0; 
        int pixels = 0;
        // Get the average of the surrounding BLUR_SIZE x BLUR_SIZE box 
            for(int blurRow = -BLUR_SIZE; blurRow < BLUR_SIZE+1; ++blurRow) 
                { 
                    for(int blurCol = -BLUR_SIZE; blurCol < BLUR_SIZE+1; ++blurCol) 
                    {
                        int curRow = Row + blurRow; 
                        int curCol = Col + blurCol; 

                        // Verify we have a valid image pixel 
                        if(curRow > -1 && curRow < h && curCol > -1 && curCol < w) 
                            { 
                                pixVal += in[curRow * w + curCol]; 
                                pixels++; // Keep track of number of pixels in the avg 
                            } 
                    } 
                } 
            // Write our new pixel value out 
        out[Row * w + Col] = (unsigned char)(pixVal / pixels); 
    } 
}

int  main(int argc, char **argv)
{
/************ Setup work ***********************/
  unsigned char *d_resultPixels;
  unsigned char *h_resultPixels;
  unsigned char *h_devicePixels;

  unsigned char *h_pixels = NULL;
  unsigned char *d_pixels = NULL;



  int nBlurDegree;
  int imageSize = sizeof(unsigned char) * IMAGE_BUFFER_SIZE;

  h_pixels = (unsigned char *)malloc(imageSize);
  h_resultPixels = (unsigned char *)malloc(imageSize);
  h_devicePixels = (unsigned char *)malloc(imageSize);

  int width1, height1, bpp;

  h_pixels = stbi_load("rana_1600_1600.png", &width1, &height1, &bpp, CHANNEL_NUM);

  width = width1;
  height = height1;

  printf("Image size: %u\n", imageSize);
  printf("Image width: %u\n", width);
  printf("Image height: %u\n", height);

  //memcpy(h_devicePixels, h_pixels, imageSize);



/************************** Start host processing ************************/
  unsigned long long cputime = dtime_usec(0);
  // cpu code here.....
  cputime = dtime_usec(cputime);

  stbi_write_png("host_output.png", width, height, CHANNEL_NUM, h_resultPixels, width*CHANNEL_NUM);

/************************** End host processing **************************/

/************************** Start device processing **********************/

  // allocate memory on the GPU for the output image
  cudaMalloc((void**)&d_pixels, imageSize);
  cudaMalloc((void**)&d_resultPixels, imageSize);

  cudaMemcpy(d_pixels, h_pixels, imageSize, cudaMemcpyHostToDevice);
  checkCUDAError("CUDA memcpy to device");

  dim3 blocksPerGrid(IMW / 16, 1);
  dim3 threadsPerBlock(16, 1);

  unsigned long long gputime = dtime_usec(0);

  for (nBlurDegree = 0; nBlurDegree < BLUR_DEGREE; nBlurDegree++)
  {
    cudaMemset(d_resultPixels, 0, imageSize);

    blurKernel << <blocksPerGrid, threadsPerBlock >> >(d_pixels, d_resultPixels, width, height);

    cudaMemcpy(d_pixels, d_resultPixels, imageSize, cudaMemcpyDeviceToDevice);

    cudaThreadSynchronize();
  }

  cudaDeviceSynchronize();
  gputime = dtime_usec(gputime);


  cudaMemcpy(h_devicePixels, d_resultPixels, imageSize, cudaMemcpyDeviceToHost);


  printf("GPU time: %f seconds, CPU time: %f seconds\n", gputime/(float)USECPSEC, cputime/(float)USECPSEC);

  printf("Speedup: %f\n", (cputime/(float)USECPSEC)/(gputime/(float)USECPSEC));

  validate(h_pixels, h_devicePixels, imageSize);

  stbi_write_png("device_output.png", width, height, CHANNEL_NUM, h_devicePixels, width*CHANNEL_NUM);


/************************** End device processing ************************/

// Release resources
  cudaFree(d_pixels);
  cudaFree(d_resultPixels);

  //stbi_image_free(h2_pixels);

  free(h_devicePixels);
  free(h_pixels);
  free(h_resultPixels);

  return 0;
} // End main

#包括
#包括
#包括
#包括
#包括
#包括
#定义IMW 1600
#定义IMH 1600
#定义通道编号3
#定义图像缓冲区大小（IMW*IMH*CHANNEL\u NUM）
#定义blockx16
#定义BLOCKY BLOCKX
#定义模糊度为3
#定义模糊_大小1
无符号整数宽度、高度；
int-hmask[3][3]={1,2,1，
2, 4, 2,
1, 2, 1
};
#包括
#包括
#定义USECPSEC 10000000ull
__全局函数（无符号字符*输入，无符号字符*输出，整数w，整数h）
{ 
int Col=blockIdx.x*blockDim.x+threadIdx.x；
int Row=blockIdx.y*blockDim.y+threadIdx.y；
如果（列-1&&curRow-1&&curCol（d_像素、d_结果像素、宽度、高度）；
cudaMemcpy（d_像素、d_结果像素、图像大小、cudaMemcpyDeviceToDevice）；
cudaThreadSynchronize（）；
}
cudaDeviceSynchronize（）；
gputime=dtime\u usec（gputime）；
cudaMemcpy（h_设备像素、d_结果像素、图像大小、cudaMemcpyDeviceToHost）；
printf（“GPU时间：%f秒，CPU时间：%f秒\n”，gputime/（float）USECPSEC，cputime/（float）USECPSEC）；
printf（“加速比：%f\n”，（cputime/（float）USECPSEC）/（gputime/（float）USECPSEC））；
验证（h_像素、h_设备像素、图像大小）；
stbi_write_png（“device_output.png”，宽度、高度、通道数、h_设备像素、宽度*通道数）；
/**************************终端设备处理************************/
//释放资源
cudaFree（d_像素）；
cudaFree（d_结果像素）；
//无stbi_图像（h2_像素）；
免费（h_设备像素）；
自由像素（h_像素）；
免费（h_结果像素）；
返回0；
}//末端总管

我需要关于如何获得GPU输出图像“device_output.png”的帮助