C++ 尝试使用cuda进行图像模糊_C++_Cuda_Gpu_Gpgpu

C++ 尝试使用cuda进行图像模糊

c++ cuda

C++ 尝试使用cuda进行图像模糊,c++,cuda,gpu,gpgpu,C++,Cuda,Gpu,Gpgpu,我正在尝试使用cuda模糊图像，使用stbi_图像作为库来加载和保存图像。编译代码时没有错误，但当我试图查看结果时，它只是一个空白图像。这就是代码 #include "lodepng.h" #define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_WRITE_IMPLEMENTATION #include "stb_image.h" #include "stb_image_write.h" #

我正在尝试使用cuda模糊图像，使用stbi_图像作为库来加载和保存图像。编译代码时没有错误，但当我试图查看结果时，它只是一个空白图像。这就是代码

#include "lodepng.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image.h"
#include "stb_image_write.h"
#include <iostream>
#define BLUR_SIZE 7
#define R 0
#define G 1
#define B 2

__global__ void blurKernel(unsigned char* in, unsigned char* out, int width, int height, int num_channel, int channel) {

  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  if(col < width && row < height) {
    int pixVal = 0;
    int pixels = 0;

    for(int blurRow = -BLUR_SIZE; blurRow < BLUR_SIZE + 1; ++blurRow) {
      for(int blurCol = -BLUR_SIZE; blurCol < BLUR_SIZE + 1; ++blurCol) {
        int curRow = row + blurRow;
        int curCol = col + blurCol;
        if(curRow > -1 && curRow < height && curCol > -1 && curCol < width) {
          pixVal += in[curRow * width * num_channel + curCol * num_channel + channel];
          pixels++;
        }
      }
    }
    out[row * width * num_channel + col * num_channel + channel] = (unsigned char)(pixVal/pixels);
  }
}

int main() {

  int width, height,n;
  unsigned char *image = stbi_load("image4.png",&width,&height,&n,0);
  unsigned char *output = (unsigned char*)malloc(width * height * n *sizeof(unsigned char));
  
  unsigned char* Dev_Input_Image = NULL;
  unsigned char* Dev_Output_Image = NULL;
  cudaMalloc((void**)&Dev_Input_Image, sizeof(unsigned char)* height * width * n);
  cudaMalloc((void**)&Dev_Output_Image, sizeof(unsigned char)* height * width * n);

  cudaMemcpy(Dev_Input_Image, image, sizeof(unsigned char) * height * width * n, cudaMemcpyHostToDevice);

  //kernel call
  dim3 blockSize(16, 16, 1);
  dim3 gridSize(width/blockSize.x, height/blockSize.y,1);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,R);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,G);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,B);
  
  cudaDeviceSynchronize();

    cudaMemcpy(image, Dev_Output_Image, sizeof(unsigned char) * height * width * n, cudaMemcpyDeviceToHost);

  stbi_write_png("output_stbimage.png", width, height, n, image, width * n);

  cudaFree(Dev_Input_Image);
  cudaFree(Dev_Output_Image);

  return 0;
}

#包括“lodepng.h”
#定义机顶盒映像的实现
#定义机顶盒映像写入实现
#包括“stb_image.h”
#包括“stb_image_write.h”
#包括
#定义模糊_大小7
#定义r0
#定义g1
#定义b2
__全局字符（无符号字符*输入，无符号字符*输出，整数宽度，整数高度，整数个通道，整数通道）{
int col=blockIdx.x*blockDim.x+threadIdx.x；
int row=blockIdx.y*blockDim.y+threadIdx.y；
if（列<宽度&行<高度）{
int-pixVal=0；
整数像素=0；
对于（int blurRow=-BLUR\u SIZE；blurRow-1&&curRow-1&&curCol


我尝试了所有可能的方法，但我无法达到我的目的。我试着用一种串行的方式来做，它完全可以用相同的逻辑工作（我的意思是在每个通道上处理模糊）。希望有人能帮助我
您的代码有一个问题，就是您没有在内核代码（或其他任何地方）中将alpha通道从输入复制（或设置）到输出图像。alpha通道实际上未初始化。如果它恰好结束于零，那么无论其他通道如何，您都不会在输出图片中看到任何有趣的内容
当我像这样修复您的代码时：
#include "lodepng.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image.h"
#include "stb_image_write.h"
#include <iostream>
#define BLUR_SIZE 7
#define R 0
#define G 1
#define B 2
#define A 3

__global__ void blurKernel(unsigned char* in, unsigned char* out, int width, int height, int num_channel, int channel, int copy_A) {

  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  if(col < width && row < height) {
    int pixVal = 0;
    int pixels = 0;
    if (copy_A)
      out[row*width*num_channel+col*num_channel+A] = in[row*width*num_channel+col*num_channel+A];
    for(int blurRow = -BLUR_SIZE; blurRow < BLUR_SIZE + 1; ++blurRow) {
      for(int blurCol = -BLUR_SIZE; blurCol < BLUR_SIZE + 1; ++blurCol) {
        int curRow = row + blurRow;
        int curCol = col + blurCol;
        if(curRow > -1 && curRow < height && curCol > -1 && curCol < width) {
          pixVal += in[curRow * width * num_channel + curCol * num_channel + channel];
          pixels++;
        }
      }
    }
    out[row * width * num_channel + col * num_channel + channel] = (unsigned char)(pixVal/pixels);
  }
}

int main() {

  int width, height,n;
  unsigned char *image = stbi_load("image4.png",&width,&height,&n,0);
  unsigned char *output = (unsigned char*)malloc(width * height * n *sizeof(unsigned char));
  unsigned char* Dev_Input_Image = NULL;
  unsigned char* Dev_Output_Image = NULL;
  cudaMalloc((void**)&Dev_Input_Image, sizeof(unsigned char)* height * width * n);
  cudaMalloc((void**)&Dev_Output_Image, sizeof(unsigned char)* height * width * n);

  cudaMemcpy(Dev_Input_Image, image, sizeof(unsigned char) * height * width * n, cudaMemcpyHostToDevice);

  //kernel call
  dim3 blockSize(16, 16, 1);
  dim3 gridSize(width/blockSize.x, height/blockSize.y,1);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,R,0);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,G,0);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,B,1);
  
  cudaDeviceSynchronize();

    cudaMemcpy(image, Dev_Output_Image, sizeof(unsigned char) * height * width * n, cudaMemcpyDeviceToHost);
  cudaFree(Dev_Input_Image);
  cudaFree(Dev_Output_Image);
  stbi_write_png("output_stbimage.png", width, height, n, image, width * n);


  return 0;
}

#包括“lodepng.h”
#定义机顶盒映像的实现
#定义机顶盒映像写入实现
#包括“stb_image.h”
#包括“stb_image_write.h”
#包括
#定义模糊_大小7
#定义r0
#定义g1
#定义b2
#定义一个3
__全局函数（无符号字符*输入，无符号字符*输出，整数宽度，整数高度，整数个通道，整数通道，整数拷贝A）{
int col=blockIdx.x*blockDim.x+threadIdx.x；
int row=blockIdx.y*blockDim.y+threadIdx.y；
if（列<宽度&行<高度）{
int-pixVal=0；
整数像素=0；
如果（复印件A）
out[row*width*num_channel+col*num_channel+A]=in[row*width*num_channel+col*num_channel+A]；
对于（int blurRow=-BLUR\u SIZE；blurRow-1&&curRow-1&&curCol

并使用以下命令编译和运行它：

我得到的输出图片如下所示：
#include "lodepng.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image.h"
#include "stb_image_write.h"
#include <iostream>
#define BLUR_SIZE 7
#define R 0
#define G 1
#define B 2
#define A 3

__global__ void blurKernel(unsigned char* in, unsigned char* out, int width, int height, int num_channel, int channel, int copy_A) {

  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  if(col < width && row < height) {
    int pixVal = 0;
    int pixels = 0;
    if (copy_A)
      out[row*width*num_channel+col*num_channel+A] = in[row*width*num_channel+col*num_channel+A];
    for(int blurRow = -BLUR_SIZE; blurRow < BLUR_SIZE + 1; ++blurRow) {
      for(int blurCol = -BLUR_SIZE; blurCol < BLUR_SIZE + 1; ++blurCol) {
        int curRow = row + blurRow;
        int curCol = col + blurCol;
        if(curRow > -1 && curRow < height && curCol > -1 && curCol < width) {
          pixVal += in[curRow * width * num_channel + curCol * num_channel + channel];
          pixels++;
        }
      }
    }
    out[row * width * num_channel + col * num_channel + channel] = (unsigned char)(pixVal/pixels);
  }
}

int main() {

  int width, height,n;
  unsigned char *image = stbi_load("image4.png",&width,&height,&n,0);
  unsigned char *output = (unsigned char*)malloc(width * height * n *sizeof(unsigned char));
  unsigned char* Dev_Input_Image = NULL;
  unsigned char* Dev_Output_Image = NULL;
  cudaMalloc((void**)&Dev_Input_Image, sizeof(unsigned char)* height * width * n);
  cudaMalloc((void**)&Dev_Output_Image, sizeof(unsigned char)* height * width * n);

  cudaMemcpy(Dev_Input_Image, image, sizeof(unsigned char) * height * width * n, cudaMemcpyHostToDevice);

  //kernel call
  dim3 blockSize(16, 16, 1);
  dim3 gridSize(width/blockSize.x, height/blockSize.y,1);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,R,0);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,G,0);
  blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height,n,B,1);
  
  cudaDeviceSynchronize();

    cudaMemcpy(image, Dev_Output_Image, sizeof(unsigned char) * height * width * n, cudaMemcpyDeviceToHost);
  cudaFree(Dev_Input_Image);
  cudaFree(Dev_Output_Image);
  stbi_write_png("output_stbimage.png", width, height, n, image, width * n);


  return 0;
}

内核中是否存在任何运行时错误？如果原始图像有4个通道，可能需要复制alpha通道。我没有运行时错误，并且我还尝试将alpha通道设置为最大值，以便为图像提供非传输背景看起来正常，在小错误的情况下，它至少应该产生一些东西。尝试将刚加载的图像保存到新文件并进行检查。然后在CUDA中只从输入复制到输出。。。