Warning: file_get_contents(/data/phpspider/zhask/data//catemap/6/cplusplus/138.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
C++ 尝试使用cuda进行图像模糊_C++_Cuda_Gpu_Gpgpu - Fatal编程技术网

C++ 尝试使用cuda进行图像模糊

C++ 尝试使用cuda进行图像模糊,c++,cuda,gpu,gpgpu,C++,Cuda,Gpu,Gpgpu,我正在尝试使用cuda模糊图像,使用stbi_图像作为库来加载和保存图像。编译代码时没有错误,但当我试图查看结果时,它只是一个空白图像。这就是代码 #include "lodepng.h" #define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_WRITE_IMPLEMENTATION #include "stb_image.h" #include "stb_image_write.h" #

我正在尝试使用 CUDA 对图像进行模糊处理,并使用 stb_image 库来加载和保存图像。编译代码时没有任何错误,但当我查看结果时,得到的只是一张空白图像。代码如下:

#include "lodepng.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image.h"
#include "stb_image_write.h"
#include <iostream>
#define BLUR_SIZE 7
#define R 0
#define G 1
#define B 2

// Box-blurs a single channel of an interleaved 8-bit image.
//
// Each thread derives one pixel position from its 2D block/thread indices and
// averages `channel` over the (2*BLUR_SIZE+1) x (2*BLUR_SIZE+1) window centred
// on that pixel. Window positions that fall outside the image are skipped, so
// pixels near the border average fewer samples. Only the byte belonging to
// `channel` is written to `out`; every other byte of `out` is left untouched.
__global__ void blurKernel(unsigned char* in, unsigned char* out, int width, int height, int num_channel, int channel) {

  const int x = blockIdx.x * blockDim.x + threadIdx.x;
  const int y = blockIdx.y * blockDim.y + threadIdx.y;

  // Threads that land beyond the image edge have no pixel to process.
  if(x >= width || y >= height) {
    return;
  }

  int sum = 0;
  int count = 0;

  for(int dy = -BLUR_SIZE; dy <= BLUR_SIZE; ++dy) {
    const int ny = y + dy;
    if(ny < 0 || ny >= height) {
      continue;
    }
    for(int dx = -BLUR_SIZE; dx <= BLUR_SIZE; ++dx) {
      const int nx = x + dx;
      if(nx < 0 || nx >= width) {
        continue;
      }
      sum += in[ny * width * num_channel + nx * num_channel + channel];
      ++count;
    }
  }

  // count is at least 1: the centre pixel itself always passes the bounds test.
  out[y * width * num_channel + x * num_channel + channel] = (unsigned char)(sum / count);
}

// Loads "image4.png", launches blurKernel once for each of the channels R, G
// and B, and writes the device result to "output_stbimage.png".
//
// NOTE: none of the launches writes channel index 3 and Dev_Output_Image is not
// filled before the launches, so if the image has four channels the fourth byte
// of every output pixel is whatever the allocation happened to contain.
// The results of stbi_load and of the CUDA calls are not inspected here.
int main() {

  int width, height, n;
  unsigned char *image = stbi_load("image4.png", &width, &height, &n, 0);
  // Host buffer that is allocated here but not referenced again in this function.
  unsigned char *output = (unsigned char*)malloc(width * height * n *sizeof(unsigned char));

  // Size in bytes of one interleaved image, shared by every device allocation/copy.
  const size_t imageBytes = sizeof(unsigned char) * height * width * n;

  unsigned char* Dev_Input_Image = NULL;
  unsigned char* Dev_Output_Image = NULL;
  cudaMalloc((void**)&Dev_Input_Image, imageBytes);
  cudaMalloc((void**)&Dev_Output_Image, imageBytes);

  cudaMemcpy(Dev_Input_Image, image, imageBytes, cudaMemcpyHostToDevice);

  // One 16x16 block per tile; the integer division drops any trailing partial tile.
  dim3 blockSize(16, 16, 1);
  dim3 gridSize(width/blockSize.x, height/blockSize.y, 1);

  // One launch per colour channel, in R, G, B order.
  const int channels[] = {R, G, B};
  for (int i = 0; i < 3; ++i) {
    blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height, n, channels[i]);
  }

  cudaDeviceSynchronize();

  // The blurred data overwrites the host copy of the loaded image.
  cudaMemcpy(image, Dev_Output_Image, imageBytes, cudaMemcpyDeviceToHost);

  stbi_write_png("output_stbimage.png", width, height, n, image, width * n);

  cudaFree(Dev_Input_Image);
  cudaFree(Dev_Output_Image);

  return 0;
}
#包括“lodepng.h”
#定义机顶盒映像的实现
#定义机顶盒映像写入实现
#包括“stb_image.h”
#包括“stb_image_write.h”
#包括
#定义模糊_大小7
#定义r0
#定义g1
#定义b2
__全局字符(无符号字符*输入,无符号字符*输出,整数宽度,整数高度,整数个通道,整数通道){
int col=blockIdx.x*blockDim.x+threadIdx.x;
int row=blockIdx.y*blockDim.y+threadIdx.y;
if(列<宽度&行<高度){
int-pixVal=0;
整数像素=0;
对于(int blurRow=-BLUR\u SIZE;blurRow-1&&curRow-1&&curCol

我尝试了所有可能的方法,但我无法达到我的目的。我试着用一种串行的方式来做,它完全可以用相同的逻辑工作(我的意思是在每个通道上处理模糊)。希望有人能帮助我

您的代码有一个问题:您没有在内核代码(或其他任何地方)中将 alpha 通道从输入图像复制(或设置)到输出图像,因此输出图像的 alpha 通道实际上是未初始化的。如果它的值恰好为零,那么无论其他通道的内容如何,您在输出图片中都看不到任何有意义的内容。

当我像这样修复您的代码时:

#include "lodepng.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image.h"
#include "stb_image_write.h"
#include <iostream>
#define BLUR_SIZE 7
#define R 0
#define G 1
#define B 2
#define A 3

// Box-blurs a single channel of an interleaved 8-bit image and optionally
// carries the alpha channel over unchanged.
//
// Launch layout: a 2D grid of 2D blocks, one thread per pixel. Threads whose
// (col, row) falls outside the image return immediately, so the grid may
// overshoot the image size.
//
// Parameters:
//   in, out      device-accessible buffers of width * height * num_channel bytes
//   width/height image size in pixels
//   num_channel  number of interleaved bytes per pixel
//   channel      index of the channel to blur; ignored if the image has no
//                such channel
//   copy_A       non-zero: also copy channel A of this pixel from `in` to
//                `out`; ignored if the image has no channel A
//
// The average is taken over the (2*BLUR_SIZE+1)^2 window centred on the pixel,
// skipping window positions outside the image.
__global__ void blurKernel(unsigned char* in, unsigned char* out, int width, int height, int num_channel, int channel, int copy_A) {

  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  if(col >= width || row >= height) {
    return;
  }

  // 64-bit offsets: width * height * num_channel can exceed the range of int.
  const size_t rowStride = (size_t)width * num_channel;
  const size_t pixel = (size_t)row * rowStride + (size_t)col * num_channel;

  // Without this guard an image with fewer than four channels would have the
  // "alpha" copy land in the next pixel, or past the end of the buffer.
  if (copy_A && A < num_channel) {
    out[pixel + A] = in[pixel + A];
  }

  // A channel index the image does not have would address a neighbouring pixel.
  if (channel < 0 || channel >= num_channel) {
    return;
  }

  int pixVal = 0;
  int pixels = 0;
  for(int blurRow = -BLUR_SIZE; blurRow < BLUR_SIZE + 1; ++blurRow) {
    for(int blurCol = -BLUR_SIZE; blurCol < BLUR_SIZE + 1; ++blurCol) {
      int curRow = row + blurRow;
      int curCol = col + blurCol;
      if(curRow > -1 && curRow < height && curCol > -1 && curCol < width) {
        pixVal += in[(size_t)curRow * rowStride + (size_t)curCol * num_channel + channel];
        pixels++;
      }
    }
  }
  // pixels is at least 1 because the centre pixel is always inside the image.
  out[pixel + channel] = (unsigned char)(pixVal/pixels);
}

// Reports a failed CUDA runtime call on stderr; returns true when `err` is cudaSuccess.
static bool cudaOk(cudaError_t err, const char* what) {
  if (err == cudaSuccess) {
    return true;
  }
  std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
  return false;
}

// Loads "image4.png", box-blurs its colour channels on the GPU and writes the
// result to "output_stbimage.png". Returns 0 on success and 1 if loading, any
// CUDA call or writing fails.
int main() {

  int width = 0, height = 0, n = 0;
  unsigned char *image = stbi_load("image4.png",&width,&height,&n,0);
  if (image == NULL) {
    std::cerr << "could not load image4.png" << std::endl;
    return 1;
  }

  const size_t imageBytes = (size_t)width * height * n * sizeof(unsigned char);
  // stb_image layouts: 1 = grey, 2 = grey+alpha, 3 = RGB, 4 = RGBA. Only the
  // colour channels are blurred; a trailing alpha channel is kept as loaded.
  const int colorChannels = (n >= 3) ? 3 : 1;

  unsigned char* Dev_Input_Image = NULL;
  unsigned char* Dev_Output_Image = NULL;

  // && short-circuits, so nothing runs after the first failing call.
  bool ok = cudaOk(cudaMalloc((void**)&Dev_Input_Image, imageBytes), "cudaMalloc(input)")
         && cudaOk(cudaMalloc((void**)&Dev_Output_Image, imageBytes), "cudaMalloc(output)")
         && cudaOk(cudaMemcpy(Dev_Input_Image, image, imageBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)")
         // Seed the output with the input so every byte the kernels do not
         // write (the alpha channel) has a defined value in the saved file.
         && cudaOk(cudaMemcpy(Dev_Output_Image, Dev_Input_Image, imageBytes, cudaMemcpyDeviceToDevice), "cudaMemcpy(device to device)");

  if (ok) {
    //kernel call
    dim3 blockSize(16, 16, 1);
    // Round up so partial tiles at the right/bottom edges are covered and the
    // grid is never empty for images smaller than one block.
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, 1);
    for (int channel = 0; channel < colorChannels; ++channel) {
      // copy_A is 0: alpha was already carried over by the device-to-device copy.
      blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height, n, channel, 0);
    }

    ok = cudaOk(cudaGetLastError(), "blurKernel launch")
      && cudaOk(cudaDeviceSynchronize(), "blurKernel execution")
      && cudaOk(cudaMemcpy(image, Dev_Output_Image, imageBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
  }

  // cudaFree accepts a null pointer, so this is safe after a failed allocation.
  cudaFree(Dev_Input_Image);
  cudaFree(Dev_Output_Image);

  if (ok && !stbi_write_png("output_stbimage.png", width, height, n, image, width * n)) {
    std::cerr << "could not write output_stbimage.png" << std::endl;
    ok = false;
  }

  stbi_image_free(image);

  return ok ? 0 : 1;
}
#包括“lodepng.h”
#定义机顶盒映像的实现
#定义机顶盒映像写入实现
#包括“stb_image.h”
#包括“stb_image_write.h”
#包括
#定义模糊_大小7
#定义r0
#定义g1
#定义b2
#定义一个3
__全局函数(无符号字符*输入,无符号字符*输出,整数宽度,整数高度,整数个通道,整数通道,整数拷贝A){
int col=blockIdx.x*blockDim.x+threadIdx.x;
int row=blockIdx.y*blockDim.y+threadIdx.y;
if(列<宽度&行<高度){
int-pixVal=0;
整数像素=0;
如果(复印件A)
out[row*width*num_channel+col*num_channel+A]=in[row*width*num_channel+col*num_channel+A];
对于(int blurRow=-BLUR\u SIZE;blurRow-1&&curRow-1&&curCol
并使用以下命令编译和运行它:

我得到的输出图片如下所示:

#include "lodepng.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image.h"
#include "stb_image_write.h"
#include <iostream>
#define BLUR_SIZE 7
#define R 0
#define G 1
#define B 2
#define A 3

// Box-blurs a single channel of an interleaved 8-bit image and optionally
// carries the alpha channel over unchanged.
//
// Launch layout: a 2D grid of 2D blocks, one thread per pixel. Threads whose
// (col, row) falls outside the image return immediately, so the grid may
// overshoot the image size.
//
// Parameters:
//   in, out      device-accessible buffers of width * height * num_channel bytes
//   width/height image size in pixels
//   num_channel  number of interleaved bytes per pixel
//   channel      index of the channel to blur; ignored if the image has no
//                such channel
//   copy_A       non-zero: also copy channel A of this pixel from `in` to
//                `out`; ignored if the image has no channel A
//
// The average is taken over the (2*BLUR_SIZE+1)^2 window centred on the pixel,
// skipping window positions outside the image.
__global__ void blurKernel(unsigned char* in, unsigned char* out, int width, int height, int num_channel, int channel, int copy_A) {

  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  if(col >= width || row >= height) {
    return;
  }

  // 64-bit offsets: width * height * num_channel can exceed the range of int.
  const size_t rowStride = (size_t)width * num_channel;
  const size_t pixel = (size_t)row * rowStride + (size_t)col * num_channel;

  // Without this guard an image with fewer than four channels would have the
  // "alpha" copy land in the next pixel, or past the end of the buffer.
  if (copy_A && A < num_channel) {
    out[pixel + A] = in[pixel + A];
  }

  // A channel index the image does not have would address a neighbouring pixel.
  if (channel < 0 || channel >= num_channel) {
    return;
  }

  int pixVal = 0;
  int pixels = 0;
  for(int blurRow = -BLUR_SIZE; blurRow < BLUR_SIZE + 1; ++blurRow) {
    for(int blurCol = -BLUR_SIZE; blurCol < BLUR_SIZE + 1; ++blurCol) {
      int curRow = row + blurRow;
      int curCol = col + blurCol;
      if(curRow > -1 && curRow < height && curCol > -1 && curCol < width) {
        pixVal += in[(size_t)curRow * rowStride + (size_t)curCol * num_channel + channel];
        pixels++;
      }
    }
  }
  // pixels is at least 1 because the centre pixel is always inside the image.
  out[pixel + channel] = (unsigned char)(pixVal/pixels);
}

// Reports a failed CUDA runtime call on stderr; returns true when `err` is cudaSuccess.
static bool cudaOk(cudaError_t err, const char* what) {
  if (err == cudaSuccess) {
    return true;
  }
  std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
  return false;
}

// Loads "image4.png", box-blurs its colour channels on the GPU and writes the
// result to "output_stbimage.png". Returns 0 on success and 1 if loading, any
// CUDA call or writing fails.
int main() {

  int width = 0, height = 0, n = 0;
  unsigned char *image = stbi_load("image4.png",&width,&height,&n,0);
  if (image == NULL) {
    std::cerr << "could not load image4.png" << std::endl;
    return 1;
  }

  const size_t imageBytes = (size_t)width * height * n * sizeof(unsigned char);
  // stb_image layouts: 1 = grey, 2 = grey+alpha, 3 = RGB, 4 = RGBA. Only the
  // colour channels are blurred; a trailing alpha channel is kept as loaded.
  const int colorChannels = (n >= 3) ? 3 : 1;

  unsigned char* Dev_Input_Image = NULL;
  unsigned char* Dev_Output_Image = NULL;

  // && short-circuits, so nothing runs after the first failing call.
  bool ok = cudaOk(cudaMalloc((void**)&Dev_Input_Image, imageBytes), "cudaMalloc(input)")
         && cudaOk(cudaMalloc((void**)&Dev_Output_Image, imageBytes), "cudaMalloc(output)")
         && cudaOk(cudaMemcpy(Dev_Input_Image, image, imageBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)")
         // Seed the output with the input so every byte the kernels do not
         // write (the alpha channel) has a defined value in the saved file.
         && cudaOk(cudaMemcpy(Dev_Output_Image, Dev_Input_Image, imageBytes, cudaMemcpyDeviceToDevice), "cudaMemcpy(device to device)");

  if (ok) {
    //kernel call
    dim3 blockSize(16, 16, 1);
    // Round up so partial tiles at the right/bottom edges are covered and the
    // grid is never empty for images smaller than one block.
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, 1);
    for (int channel = 0; channel < colorChannels; ++channel) {
      // copy_A is 0: alpha was already carried over by the device-to-device copy.
      blurKernel <<<gridSize, blockSize>>>(Dev_Input_Image, Dev_Output_Image, width, height, n, channel, 0);
    }

    ok = cudaOk(cudaGetLastError(), "blurKernel launch")
      && cudaOk(cudaDeviceSynchronize(), "blurKernel execution")
      && cudaOk(cudaMemcpy(image, Dev_Output_Image, imageBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
  }

  // cudaFree accepts a null pointer, so this is safe after a failed allocation.
  cudaFree(Dev_Input_Image);
  cudaFree(Dev_Output_Image);

  if (ok && !stbi_write_png("output_stbimage.png", width, height, n, image, width * n)) {
    std::cerr << "could not write output_stbimage.png" << std::endl;
    ok = false;
  }

  stbi_image_free(image);

  return ok ? 0 : 1;
}

内核中是否存在任何运行时错误?如果原始图像有4个通道,可能需要复制alpha通道。我没有遇到运行时错误,并且我还尝试过将alpha通道设置为最大值,以便让图像具有不透明的背景。代码看起来正常,即使存在小错误,它至少也应该产生一些输出。尝试将刚加载的图像直接保存到新文件并进行检查,然后在CUDA中只做从输入到输出的复制……