Image processing 基于CUDA并行处理的彩色图像到灰度图像_Image Processing_Cuda_Parallel Processing_Gpu

Image processing 基于CUDA并行处理的彩色图像到灰度图像

image-processing cuda parallel-processing

Image processing 基于CUDA并行处理的彩色图像到灰度图像,image-processing,cuda,parallel-processing,gpu,Image Processing,Cuda,Parallel Processing,Gpu,我试图解决一个问题，我应该改变一个彩色图像到灰度图像。为此，我使用CUDA并行方法。我在GPU上调用的kerne代码如下所示 __global__ void rgba_to_greyscale(const uchar4* const rgbaImage, unsigned char* const greyImage, int numRows, int numCols) { int absolute_image_p

我试图解决一个问题，我应该改变一个彩色图像到灰度图像。为此，我使用CUDA并行方法。我在GPU上调用的kerne代码如下所示

__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
                   unsigned char* const greyImage,
                   int numRows, int numCols)
{
    int absolute_image_position_x = blockIdx.x;  
    int absolute_image_position_y = blockIdx.y;

  if ( absolute_image_position_x >= numCols ||
   absolute_image_position_y >= numRows )
 {
     return;
 }
uchar4 rgba = rgbaImage[absolute_image_position_x + absolute_image_position_y];
float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
greyImage[absolute_image_position_x + absolute_image_position_y] = channelSum;

}

void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage,
                            uchar4 * const d_rgbaImage,
                            unsigned char* const d_greyImage,
                            size_t numRows,
                            size_t numCols)
{
  //You must fill in the correct sizes for the blockSize and gridSize
  //currently only one block with one thread is being launched
  const dim3 blockSize(numCols/32, numCols/32 , 1);  //TODO
  const dim3 gridSize(numRows/12, numRows/12 , 1);  //TODO
  rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage,
                                             d_greyImage,
                                             numRows,
                                             numCols);

  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}

\u全局__
无效rgba_至_灰度（常数uchar4*常数rgbaImage，
无符号字符*常量灰度图像，
int numRows，int numCols）
{
int绝对图像位置=blockIdx.x；
int绝对图像位置y=blockIdx.y；
如果（绝对图像位置）=numCols||
绝对图像位置（y>=numRows）
{
返回；
}
uchar4 rgba=rgbaImage[绝对图像位置x+绝对图像位置y]；
浮点channelSum=.299f*rgba.x+.587f*rgba.y+.114f*rgba.z；
灰度图像[绝对图像位置x+绝对图像位置y]=信道和；
}
将您的图像作废为灰度（常数uchar4*常数h\u rgbaImage，
uchar4*恒量d_rgbaImage，
未签名字符*常量d_greyImage，
尺寸（单位：毫米），
尺寸（单位：毫米）
{
//必须为blockSize和gridSize填写正确的大小
//目前只启动一个带有一个线程的块
常量dim3块大小（numCols/32，numCols/32，1）；//TODO
常量dim3网格大小（numRows/12，numRows/12，1）；//TODO
rgba_至_灰度（d_rgbaImage，
d_greyImage，
努姆罗斯，
numCols）；
cudaDeviceSynchronize（）；检查CUDAErrors（cudaGetLastError（））；
}

我在第一个像素行中看到一行点。

我得到的错误是

libdc1394错误：初始化libdc1394失败
位置51处的差异超过5的公差
参考：255
GPU:0
有人能帮我吗？？？提前谢谢

libdc1394错误：初始化libdc1394失败

我不认为这是CUDA的问题。libdc1394是一个用于访问IEEE1394又称FireWire又称iLink视频设备（DV摄像机、Apple iSight摄像机）的库。该库没有正确初始化，因此无法获得有用的结果。基本上是NINO:Nonsens In Nonsens Out。

您正在运行以下数量的块和网格：

  const dim3 blockSize(numCols/32, numCols/32 , 1);  //TODO
  const dim3 gridSize(numRows/12, numRows/12 , 1);  //TODO

然而，您的内核代码中没有使用任何线程

 int absolute_image_position_x = blockIdx.x;  
 int absolute_image_position_y = blockIdx.y;

这样，图像的宽度可以分为列的

绝对图像位置x

部分，图像的高度可以分为行的

绝对图像位置y

部分。现在，它创建的每个横截面的框需要平行地更改/重画所有灰度图像的像素。足够完成一项任务：）

现在，自从我发布了这个问题以来，我一直在不断地解决这个问题
现在我意识到我最初的解决方案是错误的，为了纠正这个问题，应该做一些改进。
要做的更改：-

 1. absolute_position_x =(blockIdx.x * blockDim.x) + threadIdx.x;
 2. absolute_position_y = (blockIdx.y * blockDim.y) + threadIdx.y;

其次，

 1. const dim3 blockSize(24, 24, 1);
 2. const dim3 gridSize((numCols/16), (numRows/16) , 1);

在解决方案中，我们使用了numCols/16*numCols/16的网格
块大小为24*24

代码在0.040576毫秒内执行

@datenwolf：感谢您回答以上问题

绝对x&y图像位置的计算是完美的。但是当你需要访问彩色图像中的特定像素时，你不应该使用以下代码吗

uchar4 rgba = rgbaImage[absolute_image_position_x + (absolute_image_position_y * numCols)];

我也这么认为，当将它与您编写的用于在串行代码中执行相同问题的代码进行比较时。

请让我知道：）

您仍然应该有运行时问题-转换将不会给出正确的结果

台词：

uchar4 rgba=rgbaImage[绝对图像位置x+绝对图像位置y]

灰度图像[绝对图像位置x+绝对图像位置y]=信道和应改为：

uchar4 rgba=rgbaImage[绝对图像位置x+绝对图像位置y*numCols]

灰度图像[绝对图像位置x+绝对图像位置y*numCols]=信道和

\u全局__
无效rgba_至_灰度（常数uchar4*常数rgbaImage，
无符号字符*常量灰度图像，
int numRows，int numCols）
{
int rgba_x=blockIdx.x*blockDim.x+threadIdx.x；
int rgba_y=blockIdx.y*blockDim.y+threadIdx.y；
int pixel_pos=rgba_x+rgba_y*numCols；
uchar4 rgba=rgbaImage[pixel_pos]；
无符号字符灰度=（无符号字符）（0.299f*rgba.x+0.587f*rgba.y+0.114f*rgba.z）；
灰度图像[像素位置]=灰色；
}
将您的图像作废为灰度（常量uchar4*常量h\u rgbaImage，uchar4*常量d\u rgbaImage，
无符号字符*const d_greyImage，size_t numRows，size_t numCols）
{
//必须为blockSize和gridSize填写正确的大小
//目前只启动一个带有一个线程的块
常量dim3块大小（24，24，1）；//TODO
常量dim3网格大小（numCols/24+1，numRows/24+1，1）；//TODO
rgba_至_灰度（d_rgbaImage、d_greyImage、numRows、numCols）；
cudaDeviceSynchronize（）；检查CUDAErrors（cudaGetLastError（））；
}

相同的代码能够处理非标准输入大小的图像

int idx=blockDim.x*blockIdx.x+threadIdx.x;
int idy=blockDim.y*blockIdx.y+threadIdx.y;

uchar4 rgbcell=rgbaImage[idx*numCols+idy];

   greyImage[idx*numCols+idy]=0.299*rgbcell.x+0.587*rgbcell.y+0.114*rgbcell.z;


  }

  void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
                        unsigned char* const d_greyImage, size_t numRows, size_t numCols)
 {
 //You must fill in the correct sizes for the blockSize and gridSize
 //currently only one block with one thread is being launched

int totalpixels=numRows*numCols;
int factors[]={2,4,8,16,24,32};
vector<int> numbers(factors,factors+sizeof(factors)/sizeof(int));
int factor=1;

   while(!numbers.empty())
  {
 if(totalpixels%numbers.back()==0)
 {
     factor=numbers.back();
     break;
 }
   else
   {
  numbers.pop_back();
   }
 }



 const dim3 blockSize(factor, factor, 1);  //TODO
 const dim3 gridSize(numRows/factor+1, numCols/factor+1,1);  //TODO
 rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage,    numRows, numCols);

intidx=blockDim.x*blockIdx.x+threadIdx.x；
int-idy=blockDim.y*blockIdx.y+threadIdx.y；
uchar4 rgbcell=rgbaImage[idx*numCols+idy]；
灰度图像[idx*numCols+idy]=0.299*rgbcell.x+0.587*rgbcell.y+0.114*rgbcell.z；
}
将您的图像作废为灰度（常量uchar4*常量h\u rgbaImage，uchar4*常量d\u rgbaImage，
无符号字符*const d_greyImage，size_t numRows，size_t numCols）
{
//必须为blockSize和gridSize填写正确的大小
//目前只启动一个带有一个线程的块
int totalpixels=numRows*numCols；
整数因子[]={2,4,8,16,24,32}；
向量数（因子，因子+sizeof（因子）/sizeof（int））；
整数因子=1；
而（！数字
int idx=blockDim.x*blockIdx.x+threadIdx.x;
int idy=blockDim.y*blockIdx.y+threadIdx.y;

uchar4 rgbcell=rgbaImage[idx*numCols+idy];

   greyImage[idx*numCols+idy]=0.299*rgbcell.x+0.587*rgbcell.y+0.114*rgbcell.z;


  }

  void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
                        unsigned char* const d_greyImage, size_t numRows, size_t numCols)
 {
 //You must fill in the correct sizes for the blockSize and gridSize
 //currently only one block with one thread is being launched

int totalpixels=numRows*numCols;
int factors[]={2,4,8,16,24,32};
vector<int> numbers(factors,factors+sizeof(factors)/sizeof(int));
int factor=1;

   while(!numbers.empty())
  {
 if(totalpixels%numbers.back()==0)
 {
     factor=numbers.back();
     break;
 }
   else
   {
  numbers.pop_back();
   }
 }



 const dim3 blockSize(factor, factor, 1);  //TODO
 const dim3 gridSize(numRows/factor+1, numCols/factor+1,1);  //TODO
 rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage,    numRows, numCols);

__global__`
void rgba_to_greyscale(const uchar4* const rgbaImage,
               unsigned char* const greyImage,
               int numRows, int numCols)
{`

int pos_x = (blockIdx.x * blockDim.x) + threadIdx.x;
int pos_y = (blockIdx.y * blockDim.y) + threadIdx.y;
if(pos_x >= numCols || pos_y >= numRows)
    return;

uchar4 rgba = rgbaImage[pos_x + pos_y * numCols];
greyImage[pos_x + pos_y * numCols] = (.299f * rgba.x + .587f * rgba.y + .114f * rgba.z); 

}

const dim3 blockSize(16, 16, 1);
const dim3 gridSize((numCols%16) ? numCols/16+1 : numCols/16,
(numRows%16) ? numRows/16+1 : numRows/16, 1);

const dim3 blockSize(16, 16, 1);  //TODO
const dim3 gridSize( (numRows+15)/16, (numCols+15)/16, 1);  //TODO

int x = blockIdx.x * blockDim.x + threadIdx.x;  
int y = blockIdx.y * blockDim.y + threadIdx.y;

uchar4 rgba = rgbaImage[y*numRows + x];
float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
greyImage[y*numRows + x] = channelSum;