C++ cuda多图像侵蚀不起作用_C++_Image Processing_Cuda

C++ cuda多图像侵蚀不起作用

c++ image-processing cuda

C++ cuda多图像侵蚀不起作用,c++,image-processing,cuda,C++,Image Processing,Cuda,我试图用cuda实现多个黑色（0）和白色（255）图像腐蚀，我使用一个方形（5x5）结构元素。我实现的内核采用无符号字符数组缓冲区，其中存储nImg图像200X200 px。为了允许同时侵蚀多个图像，我制作了一个具有3D结构的网格：每个块都有拉伸的尺寸（5x5）网格具有高度=图像高度/blockDim.y，宽度=图像宽度/blockDim.x，z=nImg 我试着实现它，扩展它问题是，如果我将一个线程块的像素存储在块的线程之间的共享缓冲区< /强>共享的像素；为了允许快速访问内存，该

我试图用cuda实现多个黑色（0）和白色（255）图像腐蚀，我使用一个方形（5x5）结构元素。我实现的内核采用无符号字符数组缓冲区，其中存储nImg图像200X200 px。为了允许同时侵蚀多个图像，我制作了一个具有3D结构的网格：

每个块都有拉伸的尺寸（5x5）

网格具有高度=图像高度/blockDim.y，宽度=图像宽度/blockDim.x，z=nImg

我试着实现它，扩展它

问题是，如果我将一个线程块的像素存储在块的线程之间的<强>共享缓冲区< /强>共享的像素；为了允许快速访问内存，该算法无法正常工作。我试图更改我犯错误的bindex，但我找不到解决方案
有什么建议吗
这是我的密码：

//strel size #define STREL_W 5 #define STREL_H 5 // distance from the cente of strel to strel width or height #define R (STREL_H/2) //size of the 2D region that each block consider i.e all the neighborns that each thread in a block consider #define BLOCK_W (STREL_W+(2*R)) #define BLOCK_H (STREL_H+(2*R)) __global__ void erode_multiple_img_SM(unsigned char * buffer_in, unsigned char * buffer_out, int w, int h ){ //array stored in shared memory,that contain all pixel neighborns that each thread in a block consider __shared__ unsigned char fast_acc_arr[BLOCK_W*BLOCK_H]; // map thread in a 3D structure int col = blockIdx.x * STREL_W + threadIdx.x -R ; int row = blockIdx.y * STREL_H + threadIdx.y -R ; int plane = blockIdx.z * blockDim.z + threadIdx.z; // check if a foreground px of strel is not contain in a region of the image with size of strel (if only one px is not contain the image is eroded) bool is_contain = true; // clamp to edge of image col = max(0,col); col = min(col,w-1); row = max(0,row); row = min(row,h-1); //map each thread in one dim coord to map 3D structure(grid) with image buffer(1D) unsigned int index = (plane * h * w) + (row * w) + col; unsigned int bindex = threadIdx.y * blockDim.y + threadIdx.x; //each thread copy its pixel of the block to shared memory (shared with thread of a block) fast_acc_arr[bindex] = buffer_in[index]; __syncthreads(); //the strel must be contain in image, thread.x and thread.y are the coords of the center of the mask that correspond to strel in image, and it must be contain in image if((threadIdx.x >= R) && (threadIdx.x < BLOCK_W-R) && (threadIdx.y >= R) && (threadIdx.y <BLOCK_H-R)){ for(int dy=-R; dy<=R; dy++){ if(is_contain == false) break; for (int dx = -R ; dx <= R; dx++) { //if only one element in mask is different from the value of strel el --> the strel is not contain in the mask --> the center of the mask is eroded (and it's no necessary to consider the other el of the mask this is the motivation of the break) if (fast_acc_arr[bindex + (dy * blockDim.x) + dx ] != 255 ){ buffer_out[index ] = 0; is_contain = false; break; } } } // if the strel is contain into the image the the center is not eroded if(is_contain == true) buffer_out[index] = 255; } }
我的内核调用：

erode_multiple_img_SM<<<grid,block>>>(dimage_src,dimage_dst,200,200);
输入：
输出：

似乎停机坪的像素没有复制到输出缓冲区中
您可能需要阅读以下链接，以获得关于如何实现图像卷积CUDA内核函数的更详细描述和更好的示例代码

基本上，使用大小为（5 x 5）的卷积滤波器并不意味着将线程块的大小设置为（5 x 5）

通常，对于不可分离卷积，可以使用大小为（16 x 16）的线程块来计算输出图像上的（16 x 16）像素块。要实现这一点，您需要协同使用（16 x 16）线程将（（2+16+2）x（2+16+2））像素块从输入图像读取到共享内存。
请改进问题中代码的格式。这是很难阅读张贴。我评论它和缩进，我希望不难阅读与编辑我已经实现了没有共享内存的算法，它工作正常，我会张贴它。谢谢你的回答。@userfi所以现在你知道你在多图像上没有问题了。这只是使用共享内存的问题。是的，但我解决不了。我将阅读您的链接以了解共享内存映射。我尝试按照您的建议进行操作，但现在似乎围裙中的像素没有在输出img中复制有任何建议吗？
erode_multiple_img_SM<<<grid,block>>>(dimage_src,dimage_dst,200,200);

__global__ void erode_multiple_img(unsigned char * buffer_in, unsigned char * buffer_out, int w,int h ){ int col = blockIdx.x * blockDim.x + threadIdx.x; int row = blockIdx.y * blockDim.y + threadIdx.y; int plane = blockIdx.z * blockDim.z +threadIdx.z; bool is_contain = true; col = max(0,col); col = min(col,w-1); row = max(0,row); row = min(row,h-1); for(int dy=-STREL_H/2; dy<=STREL_H/2; dy++){ if(is_contain == false) break; for (int dx = -STREL_W/2 ; dx <= STREL_W/2; dx++) { if (buffer_in[(plane * h * w) +( row + dy) * w + (col + dx) ] !=255 ){ buffer_out[(plane * h * w) + row * w + col ] = 0; is_contain = false; break; } } } if(is_contain == true) buffer_out[(plane * h * w) + row * w +col ] = 255; }

#define STREL_SIZE 5 #define TILE_W 16 #define TILE_H 16 #define R (STREL_H/2) #define BLOCK_W (TILE_W+(2*R)) #define BLOCK_H (TILE_H+(2*R)) __global__ void erode_multiple_img_SM_v2(unsigned char * buffer_in, unsigned char * buffer_out, int w,int h ){ // Data cache: threadIdx.x , threadIdx.y __shared__ unsigned char data[TILE_W +STREL_SIZE ][TILE_W +STREL_SIZE ]; // global mem address of this thread int col = blockIdx.x * blockDim.x + threadIdx.x; int row = blockIdx.y * blockDim.y + threadIdx.y; int plane = blockIdx.z * blockDim.z +threadIdx.z; int gLoc = (plane*h/w)+ row*w +col; bool is_contain = true; // load cache (32x32 shared memory, 16x16 threads blocks) // each threads loads four values from global memory into shared mem int x, y; // image based coordinate if((col<w)&&(row<h)) { data[threadIdx.x][threadIdx.y]=buffer_in[gLoc]; if (threadIdx.y > (h-STREL_SIZE)) data[threadIdx.x][threadIdx.y + STREL_SIZE]=buffer_in[gLoc + STREL_SIZE]; if (threadIdx.x >(w-STREL_SIZE)) data[threadIdx.x + STREL_SIZE][threadIdx.y]=buffer_in[gLoc+STREL_SIZE]; if ((threadIdx.x >(w-STREL_SIZE)) && (threadIdx.y > (h-STREL_SIZE))) data[threadIdx.x+STREL_SIZE][threadIdx.y+STREL_SIZE] = buffer_in[gLoc+2*STREL_SIZE]; //wait for all threads to finish read __syncthreads(); //buffer_out[gLoc] = data[threadIdx.x][threadIdx.y]; unsigned char min_value = 255; for(x=0;x<STREL_SIZE;x++){ for(y=0;y<STREL_SIZE;y++){ min_value = min( (data[threadIdx.x+x][threadIdx.y+y]) , min_value); } } buffer_out[gLoc]= min_value; }

dim3 block(16,16); dim3 grid(512/(block.x),512/(block.y),nImg);