Image CUDA中值滤波器工作不正常

Image CUDA中值滤波器工作不正常,image,filter,cuda,median,Image,Filter,Cuda,Median,我自学了CUDA,并尝试为图像处理实现一个简单的中值滤波器。这就是我想到的,但我似乎无法从中得到任何好的效果。例如,输出的图片相对来说没有噪音,但是图片的饱和度似乎更高,当我尝试维基百科上的这张a的图片时,出于某种原因,他的鼻子变绿了。我变得非常沮丧,想不出任何新的想法,所以如果有人能看到代码中的问题,我将不胜感激。谢谢 这是内核函数: __global__ void median_filter(int *input, int *output, int IMAGE_W, int IMAGE_H)

我自学了CUDA,并尝试为图像处理实现一个简单的中值滤波器。这就是我想到的,但我似乎无法从中得到任何好的效果。例如,输出的图片相对来说没有噪音,但是图片的饱和度似乎更高,当我尝试维基百科上的这张a的图片时,出于某种原因,他的鼻子变绿了。我变得非常沮丧,想不出任何新的想法,所以如果有人能看到代码中的问题,我将不胜感激。谢谢

这是内核函数:

__global__ void median_filter(int *input, int *output, int IMAGE_W, int IMAGE_H){

    __shared__ float window[BLOCK_W*BLOCK_H][9];

    int x, y, tid;
    int i, j, iMin, temp;

    x = blockIdx.x*blockDim.x + threadIdx.x;
    y = blockIdx.y*blockDim.y + threadIdx.y;
    tid = threadIdx.y*blockDim.y + threadIdx.x;

    if(x>=IMAGE_W && y>=IMAGE_H)
        return;

    /* setting 3x3 window elements for median */
    if(y==0 && x==0)
        window[tid][0] = input[y*IMAGE_W+x];
    else if(y==0 && x!=0)
        window[tid][0] = input[y*IMAGE_W+x-1];
    else if(y!=0 && x==0)
        window[tid][0] = input[(y-1)*IMAGE_W+x];
    else
        window[tid][0] = input[(y-1)*IMAGE_W+x-1];

    window[tid][1] = (y==0)?input[y*IMAGE_W+x]:input[(y-1)*IMAGE_W+x];

    if(y==0 && x==IMAGE_W-1)
        window[tid][2] = input[y*IMAGE_W+x];
    else if(y!=0 && x==IMAGE_W-1)
        window[tid][2] = input[(y-1)*IMAGE_W+x];
    else if(y==0 && x!=IMAGE_W-1)
        window[tid][2] = input[(y-1)*IMAGE_W+x+1];
    else
        window[tid][2] = input[(y-1)*IMAGE_W+x+1];

    window[tid][3] = (x==0)?input[y*IMAGE_W+x]:input[y*IMAGE_W+x-1];
    window[tid][4] = input[y*IMAGE_W+x];
    window[tid][5] = (x==IMAGE_W-1)?input[y*IMAGE_W+x]:input[y*IMAGE_W+x+1];

    if(y==IMAGE_H-1 && x==0)
        window[tid][6] = input[y*IMAGE_W+x];
    else if(y!=IMAGE_H-1 && x==0)
        window[tid][6] = input[(y+1)*IMAGE_W+x];
    else if(y==IMAGE_H-1 && x!=0)
        window[tid][6] = input[y*IMAGE_W+x-1];
    else
        window[tid][6] = input[(y+1)*IMAGE_W+x-1];

    window[tid][7] = (y==IMAGE_H-1)?input[y*IMAGE_W+x]:input[(y+1)*IMAGE_W+x];

    if(y==IMAGE_H-1 && x==IMAGE_W-1)
        window[tid][8] = input[y*IMAGE_W+x];
    else if(y!=IMAGE_H-1 && x==IMAGE_W-1)
        window[tid][8] = input[(y+1)*IMAGE_W+x];
    else if(y==IMAGE_H-1 && x!=IMAGE_W-1)
        window[tid][8] = input[y*IMAGE_W+x+1];
    else
        window[tid][8] = input[(y+1)*IMAGE_W+x+1];

    __syncthreads();

    /* sorting window to find median */
    for(j=0; j<8; j++){
        iMin = j;
        for(i=j+1; i<9; i++){
            if(window[tid][i] < window[tid][iMin]){
                iMin = i;
            }
        }
        if(iMin != j){
            temp = window[tid][iMin];
            window[tid][iMin] = window[tid][j];
            window[tid][j] = temp;
        }
        __syncthreads();
    }

    output[y*IMAGE_W + x] = window[tid][4];
}
int main(){
    /*loading picture*/
    char picture[50] = "before.bmp";

    FILE *image = fopen(picture, "rb");

    if(image == NULL)
    {
        printf("Load picture error!\n");
        system("pause");
        exit(1);
    }

    BITMAPFILEHEADER bmpFHeader;
    BITMAPINFOHEADER bmpIHeader;
    fread(&bmpFHeader, sizeof(BITMAPFILEHEADER), 1, image);
    fread(&bmpIHeader, sizeof(BITMAPINFOHEADER), 1, image);

    int imgWidth = bmpIHeader.biWidth;
    int imgHeight = bmpIHeader.biHeight;

    int img_size = imgWidth * imgHeight * sizeof(int);

    int * imgeRedChannel_x = (int *)malloc(img_size);
    int * imgeGreenChannel_x = (int *)malloc(img_size);
    int * imgeBlueChannel_x = (int *)malloc(img_size);

    int * deviceInputRed;
    int * deviceInputGreen;
    int * deviceInputBlue;

    int * deviceOutputRd;
    int * deviceOutputGreen;
    int * deviceOutputBlue;

    for(int i = imgHeight-1; i>=0; i--)
    {
        for(int j = 0; j<imgWidth; j++)
        {

                fread(&(imgeGreenChannel_x[i * (imgWidth) + j]), sizeof(unsigned char), 1, image);
                fread(&(imgeBlueChannel_x[i * (imgWidth) + j]), sizeof(unsigned char), 1, image);
                fread(&(imgeRedChannel_x[i * (imgWidth) + j]), sizeof(unsigned char), 1, image);

        }
    }

    cudaMalloc((void **) &deviceInputRed, sizeof(int) * imgHeight * imgWidth);
    cudaMalloc((void **) &deviceInputBlue, sizeof(int) * imgHeight * imgWidth);
    cudaMalloc((void **) &deviceInputGreen, sizeof(int) * imgHeight * imgWidth);
    cudaMalloc((void **) &deviceOutputRd, sizeof(int) * imgHeight * imgWidth);
    cudaMalloc((void **) &deviceOutputBlue, sizeof(int) * imgHeight * imgWidth);
    cudaMalloc((void **) &deviceOutputGreen, sizeof(int) * imgHeight * imgWidth);

    int dimA = imgWidth*imgHeight;
    int numThreadsPerBlock = 256;
    int numBlocks = dimA / numThreadsPerBlock;
    int sharedMemSize = numThreadsPerBlock*sizeof(int);

    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);

    cudaMemcpy(deviceInputRed,imgeRedChannel_x,sizeof(int) * imgHeight * imgWidth,cudaMemcpyHostToDevice);
    checkCUDAError("memcpy h-d r");
    cudaMemcpy(deviceInputGreen,imgeGreenChannel_x,sizeof(int) * imgHeight * imgWidth,cudaMemcpyHostToDevice);
    checkCUDAError("memcpy h-d g");
    cudaMemcpy(deviceInputBlue,imgeBlueChannel_x,sizeof(int) * imgHeight * imgWidth,cudaMemcpyHostToDevice);
    checkCUDAError("memcpy h-d b");

    median_filter<<< dimGrid , dimBlock, sharedMemSize>>> (deviceInputRed, deviceOutputRd, imgHeight, imgWidth);
    checkCUDAError("kernel invocation r");
    median_filter<<< dimGrid , dimBlock, sharedMemSize>>> (deviceInputGreen, deviceOutputGreen, imgHeight, imgWidth);
    checkCUDAError("kernel invocation g");
    median_filter<<< dimGrid , dimBlock, sharedMemSize>>> (deviceInputBlue, deviceOutputBlue, imgHeight, imgWidth);
    checkCUDAError("kernel invocation b");

    cudaMemcpy(imgeRedChannel_x, deviceOutputRd, imgHeight * imgWidth * sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy d-h r");
    cudaMemcpy(imgeGreenChannel_x, deviceOutputGreen, imgHeight * imgWidth * sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy d-h g");
    cudaMemcpy(imgeBlueChannel_x, deviceOutputBlue, imgHeight * imgWidth * sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy d-h b");

    cudaFree(deviceInputRed);
    cudaFree(deviceOutputRd);
    cudaFree(deviceInputGreen);
    cudaFree(deviceOutputGreen);
    cudaFree(deviceInputBlue);
    cudaFree(deviceOutputBlue);

    /*saving new picture*/
    fclose(image);

    char title[50]="after";
    strcat(title, ".bmp");

    remove(title);
    image = fopen(title,"wb");

    fwrite(&bmpFHeader, sizeof(BITMAPFILEHEADER), 1, image);
    fwrite(&bmpIHeader, sizeof(BITMAPINFOHEADER), 1, image);

    for(int i = imgHeight-1; i>=0; i--)
    {

        for(int j = 0; j<imgWidth; j++)
        {
            int b = imgeBlueChannel_x[i * (imgWidth) + j];
            int g = imgeGreenChannel_x[i * (imgWidth) + j];
            int r = imgeRedChannel_x[i * (imgWidth) + j]; 

            if(b>255)b=255;
            if(g>255)g=255;
            if(r>255)r=255;



            fwrite(&g, sizeof(unsigned char), 1, image);
            fwrite(&b, sizeof(unsigned char), 1, image);
            fwrite(&r, sizeof(unsigned char), 1, image);
        }
    }

    printf("Success!\n");
    fclose(image);
    system("pause");
    return 0;
}     
\uuuu全局\uuuu无效中值\u过滤器(int*输入,int*输出,int图像,int图像){
__共享浮动窗口[块W*块H][9];
整数x,y,tid;
内部i,j,iMin,temp;
x=块IDX.x*块DIM.x+线程IDX.x;
y=blockIdx.y*blockDim.y+threadIdx.y;
tid=threadIdx.y*blockDim.y+threadIdx.x;
如果(x>=图像W&y>=图像H)
返回;
/*为中间带设置3x3个窗口元素*/
如果(y==0&&x==0)
窗口[tid][0]=输入[y*图像W+x];
如果(y==0&&x!=0),则为else
窗口[tid][0]=输入[y*图像W+x-1];
如果(y!=0&&x==0),则为else
窗口[tid][0]=输入[(y-1)*图像W+x];
其他的
窗口[tid][0]=输入[(y-1)*图像W+x-1];
窗口[tid][1]=(y==0)?输入[y*图像W+x]:输入[(y-1)*图像W+x];
如果(y==0&&x==IMAGE_W-1)
窗口[tid][2]=输入[y*图像W+x];
如果(y!=0&&x==IMAGE\u W-1),则为else
窗口[tid][2]=输入[(y-1)*图像W+x];
如果(y==0&&x!=IMAGE\u W-1)
窗口[tid][2]=输入[(y-1)*图像W+x+1];
其他的
窗口[tid][2]=输入[(y-1)*图像W+x+1];
窗口[tid][3]=(x==0)?输入[y*图像W+x]:输入[y*图像W+x-1];
窗口[tid][4]=输入[y*图像W+x];
窗口[tid][5]=(x==图像W-1)?输入[y*图像W+x]:输入[y*图像W+x+1];
如果(y==IMAGE_H-1&&x==0)
窗口[tid][6]=输入[y*图像W+x];
如果(y!=IMAGE_H-1&&x==0),则为else
窗口[tid][6]=输入[(y+1)*图像W+x];
如果(y==IMAGE_H-1&&x!=0),则为else
窗口[tid][6]=输入[y*图像W+x-1];
其他的
窗口[tid][6]=输入[(y+1)*图像W+x-1];
窗口[tid][7]=(y==图像\U H-1)?输入[y*图像\U W+x]:输入[(y+1)*图像\U W+x];
如果(y==IMAGE\uh-1&&x==IMAGE\uw-1)
窗口[tid][8]=输入[y*图像W+x];
如果(y!=IMAGE\uh-1&&x==IMAGE\uw-1)
窗口[tid][8]=输入[(y+1)*图像W+x];
如果(y==图像H-1&&x!=图像W-1)
窗口[tid][8]=输入[y*图像W+x+1];
其他的
窗口[tid][8]=输入[(y+1)*图像W+x+1];
__同步线程();
/*排序窗口以查找中间值*/
对于(j=0;j(设备输入绿色、设备输出绿色、imgHeight、imgWidth);
checkCUDAError(“内核调用g”);
中值滤波器>(设备输入蓝色、设备输出蓝色、imgHeight、imgWidth);
checkCUDAError(“内核调用b”);
cudaMemcpy(imgeRedChannel_x,设备输出,imghight*imgWidth*sizeof(int),cudaMemcpyDeviceToHost);
检查CUDAERROR(“memcpy d-h r”);
cudaMemcpy(imgeGreenChannel_x,设备输出绿色,imghight*imgWidth*sizeof(int),cudamemcpydevicetoost);
检查CUDAERROR(“memcpy d-h g”);
cudaMemcpy(imgeBlueChannel_x,deviceOutputBlue,imgHeight*imgWidth*sizeof(int),cudaMemcpyDeviceToHost);
检查CUDAERROR(“memcpy d-h b”);
cudaFree(设备输入);
cudaFree(设备输出);
cudaFree(设备输入绿色);
cudaFree(设备输出绿色);
cudaFree(deviceInputBlue);
cudaFree(设备输出蓝色);
/*保存新图片*/
fclose(图像);
字符标题[50]=“之后”;
strcat(标题“.bmp”);
删除(标题);
图像=fopen(标题“wb”);
fwrite(&bmpFHeader,sizeof(BITMAPFILEHEADER),1,image);
fwrite(&bmpIHeader,sizeof(BitMapInfo头文件),1,图像);
对于(int i=imgHeight-1;i>=0;i--)
{
对于(intj=0;j255)b=255;
如果(g>255)g=255;
如果(r>255)r=255;
fwrite&g,sizeof(无符号字符),1,image;
fwrite&b,sizeof(无符号字符),1,image;
fwrite&r,sizeof(无符号字符),1,image;
}
}
printf(“成功!\n”);
fclose(图像);
系统(“暂停”);
返回0;
}     

鼻子变绿意味着代码中有溢出,但这很奇怪,因为中值滤波器永远不会产生溢出。你肯定有一个乱七八糟的代码,内核没有多大意义,特别是你正在做的大量额外工作

在非线性过滤器中,我建议您首先尝试实现最小或最大过滤器,看看它们是否有效。以下是CUDA库中的最大过滤器的工作代码。您的中值内核应该与此相同:

__global__ void median_8u_c3( unsigned char* out,
                              unsigned int width,
                              unsigned int widthStep,
                              unsigned int height){

int xIndex = blockIdx.x*BLOCK_SIZE + threadIdx.x;
int  yIndex = blockIdx.y*BLOCK_SIZE + threadIdx.y;
int tid = yIndex * widthStep + (3*xIndex);

if(xIndex>=width|| yIndex>=height) return;

int limitX = anchorX + fHeight - 1;
int limitY = anchorY + fWidth - 1;

unsigned char MAX_R = 0 , MAX_G = 0, MAX_B = 0;  


 // Instead of Max filter code in the for loops below, you can have median code
for(Cuvi32s i=anchorX ; i<= limitX; i++)
    for(Cuvi32s j=anchorY ; j<= limitY; j++)
    {
        MAX_R = (tex2D(tex8,3*(xIndex + i)  , yIndex + j) > MAX_R) ? tex2D(tex8,3*(xIndex + i)  , yIndex + j) : MAX_R;
        MAX_G = (tex2D(tex8,3*(xIndex + i)+1, yIndex + j) > MAX_G) ? tex2D(tex8,3*(xIndex + i)+1, yIndex + j) : MAX_G;
        MAX_B = (tex2D(tex8,3*(xIndex + i)+2, yIndex + j) > MAX_B) ? tex2D(tex8,3*(xIndex + i)+2, yIndex + j) : MAX_B;
    }


out[tid] = MAX_R;
out[tid + 1] = MAX_G;
out[tid + 2] = MAX_B;
}
\uuuuu全局\uuuuuu无效中位数\u8u\uc3(无符号字符*out,
无符号整数宽度,
无符号整数步长,
无符号整数高度){
int xIndex=blockIdx.x*BLOCK_SIZE+threadIdx.x;
int yIndex=blockIdx.y*BLOCK_SIZE+threadIdx.y;
int-tid=yIndex*widthStep+(3*xIndex);
如果(xIndex>=宽度| yIndex>=高度)返回;
int limitX=主播+高度-1;
int limitY=anchorY+fWidth-1;
无符号字符MAX_R=0,MAX_G=0,MAX_B=0;
//您可以使用中值代码,而不是下面for循环中的Max filter code
对于(Cuvi32s i=anchorX;i MAX_G)?tex2D(tex8,3*(xIndex+i)+1,yIndex+j):MAX_G;
MAX_B=(tex2D(tex8,3*(xIndex+i)+2,yIndex+j)>MAX_B)?tex2D(tex8,3*(xIndex+i)+2,yIndex+j):MAX_B;
}
out[tid]=最大值;
out[tid+1]=最大值;
out[tid+2]=最大值;
}

注意:我使用的是来自纹理的输入。

鼻子变绿意味着您的代码中有溢出,但这很奇怪,因为中值滤波器永远不会生成溢出。您肯定有一个混乱的代码,内核没有多大意义,尤其是您正在做的大量额外工作

在非线性过滤器中,我建议您首先尝试实现最小或最大过滤器,看看它们是否有效