Image processing: converting a 2D Canny edge image to a 1D edge-pixel array in CUDA - strange behaviour

Tags: image-processing, cuda

I have a CUDA kernel which takes an edge image and processes it to create a smaller 1D array of the edge pixels. Now here is the strange behaviour. Every time I run the kernel and count the number of edge pixels in "d_nlist" (see the code near the printf), I get a larger pixel count each time, even when I use the same image and stop the program completely and re-run. Consequently, each run takes longer, until eventually it throws an uncaught exception.

My question is: how can I stop this from happening, so that I get consistent results each time I run the kernel?

My device is a GeForce 620.

Constants:

THREADS_X = 32
THREADS_Y = 4
PIXELS_PER_THREAD = 4
MAX_QUEUE_LENGTH = THREADS_X * THREADS_Y * PIXELS_PER_THREAD
IMG_WIDTH = 256
IMG_HEIGHT = 256
IMG_SIZE = IMG_WIDTH * IMG_HEIGHT
BLOCKS_X = IMG_WIDTH / (THREADS_X * PIXELS_PER_THREAD)
BLOCKS_Y = IMG_HEIGHT / THREADS_Y
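
With these values, BLOCKS_X = 256/(32*4) = 2 and BLOCKS_Y = 256/4 = 64, so the kernel launches a 2 x 64 grid of 32 x 4 = 128-thread blocks; each thread reads PIXELS_PER_THREAD = 4 pixels of one image row, and MAX_QUEUE_LENGTH = 32*4*4 = 512 gives each block exactly one shared-memory queue slot per pixel it touches.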

The kernel is as follows:

__global__ void convert2DEdgeImageTo1DArray( unsigned char const * const image, 
unsigned int* const list, int* const glob_index ) {

unsigned int const x = blockIdx.x  * THREADS_X*PIXELS_PER_THREAD + threadIdx.x;
unsigned int const y = blockIdx.y  * THREADS_Y + threadIdx.y;

volatile int qindex = -1;
volatile __shared__ int sh_qindex[THREADS_Y];
volatile __shared__ int sh_qstart[THREADS_Y];
sh_qindex[threadIdx.y] = -1;

// Start by making an array
volatile __shared__ unsigned int sh_queue[MAX_QUEUE_LENGTH];

// Fill the queue
for(int i=0; i<PIXELS_PER_THREAD; i++)
{
    int const xx = i*THREADS_X + x;

    // Read one image pixel from global memory
    unsigned char const pixel = image[y*IMG_WIDTH + xx];
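    // Pack (x, y) into a single 32-bit word: y in the high 16 bits, x in the low 16.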
    unsigned int  const queue_val = (y << 16) + xx;

    if(pixel)
    {           
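        // Threads whose pixel is set race for the next queue slot; each one
        // retries (bumping qindex) until its own value survives in shared memory.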
        do {
            qindex++;
            sh_qindex[threadIdx.y] = qindex;
            sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] = queue_val;
        } while (sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] != queue_val);
    }

    // Reload index from smem (last thread to write to smem will have updated it)
    qindex = sh_qindex[threadIdx.y];
}

// Let thread 0 reserve the space required in the global list
__syncthreads();
if(threadIdx.x == 0 && threadIdx.y == 0)
{
    // Find how many items are stored in each list
    int total_index = 0;
    #pragma unroll
    for(int i=0; i<THREADS_Y; i++)
    {
        sh_qstart[i] = total_index;
        total_index += (sh_qindex[i] + 1u);
    }

    // Calculate the offset in the global list
    unsigned int global_offset = atomicAdd(glob_index, total_index);
    #pragma unroll
    for(int i=0; i<THREADS_Y; i++)
    {
        sh_qstart[i] += global_offset;
    }
}
__syncthreads();

// Copy local queues to global queue
for(int i=0; i<=qindex; i+=THREADS_X)
{
    if(i + threadIdx.x > qindex)
        break;

    unsigned int qvalue = sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + i + threadIdx.x];
    list[sh_qstart[threadIdx.y] + i + threadIdx.x] = qvalue;
}
}
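
As an aside: because THREADS_X is 32, every y-row of the block is a single warp, so the do/while above is effectively a warp-synchronous retry loop for claiming queue slots. Here is a sketch of the more common way to write the same per-row append, using a shared-memory atomic counter (sh_count is a hypothetical addition, not the poster's code):

    // Sketch: claim queue slots with a per-row atomic counter instead of
    // the do/while retry loop.
    __shared__ int sh_count[THREADS_Y];
    if (threadIdx.x == 0)
        sh_count[threadIdx.y] = 0;
    __syncthreads();
    // ... then, inside the pixel loop:
    if (pixel)
    {
        int const slot = atomicAdd(&sh_count[threadIdx.y], 1);
        sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + slot] = queue_val;
    }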

As a preamble, let me suggest some troubleshooting steps that are useful:

  • Instrument your code with proper CUDA error checking (a minimal sketch of the pattern follows this list).
  • Run your code with cuda-memcheck, e.g.
    cuda-memcheck ./myapp
  • If you do the above steps, you'll find that your kernel is failing, and that the failures have to do with global writes of size 4. That will focus your attention on the last segment of your kernel, beginning with the comment
    // Copy local queues to global queue
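
For reference, a minimal sketch of the error-checking pattern meant in the first step (standard CUDA runtime API calls; myKernel, grid, block and args are placeholder names, not from the post):

    // Check for launch/configuration errors immediately after the launch,
    // then synchronize and check again for errors raised during execution.
    myKernel<<<grid, block>>>(args);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "launch error: %s\n", cudaGetErrorString(err));
    if ((err = cudaDeviceSynchronize()) != cudaSuccess)
        fprintf(stderr, "execution error: %s\n", cudaGetErrorString(err));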

Regarding your code, then, you have at least two problems:

  • The addressing/indexing in the final segment of your kernel, where you write the individual queues out to global memory, is messed up. I'm not going to try to debug that for you.
  • You are not initializing your d_nlist variable to zero. Therefore, when you do an atomicAdd to it, you are adding your values to a junk value, which tends to grow as you repeat the process (a cudaMemset alternative is sketched after the listing below).
  • Here is some code with those problems removed (I did not try to sort out your queue-copy code) and error checking added. It produces repeatable results for me:

    $ cat t216.cu
    #include <stdio.h>
    #include <stdlib.h>
    
    #define THREADS_X 32
    #define THREADS_Y 4
    #define PIXELS_PER_THREAD 4
    #define MAX_QUEUE_LENGTH (THREADS_X*THREADS_Y*PIXELS_PER_THREAD)
    #define IMG_WIDTH 256
    #define IMG_HEIGHT 256
    #define IMG_SIZE (IMG_WIDTH*IMG_HEIGHT)
    #define BLOCKS_X (IMG_WIDTH/(THREADS_X*PIXELS_PER_THREAD))
    #define BLOCKS_Y (IMG_HEIGHT/THREADS_Y)
    
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    __global__ void convert2DEdgeImageTo1DArray( unsigned char const * const image,
    unsigned int* const list, int* const glob_index ) {
    
    unsigned int const x = blockIdx.x  * THREADS_X*PIXELS_PER_THREAD + threadIdx.x;
    unsigned int const y = blockIdx.y  * THREADS_Y + threadIdx.y;
    
    volatile int qindex = -1;
    volatile __shared__ int sh_qindex[THREADS_Y];
    volatile __shared__ int sh_qstart[THREADS_Y];
    sh_qindex[threadIdx.y] = -1;
    
    // Start by making an array
    volatile __shared__ unsigned int sh_queue[MAX_QUEUE_LENGTH];
    
    // Fill the queue
    for(int i=0; i<PIXELS_PER_THREAD; i++)
    {
        int const xx = i*THREADS_X + x;
    
        // Read one image pixel from global memory
        unsigned char const pixel = image[y*IMG_WIDTH + xx];
        unsigned int  const queue_val = (y << 16) + xx;
    
        if(pixel)
        {
            do {
                qindex++;
                sh_qindex[threadIdx.y] = qindex;
                sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] = queue_val;
            } while (sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] != queue_val);
        }
    
        // Reload index from smem (last thread to write to smem will have updated it)
        qindex = sh_qindex[threadIdx.y];
    }
    
    // Let thread 0 reserve the space required in the global list
    __syncthreads();
    if(threadIdx.x == 0 && threadIdx.y == 0)
    {
        // Find how many items are stored in each list
        int total_index = 0;
        #pragma unroll
        for(int i=0; i<THREADS_Y; i++)
        {
            sh_qstart[i] = total_index;
            total_index += (sh_qindex[i] + 1u);
        }
    
        // Calculate the offset in the global list
        unsigned int global_offset = atomicAdd(glob_index, total_index);
        #pragma unroll
        for(int i=0; i<THREADS_Y; i++)
        {
            sh_qstart[i] += global_offset;
        }
    
    }
    __syncthreads();
    
    // Copy local queues to global queue
    /*
    for(int i=0; i<=qindex; i+=THREADS_X)
    {
        if(i + threadIdx.x > qindex)
            break;
    
        unsigned int qvalue = sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + i + threadIdx.x];
        list[sh_qstart[threadIdx.y] + i + threadIdx.x] = qvalue;
    }
    */
    }
    
    void call2DTo1DKernel(unsigned char const * const h_image)
    {
        // Device side allocation
        unsigned char *d_image = NULL;
        unsigned int *d_list = NULL;
        int h_nlist=0, *d_nlist = NULL;
        cudaMalloc((void**)&d_image, sizeof(unsigned char)*IMG_SIZE);
        cudaMalloc((void**)&d_list, sizeof(unsigned int)*IMG_SIZE);
        cudaMalloc((void**)&d_nlist, sizeof(int));
        cudaCheckErrors("cudamalloc fail");
    
        // Time measurement initialization
        cudaEvent_t start, stop, startio, stopio;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventCreate(&startio);
        cudaEventCreate(&stopio);
        float et, etio;
    
        // Start timer w/ io
        cudaEventRecord(startio,0);
        cudaMemcpy(d_nlist, &h_nlist, sizeof(int), cudaMemcpyHostToDevice);
        // Copy image data to device
        cudaMemcpy((void*)d_image, (void*)h_image, sizeof(unsigned char)*IMG_SIZE,    cudaMemcpyHostToDevice);
        cudaCheckErrors("cudamemcpy 1");
        // Start timer
        cudaEventRecord(start,0);
    
        // Kernel call
        // Phase 1 : Convert 2D binary image to 1D pixel array
        dim3 dimBlock1(THREADS_X, THREADS_Y);
        dim3 dimGrid1(BLOCKS_X, BLOCKS_Y);
        convert2DEdgeImageTo1DArray<<<dimGrid1, dimBlock1>>>(d_image, d_list, d_nlist);
        cudaDeviceSynchronize();
        cudaCheckErrors("kernel fail");
        // Stop timer
        cudaEventRecord(stop,0);
        cudaEventSynchronize(stop);
    
        // Stop timer w/ io
        cudaEventRecord(stopio,0);
        cudaEventSynchronize(stopio);
    
        // Time measurement
        cudaEventElapsedTime(&et,start,stop);
        cudaEventElapsedTime(&etio,startio,stopio);
    
        // Time measurement deinitialization
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        cudaEventDestroy(startio);
        cudaEventDestroy(stopio);
    
        // Get list size
        cudaMemcpy((void*)&h_nlist, (void*)d_nlist, sizeof(int), cudaMemcpyDeviceToHost);
        cudaCheckErrors("cudaMemcpy 2");
        // Report on console
        printf("%d pixels processed...\n", h_nlist);
    
        // Device side dealloc
        cudaFree(d_image);
    //    cudaFree(d_space);
        cudaFree(d_list);
        cudaFree(d_nlist);
    }
    
    int main(){
    
      unsigned char *image;
    
      image = (unsigned char *)malloc(IMG_SIZE * sizeof(unsigned char));
      if (image == 0) {printf("malloc fail\n"); return 0;}
    
      for (int i =0 ; i<IMG_SIZE; i++)
        image[i] = rand()%2;
    
      call2DTo1DKernel(image);
      call2DTo1DKernel(image);
      call2DTo1DKernel(image);
      call2DTo1DKernel(image);
      call2DTo1DKernel(image);
      cudaCheckErrors("some error");
      return 0;
    }
    
    $ nvcc -arch=sm_20 -O3 -o t216 t216.cu
    $ ./t216
    32617 pixels processed...
    32617 pixels processed...
    32617 pixels processed...
    32617 pixels processed...
    32617 pixels processed...
    $ ./t216
    32617 pixels processed...
    32617 pixels processed...
    32617 pixels processed...
    32617 pixels processed...
    32617 pixels processed...
    $
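
A note on the second problem above: the fixed code zeroes the counter by copying h_nlist = 0 into d_nlist before the kernel launch (the first cudaMemcpy in call2DTo1DKernel). A minimal alternative sketch using cudaMemset (same effect, one call):

    // Zero the device-side counter before each launch; otherwise atomicAdd
    // accumulates onto whatever garbage the fresh allocation contains.
    cudaMemset(d_nlist, 0, sizeof(int));
    cudaCheckErrors("cudaMemset fail");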
    
Comments on the question:

  • Is there an actual question here? All I see is a mass of code and a list of symptoms. What exactly is wrong and, just as importantly, why is it wrong? What do you expect an answer to tell you? Help us to help you...
  • Basically, the number of pixels processed should be the same for the same image. The problem is that the printf gives me a different result each time. The program should read the image and put the edge pixels into a list that is smaller than the original image, so I should get the same number of pixels in the array each time. My question is, I guess, how can I stop this from happening? Hope this helps.
  • So, where in the code you show is houghKernel2_3_phase1?
  • Sorry, my mistake; I have renamed that method to convert2DEdgeImageTo1DArray.
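
One further aside: each queue entry packs the pixel coordinates as (y << 16) + x, which works because IMG_WIDTH and IMG_HEIGHT are 256 and both coordinates fit in 16 bits. A minimal host-side sketch for decoding an entry (unpackEdgePixel is a hypothetical helper, not part of the post):

    // Decode one packed queue entry back into (x, y).
    inline void unpackEdgePixel(unsigned int v, int &x, int &y)
    {
        x = (int)(v & 0xFFFFu);  // low 16 bits: x coordinate
        y = (int)(v >> 16);      // high 16 bits: y coordinate
    }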