C++: optimizing a CUDA kernel with a stencil pattern

In the course of writing a digital image processing program, I ended up with a CUDA kernel that runs slowly. The code is as follows:

__global__ void Kernel ( int* inputArray, float* outputArray, float3* const col_image, int height, int width, int kc2 ) {
    float G, h;
    float fx[3];
    float fy[3];
    float g[2][2];
    float k10 = 0.0;
    float k11 = 0.0;
    float k12 = 0.0;
    float k20 = 0.0;
    float k21 = 0.0;
    float k22 = 0.0;
    float k30 = 0.0;
    float k31 = 0.0;
    float k32 = 0.0;

    int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int yIndex = blockIdx.y * blockDim.y + threadIdx.y;

    if ((xIndex < width - kc2/2) && (xIndex >= kc2/2) && (yIndex < height - kc2/2) && (yIndex >= kc2/2))
    {
        int idx0 = yIndex * width + xIndex;
        if (inputArray[idx0] > 0)
        {
            for (int i = 0; i < kc2; i++)
            {
                for (int j = 0; j < kc2; j++)
                {
                    int idx1 = (yIndex + i - kc2/2) * width + (xIndex + j - kc2/2);
                    float3 rgb = col_image[idx1];
                    k10 = k10 + constMat1[i * kc2 + j] * rgb.x;
                    k11 = k11 + constMat1[i * kc2 + j] * rgb.y;
                    k12 = k12 + constMat1[i * kc2 + j] * rgb.z;

                    k20 = k20 + constMat2[i * kc2 + j] * rgb.x;
                    k21 = k21 + constMat2[i * kc2 + j] * rgb.y;
                    k22 = k22 + constMat2[i * kc2 + j] * rgb.z;

                    k30 = k30 + constMat3[i * kc2 + j] * rgb.x;
                    k31 = k31 + constMat3[i * kc2 + j] * rgb.y;
                    k32 = k32 + constMat3[i * kc2 + j] * rgb.z;
                }
            }
            fx[0] = kc2 * (k30 - k20);
            fx[1] = kc2 * (k31 - k21);
            fx[2] = kc2 * (k32 - k22);
            fy[0] = kc2 * (k10 - k20);
            fy[1] = kc2 * (k11 - k21);
            fy[2] = kc2 * (k12 - k22);

            g[0][0] = fx[0] * fx[0] + fx[1] * fx[1] + fx[2] * fx[2];
            g[0][1] = fx[0] * fy[0] + fx[1] * fy[1] + fx[2] * fy[2];
            g[1][0] = g[0][1];
            g[1][1] = fy[0] * fy[0] + fy[1] * fy[1] + fy[2] * fy[2];
            G = g[0][0] * g[1][1] - g[0][1] * g[1][0];
            h = g[0][0] + g[1][1];

            // Output
            int idx2 = (yIndex - kc2/2) * (width - kc2) + (xIndex - kc2/2);
            outputArray[idx2] = (h * h) / G;
        }
    }
}
We then compute the quantities fx, fy, g[i][j], h and G, and write the resulting value into the corresponding cell of outputArray.
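
For reference, in matrix form this is (simply restating what the kernel above computes, not new information):

$$
g=\begin{pmatrix} f_x\cdot f_x & f_x\cdot f_y \\ f_x\cdot f_y & f_y\cdot f_y \end{pmatrix},\qquad G=\det g,\qquad h=\operatorname{tr} g,\qquad \text{output}=\frac{h^{2}}{G},
$$

where the dot products are taken over the three color channels.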

Importantly, all of this data is stored in global memory, and the input arrays can be quite large (around 40 million points). All of this directly affects the kernel's speed.
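
To put a rough number on that (my own back-of-envelope estimate, taking kc2 = 5 as in the test case in the answer below): every output pixel reads kc2 * kc2 = 25 float3 values, i.e. 300 bytes of col_image, so a naive pass over ~40 million points issues on the order of 12 GB of global-memory reads for col_image alone, most of them re-fetching values that neighboring threads have already loaded.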

How can the execution of this kernel be sped up? Any technique is welcome: shared memory, textures, a stencil pattern, and so on.

The "standard" suggestion I would make here is to use shared memory to buffer a patch of col_image for use (and reuse) by the threadblock.

Based on my testing, it seems to provide a substantial improvement. Since you haven't provided a complete code, or any kind of data set or results validation, I'm going to skip all of that as well. What follows is a not-really-tested implementation of shared memory grafted onto your existing code, which "buffers" a (threadblockwidth + kc2) * (threadblockheight + kc2) "patch" of col_image into a shared memory buffer. Thereafter, during the doubly-nested for-loops, the data is read from the shared memory buffer.

A 2D shared-memory stencil operation like this is an exercise in indexing as well as an exercise in handling edge cases. Your code is slightly simpler in that we only need to consider the "halo" to the "right" and "below" when caching data into shared memory. I have not tried to verify that this code is perfect; however, it should give you a "roadmap" for how to implement a 2D shared-memory buffering scheme, and some motivation for the effort: I witnessed roughly a 5x speedup from doing it, although YMMV, and it's entirely possible I've made a performance mistake somewhere.
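
As a quick resource sanity check (my own arithmetic, not part of the original answer): with thx = thy = 32 and KC2 = 5 as in the code below, the buffered tile is (32 + 5) x (32 + 5) float3 values, i.e. 37 * 37 * 12 bytes ≈ 16 KB of shared memory per block, comfortably under the 48 KB per-block limit on this GPU.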

Here is a worked example showing the speedup, on a Pascal Titan X, CUDA 8.0.61, Linux:

$ cat t390.cu
#include <stdio.h>
#include <iostream>

const int adim = 6000;
const int KC2 = 5;
const int thx = 32;
const int thy = 32;
__constant__ float constMat1[KC2*KC2];
__constant__ float constMat2[KC2*KC2];
__constant__ float constMat3[KC2*KC2];

__global__ void Kernel ( int* inputArray, float* outputArray, float3* const col_image, int height, int width, int kc2 ) {
    float G, h;
    float fx[3];
    float fy[3];
    float g[2][2];
    float k10 = 0.0;
    float k11 = 0.0;
    float k12 = 0.0;
    float k20 = 0.0;
    float k21 = 0.0;
    float k22 = 0.0;
    float k30 = 0.0;
    float k31 = 0.0;
    float k32 = 0.0;

    int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    int idx0 = yIndex * width + xIndex;

#ifdef USE_SHARED
    __shared__ float3 s_col_image[thy+KC2][thx+KC2];
    int idx = xIndex;
    int idy = yIndex;
    int DATAHSIZE= height;
    int WSIZE = kc2;
    int DATAWSIZE = width;
    float3 *input = col_image;
    int BLKWSIZE = thx;
    int BLKHSIZE = thy;
    // each thread loads the col_image element at its own (x, y) position
    if ((idx < DATAHSIZE+WSIZE) && (idy < DATAWSIZE+WSIZE))
      s_col_image[threadIdx.y][threadIdx.x]=input[idx0];
    // threads in the last rows of the block also load the "down" halo rows
    if ((idx < DATAHSIZE+WSIZE) && (idy < DATAWSIZE) && (threadIdx.y > BLKWSIZE - WSIZE))
      s_col_image[threadIdx.y + (WSIZE-1)][threadIdx.x] = input[idx0+(WSIZE-1)*width];
    // threads in the last columns of the block also load the "right" halo columns
    if ((idx < DATAHSIZE) && (idy < DATAWSIZE+WSIZE) && (threadIdx.x > BLKHSIZE - WSIZE))
      s_col_image[threadIdx.y][threadIdx.x + (WSIZE-1)] = input[idx0+(WSIZE-1)];
    // and the lower-right corner of the halo
    if ((idx < DATAHSIZE) && (idy < DATAWSIZE) && (threadIdx.x > BLKHSIZE - WSIZE) && (threadIdx.y > BLKWSIZE - WSIZE))
      s_col_image[threadIdx.y + (WSIZE-1)][threadIdx.x + (WSIZE-1)] = input[idx0+(WSIZE-1)*width + (WSIZE-1)];
    __syncthreads();
#endif


    if ((xIndex < width - kc2/2) && (xIndex >= kc2/2) && (yIndex < height - kc2/2) && (yIndex >= kc2/2))
    {
        if (inputArray[idx0] > 0)
        {
            for (int i = 0; i < kc2; i++)
            {
                for (int j = 0; j < kc2; j++)
                {
#ifdef USE_SHARED
                    // index into the shared tile with the stencil offsets (i, j)
                    float3 rgb = s_col_image[threadIdx.y + i][threadIdx.x + j];
#else
                    int idx1 = (yIndex + i - kc2/2) * width + (xIndex + j - kc2/2);
                    float3 rgb = col_image[idx1];
#endif
                    k10 = k10 + constMat1[i * kc2 + j] * rgb.x;
                    k11 = k11 + constMat1[i * kc2 + j] * rgb.y;
                    k12 = k12 + constMat1[i * kc2 + j] * rgb.z;

                    k20 = k20 + constMat2[i * kc2 + j] * rgb.x;
                    k21 = k21 + constMat2[i * kc2 + j] * rgb.y;
                    k22 = k22 + constMat2[i * kc2 + j] * rgb.z;

                    k30 = k30 + constMat3[i * kc2 + j] * rgb.x;
                    k31 = k31 + constMat3[i * kc2 + j] * rgb.y;
                    k32 = k32 + constMat3[i * kc2 + j] * rgb.z;
                }
            }
            fx[0] = kc2 * (k30 - k20);
            fx[1] = kc2 * (k31 - k21);
            fx[2] = kc2 * (k32 - k22);
            fy[0] = kc2 * (k10 - k20);
            fy[1] = kc2 * (k11 - k21);
            fy[2] = kc2 * (k12 - k22);

            g[0][0] = fx[0] * fx[0] + fx[1] * fx[1] + fx[2] * fx[2];
            g[0][1] = fx[0] * fy[0] + fx[1] * fy[1] + fx[2] * fy[2];
            g[1][0] = g[0][1];
            g[1][1] = fy[0] * fy[0] + fy[1] * fy[1] + fy[2] * fy[2]; // had a missing semicolon
            G = g[0][0] * g[1][1] - g[0][1] * g[1][0];
            h = g[0][0] + g[1][1];

            // Output
            int idx2 = (yIndex - kc2/2) * (width - kc2) + (xIndex - kc2/2); // possible indexing bug here
            outputArray[idx2] = (h * h) / G;
        }
    }
}

int main(){

  int *d_inputArray;
  int height = adim;
  int width = adim;
  float *d_outputArray;
  float3 *d_col_image;
  int kc2 = KC2;
  cudaMalloc(&d_inputArray, height*width*sizeof(int));
  cudaMemset(d_inputArray, 1, height*width*sizeof(int)); // every byte set to 1, so each int is > 0 and the kernel body runs for all pixels
  cudaMalloc(&d_col_image, (height+kc2)*(width+kc2)*sizeof(float3));
  cudaMalloc(&d_outputArray, height*width*sizeof(float));
  dim3 threads(thx,thy);
  dim3 blocks((adim+threads.x-1)/threads.x, (adim+threads.y-1)/threads.y);
  Kernel<<<blocks,threads>>>( d_inputArray, d_outputArray, d_col_image, height, width, kc2 );
  cudaDeviceSynchronize();
}
$ nvcc -arch=sm_61 -o t390 t390.cu
$ cuda-memcheck ./t390
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t390
==1473== NVPROF is profiling process 1473, command: ./t390
==1473== Profiling application: ./t390
==1473== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 97.29%  34.705ms         1  34.705ms  34.705ms  34.705ms  Kernel(int*, float*, float3*, int, int, int)
  2.71%  965.14us         1  965.14us  965.14us  965.14us  [CUDA memset]

==1473== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 88.29%  310.69ms         3  103.56ms  550.23us  309.46ms  cudaMalloc
  9.86%  34.712ms         1  34.712ms  34.712ms  34.712ms  cudaDeviceSynchronize
  1.05%  3.6801ms       364  10.110us     247ns  453.59us  cuDeviceGetAttribute
  0.70%  2.4483ms         4  612.07us  547.62us  682.25us  cuDeviceTotalMem
  0.08%  284.32us         4  71.079us  63.098us  79.616us  cuDeviceGetName
  0.01%  29.533us         1  29.533us  29.533us  29.533us  cudaMemset
  0.01%  21.189us         1  21.189us  21.189us  21.189us  cudaLaunch
  0.00%  5.2730us        12     439ns     253ns  1.1660us  cuDeviceGet
  0.00%  3.4710us         6     578ns     147ns  2.4820us  cudaSetupArgument
  0.00%  3.1090us         3  1.0360us     340ns  2.1660us  cuDeviceGetCount
  0.00%  1.0370us         1  1.0370us  1.0370us  1.0370us  cudaConfigureCall
ubuntu@titanxp-DiGiTS-Dev-Box:~/bobc/misc$ nvcc -arch=sm_61 -o t390 t390.cu -DUSE_SHARED
ubuntu@titanxp-DiGiTS-Dev-Box:~/bobc/misc$ cuda-memcheck ./t390
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t390
==1545== NVPROF is profiling process 1545, command: ./t390
==1545== Profiling application: ./t390
==1545== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 86.17%  5.4181ms         1  5.4181ms  5.4181ms  5.4181ms  Kernel(int*, float*, float3*, int, int, int)
 13.83%  869.94us         1  869.94us  869.94us  869.94us  [CUDA memset]

==1545== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 96.13%  297.15ms         3  99.050ms  555.80us  295.90ms  cudaMalloc
  1.76%  5.4281ms         1  5.4281ms  5.4281ms  5.4281ms  cudaDeviceSynchronize
  1.15%  3.5664ms       364  9.7970us     247ns  435.92us  cuDeviceGetAttribute
  0.86%  2.6475ms         4  661.88us  642.85us  682.42us  cuDeviceTotalMem
  0.09%  266.42us         4  66.603us  62.005us  77.380us  cuDeviceGetName
  0.01%  29.624us         1  29.624us  29.624us  29.624us  cudaMemset
  0.01%  19.147us         1  19.147us  19.147us  19.147us  cudaLaunch
  0.00%  4.8560us        12     404ns     248ns     988ns  cuDeviceGet
  0.00%  3.3390us         6     556ns     134ns  2.3510us  cudaSetupArgument
  0.00%  3.1190us         3  1.0390us     331ns  2.0780us  cuDeviceGetCount
  0.00%  1.1940us         1  1.1940us  1.1940us  1.1940us  cudaConfigureCall
$
(The "possible indexing bug here" comment in the code refers to your output index calculation, given that I would have expected this instead:

    int idx2 = (yIndex - kc2/2) * width + (xIndex - kc2/2);
but I haven't thought about it carefully, so I may be wrong.)
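
One further note on the test program: it only exercises timing, so it never puts real data into the __constant__ coefficient arrays or into d_col_image. As a minimal sketch of how the coefficients might be loaded in a real program (my illustration, not part of the answer; h_mat1/h_mat2/h_mat3 are hypothetical host-side arrays):

    // copy the kc2 x kc2 filter coefficients from host arrays into __constant__ memory
    float h_mat1[KC2*KC2], h_mat2[KC2*KC2], h_mat3[KC2*KC2];
    // ... fill h_mat1, h_mat2, h_mat3 with the actual coefficients ...
    cudaMemcpyToSymbol(constMat1, h_mat1, KC2*KC2*sizeof(float));
    cudaMemcpyToSymbol(constMat2, h_mat2, KC2*KC2*sizeof(float));
    cudaMemcpyToSymbol(constMat3, h_mat3, KC2*KC2*sizeof(float));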

  • In the future, if you want help with a question like this, I suggest providing at least as complete a code framework and description as I have here: a complete code that others can pick up and test immediately without having to write their own, plus a statement of the platform you are on and how you are measuring performance.


  • Thank you very much for the detailed answer and for the code, which I will look into after testing. Regarding the output index: outputArray is laid out with a row stride of (width - kc2), which is why I used

        int idx2 = (yIndex - kc2/2) * (width - kc2) + (xIndex - kc2/2);

    rather than

        int idx2 = (yIndex - kc2/2) * width + (xIndex - kc2/2);
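
(A hedged aside on that layout, not taken from the thread: if outputArray holds one value per interior pixel only, its allocation would match the (width - kc2) row stride, e.g.

        cudaMalloc(&d_outputArray, (height - kc2) * (width - kc2) * sizeof(float));

The standalone test case above simply allocates height * width floats, which is why either idx2 variant stays in bounds there.)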