C++ 用模板模式优化CUDA内核
在编写数字图像处理程序的过程中，我编写了一个运行缓慢的 CUDA 内核。代码如下：
// Per-pixel feature kernel: for every interior pixel whose inputArray mask is
// positive, convolves the kc2 x kc2 RGB neighbourhood of col_image with three
// filters held in __constant__ memory (constMat1/2/3, declared elsewhere in
// this file), builds per-channel gradient estimates fx/fy, forms the 2x2
// structure tensor g, and writes trace(g)^2 / det(g) to outputArray.
//
// Launch: 2D grid with one thread per pixel of the width x height image.
// outputArray is indexed on the shrunken (width - kc2)-wide interior grid;
// NOTE(review): the answer below suspects plain `width` was intended for the
// output row stride -- confirm against the host code before relying on it.
__global__ void Kernel ( int* inputArray, float* outputArray, float3* const col_image, int height, int width, int kc2 ) {
    // Accumulators: k<m><c> = sum over the window of constMat<m>[tap] * channel c.
    float k10 = 0.0f, k11 = 0.0f, k12 = 0.0f;
    float k20 = 0.0f, k21 = 0.0f, k22 = 0.0f;
    float k30 = 0.0f, k31 = 0.0f, k32 = 0.0f;
    int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    // Interior guard: the kc2 x kc2 window must lie fully inside the image.
    if ((xIndex < width - kc2/2) && (xIndex >= kc2/2) && (yIndex < height - kc2/2) && (yIndex >= kc2/2))
    {
        int idx0 = yIndex * width + xIndex;
        if (inputArray[idx0] > 0)   // process only masked-in pixels
        {
            for (int i = 0; i < kc2; i++)
            {
                for (int j = 0; j < kc2; j++)
                {
                    int idx1 = (yIndex + i - kc2/2) * width + (xIndex + j - kc2/2);
                    float3 rgb = col_image[idx1];
                    int tap = i * kc2 + j;          // tap index shared by all three filters
                    float c1 = constMat1[tap];
                    float c2 = constMat2[tap];
                    float c3 = constMat3[tap];
                    k10 += c1 * rgb.x; k11 += c1 * rgb.y; k12 += c1 * rgb.z;
                    k20 += c2 * rgb.x; k21 += c2 * rgb.y; k22 += c2 * rgb.z;
                    k30 += c3 * rgb.x; k31 += c3 * rgb.y; k32 += c3 * rgb.z;
                }
            }
            // Per-channel gradient estimates scaled by the window size.
            float fx[3], fy[3];
            fx[0] = kc2 * (k30 - k20);
            fx[1] = kc2 * (k31 - k21);
            fx[2] = kc2 * (k32 - k22);
            fy[0] = kc2 * (k10 - k20);
            fy[1] = kc2 * (k11 - k21);
            fy[2] = kc2 * (k12 - k22);
            // Symmetric 2x2 structure tensor g = [fx.fx  fx.fy; fx.fy  fy.fy].
            float g[2][2];
            g[0][0] = fx[0] * fx[0] + fx[1] * fx[1] + fx[2] * fx[2];
            g[0][1] = fx[0] * fy[0] + fx[1] * fy[1] + fx[2] * fy[2];
            g[1][0] = g[0][1];
            g[1][1] = fy[0] * fy[0] + fy[1] * fy[1] + fy[2] * fy[2]; // fixed: original line lacked the ';'
            float G = g[0][0] * g[1][1] - g[0][1] * g[1][0];  // det(g)
            float h = g[0][0] + g[1][1];                      // trace(g)
            // Output lives on the (height - kc2) x (width - kc2) interior grid.
            int idx2 = (yIndex - kc2/2) * (width - kc2) + (xIndex - kc2/2);
            outputArray[idx2] = (h * h) / G;  // NOTE(review): no guard against G == 0
        }
    }
}
然后我们计算特征fx,fy,g{ij},h,g,并将结果值写入outputArray的相应单元格中
重要的是,所有指定的数据都存储在全局内存中,并且输入数组可以足够大(大约4000万个点)。所有这些都直接影响内核的速度
我们如何加快这个内核的执行速度（欢迎使用任何技术：共享内存、纹理、模板（stencil）操作等）？
【回答】使用共享内存缓冲一块 col_image 供线程块使用（和重用），是这里的“标准”建议。
根据我的测试，它似乎提供了实质性的改进。由于你没有提供完整的代码、任何数据集或结果验证，我也将跳过这些。接下来是一个把共享内存加入现有代码的实现（未经完全测试）：把 col_image 的一块 (threadblockwidth+kc2)*(threadblockheight+kc2) 的“补丁”载入共享内存缓冲区；此后，在双重嵌套的 for 循环期间，数据从该共享内存缓冲区中读取。
像这样的2D共享内存模具操作是索引的练习,也是处理边缘情况的练习。您的代码稍微简单一点,我们只需要考虑“右边”和“向下”的边,就可以考虑将数据缓存到共享内存中的“光晕”。
我没有尝试验证此代码是否完美。然而,它应该为您提供一个如何实现2D共享内存缓冲系统的“路线图”,并为这项工作提供一些动力:我看到这样做的速度提高了约5倍,尽管YMMV,而且完全有可能我犯了一个性能错误
下面是一个工作示例,显示了Pascal Titan X、CUDA 8.0.61和Linux上的加速:
$ cat t390.cu
#include <cstdlib>
#include <stdio.h>
#include <iostream>
const int adim = 6000;
const int KC2 = 5;
const int thx = 32;
const int thy = 32;
__constant__ float constMat1[KC2*KC2];
__constant__ float constMat2[KC2*KC2];
__constant__ float constMat3[KC2*KC2];
// Same feature kernel as in the question; when compiled with -DUSE_SHARED it
// first stages a (thx + KC2) x (thy + KC2) tile of col_image (block tile plus
// a right/bottom halo) into shared memory so the kc2 x kc2 stencil re-reads
// come from on-chip storage instead of global memory.
// Expects blockDim == (thx, thy): the static shared tile is sized from those
// compile-time constants.
__global__ void Kernel ( int* inputArray, float* outputArray, float3* const col_image, int height, int width, int kc2 ) {
    // Accumulators: k<m><c> = sum over the window of constMat<m>[tap] * channel c.
    float k10 = 0.0f, k11 = 0.0f, k12 = 0.0f;
    float k20 = 0.0f, k21 = 0.0f, k22 = 0.0f;
    float k30 = 0.0f, k31 = 0.0f, k32 = 0.0f;
    int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    int idx0 = yIndex * width + xIndex;
#ifdef USE_SHARED
    // Block tile plus a (kc2 - 1)-wide halo on the right and bottom edges.
    // The host allocates col_image padded by kc2 in each dimension, so the
    // halo reads below stay in bounds.
    __shared__ float3 s_col_image[thy+KC2][thx+KC2];
    int WSIZE = kc2;
    // Bulk load: every thread copies its own pixel into the tile.
    // (fixed: the original guards compared the x index against height and the
    // y index against width -- harmless in the square test case, wrong in
    // general; likewise threadIdx.x was tested against the block *height*.)
    if ((xIndex < width + WSIZE) && (yIndex < height + WSIZE))
        s_col_image[threadIdx.y][threadIdx.x] = col_image[idx0];
    // Bottom halo: the last WSIZE-1 thread rows also copy the pixel WSIZE-1 rows down.
    if ((xIndex < width + WSIZE) && (yIndex < height) && (threadIdx.y > thy - WSIZE))
        s_col_image[threadIdx.y + (WSIZE-1)][threadIdx.x] = col_image[idx0 + (WSIZE-1)*width];
    // Right halo: the last WSIZE-1 thread columns also copy the pixel WSIZE-1 columns right.
    if ((xIndex < width) && (yIndex < height + WSIZE) && (threadIdx.x > thx - WSIZE))
        s_col_image[threadIdx.y][threadIdx.x + (WSIZE-1)] = col_image[idx0 + (WSIZE-1)*width + (WSIZE-1) - (WSIZE-1)*width];
    // Corner halo.
    if ((xIndex < width) && (yIndex < height) && (threadIdx.x > thx - WSIZE) && (threadIdx.y > thy - WSIZE))
        s_col_image[threadIdx.y + (WSIZE-1)][threadIdx.x + (WSIZE-1)] = col_image[idx0 + (WSIZE-1)*width + (WSIZE-1)];
    // Every tile write must land before any thread reads the tile; this
    // barrier is at uniform block scope (not inside a divergent branch).
    __syncthreads();
#endif
    if ((xIndex < width - kc2/2) && (xIndex >= kc2/2) && (yIndex < height - kc2/2) && (yIndex >= kc2/2))
    {
        if (inputArray[idx0] > 0)   // process only masked-in pixels
        {
            for (int i = 0; i < kc2; i++)
            {
                for (int j = 0; j < kc2; j++)
                {
#ifdef USE_SHARED
                    // fixed: the original read s_col_image[threadIdx.y][threadIdx.x]
                    // here, ignoring (i, j) -- every stencil tap fetched the same pixel.
                    // NOTE(review): this tile is anchored at (yIndex, xIndex) while the
                    // global path is anchored at (yIndex - kc2/2, xIndex - kc2/2), so
                    // the two paths differ by a kc2/2 shift -- confirm which is intended.
                    float3 rgb = s_col_image[threadIdx.y + i][threadIdx.x + j];
#else
                    int idx1 = (yIndex + i - kc2/2) * width + (xIndex + j - kc2/2);
                    float3 rgb = col_image[idx1];
#endif
                    int tap = i * kc2 + j;          // tap index shared by all three filters
                    float c1 = constMat1[tap];
                    float c2 = constMat2[tap];
                    float c3 = constMat3[tap];
                    k10 += c1 * rgb.x; k11 += c1 * rgb.y; k12 += c1 * rgb.z;
                    k20 += c2 * rgb.x; k21 += c2 * rgb.y; k22 += c2 * rgb.z;
                    k30 += c3 * rgb.x; k31 += c3 * rgb.y; k32 += c3 * rgb.z;
                }
            }
            // Per-channel gradient estimates scaled by the window size.
            float fx[3], fy[3];
            fx[0] = kc2 * (k30 - k20);
            fx[1] = kc2 * (k31 - k21);
            fx[2] = kc2 * (k32 - k22);
            fy[0] = kc2 * (k10 - k20);
            fy[1] = kc2 * (k11 - k21);
            fy[2] = kc2 * (k12 - k22);
            // Symmetric 2x2 structure tensor g = [fx.fx  fx.fy; fx.fy  fy.fy].
            float g[2][2];
            g[0][0] = fx[0] * fx[0] + fx[1] * fx[1] + fx[2] * fx[2];
            g[0][1] = fx[0] * fy[0] + fx[1] * fy[1] + fx[2] * fy[2];
            g[1][0] = g[0][1];
            g[1][1] = fy[0] * fy[0] + fy[1] * fy[1] + fy[2] * fy[2];
            float G = g[0][0] * g[1][1] - g[0][1] * g[1][0];  // det(g)
            float h = g[0][0] + g[1][1];                      // trace(g)
            // Output lives on the (height - kc2) x (width - kc2) interior grid
            // (see the indexing discussion below -- possibly `width` was meant).
            int idx2 = (yIndex - kc2/2) * (width - kc2) + (xIndex - kc2/2);
            outputArray[idx2] = (h * h) / G;  // NOTE(review): no guard against G == 0
        }
    }
}
// Abort with a message if a CUDA runtime call failed; without this, launch
// and allocation errors are silently dropped and the timing is meaningless.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

// Benchmark harness: allocate device buffers, launch Kernel once over an
// adim x adim image, and wait for completion.
// NOTE(review): constMat1/2/3 are never filled via cudaMemcpyToSymbol and the
// output is never copied back or validated -- this harness measures timing
// only, not correctness.
int main(){
    int height = adim;
    int width = adim;
    int kc2 = KC2;
    int *d_inputArray = NULL;
    float *d_outputArray = NULL;
    float3 *d_col_image = NULL;
    checkCuda(cudaMalloc(&d_inputArray, height*width*sizeof(int)), "cudaMalloc inputArray");
    // cudaMemset fills byte-wise: each int becomes 0x01010101 (> 0), so every
    // pixel passes the inputArray mask test inside the kernel.
    checkCuda(cudaMemset(d_inputArray, 1, height*width*sizeof(int)), "cudaMemset inputArray");
    // Padded by kc2 in each dimension so the shared-memory halo loads stay in bounds.
    checkCuda(cudaMalloc(&d_col_image, (height+kc2)*(width+kc2)*sizeof(float3)), "cudaMalloc col_image");
    checkCuda(cudaMalloc(&d_outputArray, height*width*sizeof(float)), "cudaMalloc outputArray");
    dim3 threads(thx, thy);
    // Ceil-divide so partial edge blocks are still launched.
    dim3 blocks((adim+threads.x-1)/threads.x, (adim+threads.y-1)/threads.y);
    Kernel<<<blocks,threads>>>( d_inputArray, d_outputArray, d_col_image, height, width, kc2 );
    checkCuda(cudaGetLastError(), "Kernel launch");          // launch-configuration errors
    checkCuda(cudaDeviceSynchronize(), "Kernel execution");  // asynchronous execution errors
    cudaFree(d_inputArray);
    cudaFree(d_col_image);
    cudaFree(d_outputArray);
    return 0;
}
$ nvcc -arch=sm_61 -o t390 t390.cu
$ cuda-memcheck ./t390
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t390
==1473== NVPROF is profiling process 1473, command: ./t390
==1473== Profiling application: ./t390
==1473== Profiling result:
Time(%) Time Calls Avg Min Max Name
97.29% 34.705ms 1 34.705ms 34.705ms 34.705ms Kernel(int*, float*, float3*, int, int, int)
2.71% 965.14us 1 965.14us 965.14us 965.14us [CUDA memset]
==1473== API calls:
Time(%) Time Calls Avg Min Max Name
88.29% 310.69ms 3 103.56ms 550.23us 309.46ms cudaMalloc
9.86% 34.712ms 1 34.712ms 34.712ms 34.712ms cudaDeviceSynchronize
1.05% 3.6801ms 364 10.110us 247ns 453.59us cuDeviceGetAttribute
0.70% 2.4483ms 4 612.07us 547.62us 682.25us cuDeviceTotalMem
0.08% 284.32us 4 71.079us 63.098us 79.616us cuDeviceGetName
0.01% 29.533us 1 29.533us 29.533us 29.533us cudaMemset
0.01% 21.189us 1 21.189us 21.189us 21.189us cudaLaunch
0.00% 5.2730us 12 439ns 253ns 1.1660us cuDeviceGet
0.00% 3.4710us 6 578ns 147ns 2.4820us cudaSetupArgument
0.00% 3.1090us 3 1.0360us 340ns 2.1660us cuDeviceGetCount
0.00% 1.0370us 1 1.0370us 1.0370us 1.0370us cudaConfigureCall
ubuntu@titanxp-DiGiTS-Dev-Box:~/bobc/misc$ nvcc -arch=sm_61 -o t390 t390.cu -DUSE_SHARED
ubuntu@titanxp-DiGiTS-Dev-Box:~/bobc/misc$ cuda-memcheck ./t390
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t390
==1545== NVPROF is profiling process 1545, command: ./t390
==1545== Profiling application: ./t390
==1545== Profiling result:
Time(%) Time Calls Avg Min Max Name
86.17% 5.4181ms 1 5.4181ms 5.4181ms 5.4181ms Kernel(int*, float*, float3*, int, int, int)
13.83% 869.94us 1 869.94us 869.94us 869.94us [CUDA memset]
==1545== API calls:
Time(%) Time Calls Avg Min Max Name
96.13% 297.15ms 3 99.050ms 555.80us 295.90ms cudaMalloc
1.76% 5.4281ms 1 5.4281ms 5.4281ms 5.4281ms cudaDeviceSynchronize
1.15% 3.5664ms 364 9.7970us 247ns 435.92us cuDeviceGetAttribute
0.86% 2.6475ms 4 661.88us 642.85us 682.42us cuDeviceTotalMem
0.09% 266.42us 4 66.603us 62.005us 77.380us cuDeviceGetName
0.01% 29.624us 1 29.624us 29.624us 29.624us cudaMemset
0.01% 19.147us 1 19.147us 19.147us 19.147us cudaLaunch
0.00% 4.8560us 12 404ns 248ns 988ns cuDeviceGet
0.00% 3.3390us 6 556ns 134ns 2.3510us cudaSetupArgument
0.00% 3.1190us 3 1.0390us 331ns 2.0780us cuDeviceGetCount
0.00% 1.1940us 1 1.1940us 1.1940us 1.1940us cudaConfigureCall
$
（关于上面标注的“possible indexing bug”：）我本来预期这里应该写成：
int idx2 = (yIndex - kc2/2) * width + (xIndex - kc2/2);
但我没有仔细推敲，所以也可能是我错了。
在将来,如果你想在这样的问题上得到帮助,我建议你至少提供我所拥有的完整的代码框架和描述。提供一个完整的代码,其他人可以立即获取和测试,而无需编写自己的代码。还要定义您所在的平台以及您的绩效衡量标准
非常感谢您提供了详细的答案,以及我将在测试后编写的代码。
int idx2 = (yIndex - kc2/2) * (width - kc2) + (xIndex - kc2/2);
int idx2 = (yIndex - kc2/2) * width + (xIndex - kc2/2);