优化 OpenCL 内核 (Optimizing an OpenCL kernel)
我正在尝试优化这个内核。这个内核的 CPU 版本比 GPU 版本快 4 倍，而我希望 GPU 版本更快。这可能是因为我们有很多内存访问，这就是性能很低的原因。我使用的是 Intel HD 2500 和 OpenCL 1.2。GPU 内核如下：
/*
 * Per-byte 2-bit plane shuffle with a LUT remap.
 *
 * Each work-item handles one byte position: it reads that byte from the
 * four input planes, gathers bits from them into four intermediate
 * bytes, remaps each intermediate byte through a 256-entry lookup
 * table, and scatters 2-bit groups of the remapped bytes back into the
 * four output planes.
 *
 * NOTE(review): the gather masks (0xaa, 0x0d, 0x02) are not symmetric
 * with the scatter masks (0xc0, 0x30, 0x0c, 0x03) used for the outputs
 * — confirm they are intentional and not typos.  The `size` parameter
 * is never used.
 */
__kernel void mykernel(__global unsigned char *inp1,
__global unsigned char *inp2,
__global unsigned char *inp3,
__global unsigned char *inp4,
__global unsigned char *outp1,
__global unsigned char *outp2,
__global unsigned char *outp3,
__global unsigned char *outp4,
__global unsigned char *lut,
uint size
)
{
    // Flatten the (column, row) work-item coordinates to a byte index.
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const int row_pitch = get_global_size(0);
    const uint idx = row * row_pitch + col;

    // One byte from each input plane.
    const unsigned char a = inp1[idx];
    const unsigned char b = inp2[idx];
    const unsigned char c = inp3[idx];
    const unsigned char d = inp4[idx];

    // Gather bits from the four inputs into four intermediate bytes.
    unsigned char t1 = (a & 0xff) | (b>>2 & 0xaa) | (c>>4 & 0x0d) | (d>>6 & 0x02);
    unsigned char t2 = (a<<2 & 0xff) | (b & 0xaa) | (c>>2 & 0x0d) | (d>>4 & 0x02);
    unsigned char t3 = (a<<4 & 0xff) | (b<<2 & 0xaa) | (c & 0x0d) | (d>>2 & 0x02);
    unsigned char t4 = (a<<6 & 0xff) | (b<<4 & 0xaa) | (c<<2 & 0x0d) | (d & 0x02);

    // Remap each intermediate byte through the lookup table.
    t1 = lut[t1];
    t2 = lut[t2];
    t3 = lut[t3];
    t4 = lut[t4];

    // Scatter 2-bit groups of the remapped bytes into the output planes:
    // outpN receives the N-th 2-bit group (from the top) of t1..t4.
    outp1[idx] = (t1 & 0xc0) | ((t2 & 0xc0) >> 2) | ((t3 & 0xc0) >> 4) | ((t4 & 0xc0) >> 6);
    outp2[idx] = ((t1 & 0x30) << 2) | (t2 & 0x30) | ((t3 & 0x30) >> 2) | ((t4 & 0x30) >> 4);
    outp3[idx] = ((t1 & 0x0c) << 4) | ((t2 & 0x0c) << 2) | (t3 & 0x0c) | ((t4 & 0x0c) >> 2);
    outp4[idx] = ((t1 & 0x03) << 6) | ((t2 & 0x03) << 4) | ((t3 & 0x03) << 2) | (t4 & 0x03);
}
LocalWorkSize可以在1到256之间变化
for LocalWorkSize = 1 I have
CPU = 0.067Sec
GPU = 0.20Sec
for LocalWorkSize = 256 I have
CPU = 0.067Sec
GPU = 0.34Sec
这真的很奇怪。你能告诉我为什么我会得到这些奇怪的数字吗?关于如何优化这个内核,你有什么建议吗
我的主要观点如下:
// Host program: selects the first platform's GPU, builds the kernel
// source, copies four input planes plus a 256-byte LUT to the device,
// binds the kernel arguments, and times EXECUTION_TIMES kernel launches
// with clock().  Array allocation/initialization and the declarations
// of h_A*/d_A*/d_B*/mem_size_cpy/kernelSource_ms are elided here.
int main(int argc, char** argv)
{
int err,err1,j,i; // error code returned from api calls and other
clock_t start, end; // measuring performance variables
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program_ms_naive; // compute program
cl_kernel kernel_ms_naive; // compute kernel
// ... dynamically allocate arrays
// ... initialize arrays
cl_uint dev_cnt = 0;
clGetPlatformIDs(0, 0, &dev_cnt);
cl_platform_id platform_ids[100];
clGetPlatformIDs(dev_cnt, platform_ids, NULL);
// Connect to a compute device
err = clGetDeviceIDs(platform_ids[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
// Create a compute context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
// Create a command queue
commands = clCreateCommandQueue(context, device_id, 0, &err);
// Create the compute programs from the source file
program_ms_naive = clCreateProgramWithSource(context, 1, (const char **) &kernelSource_ms, NULL, &err);
// Build the programs executable
err = clBuildProgram(program_ms_naive, 0, NULL, NULL, NULL, NULL);
// Create the compute kernel in the program we wish to run
// NOTE(review): the kernel is created as "ms_naive", but the kernel
// listed earlier is named "mykernel" — confirm the real source string
// uses a matching name.
kernel_ms_naive = clCreateKernel(program_ms_naive, "ms_naive", &err);
// CL_MEM_COPY_HOST_PTR copies the host arrays into the buffers at
// creation time; each plane is a quarter of mem_size_cpy bytes.
d_A1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A1, &err);
d_A2 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A2, &err);
d_A3 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A3, &err);
d_A4 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A4, &err);
d_lut = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 256, h_ltable, &err);
d_B1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
d_B2 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
d_B3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
d_B4 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
// NOTE(review): size_b is passed as kernel argument 9 (`uint size`),
// but the kernel shown above never reads it.
int size = YCOLUMNS*XROWS/4;
int size_b = size * 4;
err = clSetKernelArg(kernel_ms_naive, 0, sizeof(cl_mem), (void *)&(d_A1));
err |= clSetKernelArg(kernel_ms_naive, 1, sizeof(cl_mem), (void *)&(d_A2));
err |= clSetKernelArg(kernel_ms_naive, 2, sizeof(cl_mem), (void *)&(d_A3));
err |= clSetKernelArg(kernel_ms_naive, 3, sizeof(cl_mem), (void *)&(d_A4));
err |= clSetKernelArg(kernel_ms_naive, 4, sizeof(cl_mem), (void *)&d_B1);
err |= clSetKernelArg(kernel_ms_naive, 5, sizeof(cl_mem), (void *)&(d_B2));
err |= clSetKernelArg(kernel_ms_naive, 6, sizeof(cl_mem), (void *)&(d_B3));
err |= clSetKernelArg(kernel_ms_naive, 7, sizeof(cl_mem), (void *)&(d_B4));
err |= clSetKernelArg(kernel_ms_naive, 8, sizeof(cl_mem), (void *)&d_lut); //__global
err |= clSetKernelArg(kernel_ms_naive, 9, sizeof(cl_uint), (void *)&size_b);
// The launch is 1-D even though the kernel indexes with both
// get_global_id(0) and get_global_id(1); with one dimension,
// get_global_id(1) is 0 so id == x.  localWorkSize must divide
// globalWorkSize or clEnqueueNDRangeKernel fails with
// CL_INVALID_WORK_GROUP_SIZE.
size_t localWorkSize[1], globalWorkSize[1];
localWorkSize[0] = 256;
globalWorkSize[0] = XROWS*YCOLUMNS;
// clock() measures host CPU time around the enqueue/finish loop, so it
// includes driver and queue overhead, not just kernel execution.
start = clock();
for (i=0;i< EXECUTION_TIMES;i++)
{
err1 = clEnqueueNDRangeKernel(commands, kernel_ms_naive, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
err = clFinish(commands);
}
end = clock();
// NOTE(review): err/err1 are never checked and no clRelease*/free
// cleanup is done — acceptable for a snippet, a leak in real code.
return 0;
}
int main(int argc,char**argv)
{
int err,err1,j,i;//从api调用和其他调用返回的错误代码
时钟\u t开始,结束;//测量性能变量
cl\U设备\U id设备\U id;//计算设备id
cl_context context;//计算上下文
cl_命令_队列命令;//计算命令队列
cl_program program_ms_naive;//计算程序
cl_kernel kernel_ms_naive;//计算内核
//…动态分配阵列
//…初始化数组
cl_uint dev_cnt=0;
clGetPlatformIDs(0、0和开发工具);
cl_平台识别码平台识别码[100];
clGetPlatformIDs(开发人员,平台ID,空);
//连接到计算设备
err=CLGetDeviceID(平台\u id[0],CL\u设备\u类型\u GPU,1,&设备\u id,NULL);
//创建一个计算上下文
context=clCreateContext(0、1和设备id、NULL、NULL和err);
//创建命令队列
commands=clCreateCommandQueue(上下文、设备id、0和错误);
//从源文件创建计算程序
program_ms_naive=clCreateProgramWithSource(上下文,1,(常量字符**)和kernelSource_ms,NULL和err);
//构建可执行的程序
err=clBuildProgram(program_ms_naive,0,NULL,NULL,NULL);
//在我们希望运行的程序中创建计算内核
kernel_-ms_-naive=clCreateKernel(program_-ms_-naive,“ms_-naive”和&err);
d_A1=clCreateBuffer(上下文、CL_MEM_READ_ONLY、CL_MEM_COPY_HOST_PTR、MEM_size_cpy/4、h_A1和err);
d_A2=clCreateBuffer(上下文、CL_MEM_READ_ONLY、CL_MEM_COPY_HOST_PTR、MEM_size_cpy/4、h_A2和err);
d_A3=clCreateBuffer(上下文、CL_MEM_只读、CL_MEM_复制、主机、内存大小、cpy/4、h_A3和err);
d_A4=clCreateBuffer(上下文、CL_MEM_READ_ONLY、CL_MEM_COPY_HOST_PTR、MEM_size_cpy/4、h_A4和err);
d_lut=clCreateBuffer(上下文、CL_MEM_READ_ONLY、CL_MEM_COPY_HOST_PTR、256、h_ltable和err);
d_B1=clCreateBuffer(上下文、仅CL_MEM_WRITE_、MEM_size_cpy/4、NULL和err);
d_B2=clCreateBuffer(上下文、仅CL_MEM_WRITE_、MEM_size_cpy/4、NULL和err);
d_B3=clCreateBuffer(上下文、仅CL_MEM_WRITE_、MEM_size_cpy/4、NULL和err);
d_B4=clCreateBuffer(上下文、CL_MEM_WRITE_ONLY、MEM_size_cpy/4、NULL和err);
int size=y列*x行/4;
int size_b=大小*4;
err=clSetKernelArg(kernel_ms_naive,0,sizeof(cl_mem),(void*)和(d_A1));
err |=clSetKernelArg(kernel_ms_naive,1,sizeof(cl_mem),(void*)和(d_A2));
err |=clSetKernelArg(kernel_ms_naive,2,sizeof(cl_mem),(void*)和(d_A3));
err |=clSetKernelArg(kernel_ms_naive,3,sizeof(cl_mem),(void*)和(d_A4));
err |=clSetKernelArg(kernel_ms_naive,4,sizeof(cl_mem),(void*)和d_B1);
err |=clSetKernelArg(kernel_ms_naive,5,sizeof(cl_mem),(void*)和(d_B2));
err |=clSetKernelArg(kernel_ms_naive,6,sizeof(cl_mem),(void*)和(d_B3));
err |=clSetKernelArg(kernel_ms_naive,7,sizeof(cl_mem),(void*)和(d_B4));
err |=clSetKernelArg(kernel_ms_naive,8,sizeof(cl_mem),(void*)和d_lut);//全局
err |=clSetKernelArg(kernel_ms_naive,9,sizeof(cl_uint),(void*)和size_b);
大小\u t本地工作大小[1],全局工作大小[1];
localWorkSize[0]=256;
globalWorkSize[0]=X行*Y列;
开始=时钟();
对于(i=0;i<执行次数;i++)
{
err1=clEnqueueNDRangeKernel(命令,kernel_ms_naive,1,NULL,globalWorkSize,localWorkSize,0,NULL,NULL);
err=clFinish(命令);
}
结束=时钟();
返回0;
}
恒定内存用于向所有工作项广播少量值,其作用类似于恒定私有寄存器,因此访问速度非常快。普通GPU设备可以支持高达16kb的恒定内存。应该足够容纳LUT
您可以尝试使用恒定内存,作为解决全局访问瓶颈的简单方法:
// Suggested signature from the answer: inputs marked const, and the
// 256-byte LUT moved from __global to __constant memory so its reads
// are served by the constant cache instead of hitting global memory.
// (Kernel body elided in the answer.)
__kernel void mykernel(const __global unsigned char *inp1,
const __global unsigned char *inp2,
const __global unsigned char *inp3,
const __global unsigned char *inp4,
__global unsigned char *outp1,
__global unsigned char *outp2,
__global unsigned char *outp3,
__global unsigned char *outp4,
__constant unsigned char *lut,
uint size
)
{
...
}
但正确的解决方案是重塑代码:
- 使用char4的向量而不是4个不同的缓冲区(因为 打破合并)[它可以给你一个高达x4的巨大提升]
- 对向量进行操作[轻微提升]
- 对LUT使用本地/常量内存[它可以减少LUT的1个非合并读取,可能是2x-3x]
但是,由于IO限制太大,很难击败CPU方法。请发布一篇文章。本地工作大小应该是256ish,而不是1。1=最低硬件占用率和最低性能。可能需要最少8个或8的倍数。如果将
CL_MEM_COPY_HOST_PTR
更改为 CL_MEM_USE_HOST_PTR
,该怎么办?也可能是内核做得太少,并且在lut
中随机访问全局内存。尝试在内核中添加更多要做的工作,并在本地内存中缓存lut
,以便更快地访问。我是否必须将lut作为全局传递,然后在内核内部将其复制到本地数组中?是的,你需要这样做。如果是,我怎么能只做一次?我不希望每个线程每次执行时都执行此初始化。是的,您需要在每次执行内核时执行此初始化(您不能通过内核参数传递本地数据),因此您可能需要为每个内核添加更多的工作。@Nick当然,您将始终有一些操作。但全局内存读取的成本非常高,比操作高出几个数量级。因此,一些移位和或操作比内存IO(2 re)快得多
// Repeated suggestion: same kernel signature with const-qualified
// inputs and the LUT in the __constant address space, so LUT lookups
// use the constant cache rather than uncoalesced global reads.
// (Kernel body elided in the answer.)
__kernel void mykernel(const __global unsigned char *inp1,
const __global unsigned char *inp2,
const __global unsigned char *inp3,
const __global unsigned char *inp4,
__global unsigned char *outp1,
__global unsigned char *outp2,
__global unsigned char *outp3,
__global unsigned char *outp4,
__constant unsigned char *lut,
uint size
)
{
...
}