优化:GPU 上的矩阵乘法——存储体冲突(bank conflict)与延迟隐藏
编辑:随着时间推移所取得的成果列在本问题的末尾(约 1 TFLOPS)。 我正在用 OpenCL(GPU)通过 C++(DLL)编写一个 C 语言的数学库,并已对单精度方阵乘法做了一些优化(既为了学习,也为了以后可能在神经网络程序中复用)。下面的内核代码把一维数组 v1 当作 matrix1(1024x1024)的行,把一维数组 v2 当作 matrix2(1024x1024)的列(转置优化),并把结果按 matrix3(1024x1024)的行写入一维数组 v3。目前在 HD7870 上,1024x1024 方阵乘法的内核执行时间为 3.6 毫秒。 标签:optimization, opencl, gpgpu, matrix-multiplication, flops。 已完成的优化:
- 对第二个矩阵做转置(时间缩短)
- 使用 32x32 子矩阵在本地内存中进行计算(拆成 4 个 16x16,因为我的 HD7870 上最大工作组大小为 256,而且 GPU 不接受超过 24kB 的本地内存,但不知为何网上的资料说是 64kB?)(无论如何,时间大大缩短了)
- 在把结果写入本地和全局变量之前,增加私有变量中数据的重复使用(时间缩短)
- 在最内层循环中按列主序访问本地二维数组(时间缩短)
- 每个子块(patch)的累加分摊到两个累加寄存器上(时间缩短,数值稳定性降低)
- 展开最内层循环并没有缩短时间(展开到第四次之后甚至变得更糟)(所以整数 ALU 想必并不繁忙)
/*
 * squareGpuMatrixMul - single-precision C = A * B for 1024x1024 matrices.
 *
 * v1  A, row-major, row stride 1024.
 * v2  B stored transposed: row c of v2 is dotted with row r of v1.
 * v3  C, row-major, row stride 1024.
 *
 * Each work-group produces one 32x32 tile of C and each work-item produces
 * four elements of it, one per 16x16 quadrant. The tile buffers are filled
 * and indexed directly by the local ids, so the kernel only works with a
 * 16x16 local size. The matrix size (1024) is hard-coded.
 */
__kernel void squareGpuMatrixMul(__global float * v1, __global float * v2, __global float * v3)
{
    int localRow = get_local_id(0);
    int localCol = get_local_id(1);
    int selectRowFromA = get_group_id(0)*32;   /* first row of v1 / v3 handled by this group */
    int selectColFromB = get_group_id(1)*32;   /* first row of v2 (column of v3) handled by this group */

    /*
     * 32x32 slices of v1 and v2, split into four 16x16 tiles each:
     *   (none) rows 0..15,  k 0..15      a: rows 0..15,  k 16..31
     *   b:     rows 16..31, k 0..15      c: rows 16..31, k 16..31
     * Both are stored as [k][row] so the inner loop reads [kk][local id].
     */
    __local float Lcache1[16][16];
    __local float Lcache2[16][16];
    __local float Lcache1a[16][16];
    __local float Lcache2a[16][16];
    __local float Lcache1b[16][16];
    __local float Lcache2b[16][16];
    __local float Lcache1c[16][16];
    __local float Lcache2c[16][16];

    /*
     * Running results for the four output elements of this work-item.
     * They are private: no other work-item ever reads or writes them, so
     * they need neither __local storage nor synchronization.
     */
    float acc  = 0.0f;   /* C[row     ][col     ] */
    float acca = 0.0f;   /* C[row     ][col + 16] */
    float accb = 0.0f;   /* C[row + 16][col     ] */
    float accc = 0.0f;   /* C[row + 16][col + 16] */

    for (int i = 0; i < 1024; i += 32)   /* walk the shared k dimension in slices of 32 */
    {
        /* Everyone must be done reading the previous slice before it is overwritten. */
        barrier(CLK_LOCAL_MEM_FENCE);

        Lcache1[localCol][localRow]  = v1[selectRowFromA*1024 + i + localCol + localRow*1024];
        Lcache2[localRow][localCol]  = v2[selectColFromB*1024 + i + localRow + localCol*1024];
        Lcache1a[localCol][localRow] = v1[selectRowFromA*1024 + i + localCol + localRow*1024 + 16];
        Lcache2a[localRow][localCol] = v2[selectColFromB*1024 + i + localRow + localCol*1024 + 16];
        Lcache1b[localCol][localRow] = v1[selectRowFromA*1024 + i + localCol + localRow*1024 + 16*1024];
        Lcache2b[localRow][localCol] = v2[selectColFromB*1024 + i + localRow + localCol*1024 + 16*1024];
        Lcache1c[localCol][localRow] = v1[selectRowFromA*1024 + i + localCol + localRow*1024 + 16 + 16*1024];
        Lcache2c[localRow][localCol] = v2[selectColFromB*1024 + i + localRow + localCol*1024 + 16 + 16*1024];

        /* The tiles are written by one work-item and read by others: wait for all stores. */
        barrier(CLK_LOCAL_MEM_FENCE);

        /* Two partial sums per output element: one for k 0..15, one for k 16..31. */
        float sumPatch  = 0.0f, sumPatch2  = 0.0f;
        float sumPatcha = 0.0f, sumPatcha2 = 0.0f;
        float sumPatchb = 0.0f, sumPatchb2 = 0.0f;
        float sumPatchc = 0.0f, sumPatchc2 = 0.0f;

        for (int kk = 0; kk < 16; kk++)
        {
            /* Eight loads feed eight multiply-adds. */
            float tmp0 = Lcache1[kk][localRow];
            float tmp1 = Lcache1a[kk][localRow];
            float tmp2 = Lcache1b[kk][localRow];
            float tmp3 = Lcache1c[kk][localRow];
            float tmp4 = Lcache2[kk][localCol];
            float tmp5 = Lcache2a[kk][localCol];
            float tmp6 = Lcache2b[kk][localCol];
            float tmp7 = Lcache2c[kk][localCol];

            sumPatch   += tmp0*tmp4;
            sumPatcha  += tmp0*tmp6;
            sumPatchb  += tmp2*tmp4;
            sumPatchc  += tmp2*tmp6;
            sumPatch2  += tmp1*tmp5;
            sumPatcha2 += tmp1*tmp7;
            sumPatchb2 += tmp3*tmp5;
            sumPatchc2 += tmp3*tmp7;
        }

        acc  += sumPatch  + sumPatch2;
        acca += sumPatcha + sumPatcha2;
        accb += sumPatchb + sumPatchb2;
        accc += sumPatchc + sumPatchc2;
    }

    v3[selectRowFromA*1024 + selectColFromB + localCol + localRow*1024]                = acc;
    v3[selectRowFromA*1024 + selectColFromB + localCol + localRow*1024 + 16]           = acca;
    v3[selectRowFromA*1024 + selectColFromB + localCol + localRow*1024 + 16*1024]      = accb;
    v3[selectRowFromA*1024 + selectColFromB + localCol + localRow*1024 + 16 + 16*1024] = accc;
}
__kernel void squareGpuMatrixMul(__global float * v1, __global float * v2, __global float * v3)
{
int localRow = get_local_id(0);
int localCol = get_local_id(1);
int selectRowFromA = get_group_id(0)*32;
int selectColFromB = get_group_id(1)*32;
int lid= localCol*16+localRow;
__local float Lcache1[ 16][ 16];
__local float Lcache2[ 16][ 16];
__local float Lcache3[ 16][ 16];
__local float Lcache1a[ 16][ 16];
__local float Lcache2a[ 16][ 16];
__local float Lcache3a[ 16][ 16];
__local float Lcache1b[ 16][ 16];
__local float Lcache2b[ 16][ 16];
__local float Lcache3b[ 16][ 16];
__local float Lcache1c[ 16][ 16];
__local float Lcache2c[ 16][ 16];
__local float Lcache3c[ 16][ 16];
float tmp0=0.0f;
float tmp1=0.0f;
float tmp2=0.0f;
float tmp3=0.0f;
float tmp4=0.0f;
float tmp5=0.0f;
float tmp6=0.0f;
float tmp7=0.0f;
float sumPatch=0.0f;
float sumPatcha=0.0f;
float sumPatchb=0.0f;
float sumPatchc=0.0f;
float sumPatch2=0.0f;
float sumPatcha2=0.0f;
float sumPatchb2=0.0f;
float sumPatchc2=0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
Lcache3[localRow][localCol]=0.0f;
Lcache3a[localRow][localCol]=0.0f;
Lcache3b[localRow][localCol]=0.0f;
Lcache3c[localRow][localCol]=0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i=0;i……(此处代码在提取时被截断) 为什么要用这么多栅栏(fence)?事实上,我认为你根本不需要它们。只有当一个线程写入本地内存、而其他线程随后要读取它时,才需要一个栅栏;同一个线程自己读写本地内存时并不需要。
顺便说一句,栅栏比屏障(barrier)要好得多:屏障会强制各线程同步,在某些情况下会降低性能。
我认为,通过改变内存访问方式,你可以重写代码来获得可观的加速。
你可以试试下面的写法是否更好(我做了许多显而易见的优化,甚至没有弄清楚你的代码在做什么):
__kernel void squareGpuMatrixMul(__global float * v1, __global float * v2, __global float * v3)
{
int localRow = get_local_id(0);
int localCol = get_local_id(1);
int selectRowFromA = get_group_id(0)*32;
int selectColFromB = get_group_id(1)*32;
int lid= localCol*16+localRow;
__local float Lcache1[ 16][ 16];
__local float Lcache2[ 16][ 16];
__local float Lcache3[ 16][ 16];
__local float Lcache1a[ 16][ 16];
__local float Lcache2a[ 16][ 16];
__local float Lcache3a[ 16][ 16];
__local float Lcache1b[ 16][ 16];
__local float Lcache2b[ 16][ 16];
__local float Lcache3b[ 16][ 16];
__local float Lcache1c[ 16][ 16];
__local float Lcache2c[ 16][ 16];
__local float Lcache3c[ 16][ 16];
float tmp0=0.0f;
float tmp1=0.0f;
float tmp2=0.0f;
float tmp3=0.0f;
float tmp4=0.0f;
float tmp5=0.0f;
float tmp6=0.0f;
float tmp7=0.0f;
float sumPatch=0.0f;
float sumPatcha=0.0f;
float sumPatchb=0.0f;
float sumPatchc=0.0f;
float sumPatch2=0.0f;
float sumPatcha2=0.0f;
float sumPatchb2=0.0f;
float sumPatchc2=0.0f;
Lcache3[localRow][localCol]=0.0f;
Lcache3a[localRow][localCol]=0.0f;
Lcache3b[localRow][localCol]=0.0f;
Lcache3c[localRow][localCol]=0.0f;
for(int i=0;i……(此处代码在提取时被截断) 编译器已经会……(原句在提取时不完整)
// Variant of the 16-step sub-tile multiply that visits k in a rotated order:
// each work-item starts at offset (lid & 15) and wraps around, so every k in
// 0..15 is still used exactly once per work-item.
for(int kk=0;kk< 16;kk++)
{
int nc=(kk+lid)&15;// rotated k index, always within 0..15
// NOTE(review): lid is declared outside this fragment. If it is
// localCol*16+localRow as in the kernels in this file, nc depends only on
// localRow, so it is NOT distinct for every work-item -- confirm.
// The sum covers the same 16 terms in a different order; mathematically
// equal, but float rounding may differ in the last bits.
read_mem_fence(CLK_LOCAL_MEM_FENCE); // orders this work-item's own loads only
tmp0=Lcache1[nc][localRow];
tmp1=Lcache1a[nc][localRow];
tmp2=Lcache1b[nc][localRow];
tmp3=Lcache1c[nc][localRow];
tmp4=Lcache2[nc][localCol];
tmp5=Lcache2a[nc][localCol];
tmp6=Lcache2b[nc][localCol];
tmp7=Lcache2c[nc][localCol];
read_mem_fence(CLK_LOCAL_MEM_FENCE); // all eight loads are issued before the multiply-adds below
sumPatch+=tmp0*tmp4;
sumPatcha+=tmp0*tmp6;
sumPatchb+=tmp2*tmp4;
sumPatchc+=tmp2*tmp6;
sumPatch2+=tmp1*tmp5;
sumPatcha2+=tmp1*tmp7;
sumPatchb2+=tmp3*tmp5;
sumPatchc2+=tmp3*tmp7;
}
// Same rotated-index sub-tile multiply, but each multiply-add is placed right
// after the loads it depends on instead of after all eight loads. The eight
// products are the same as in the load-everything-first form.
for(int kk=0;kk< 16;kk++)
{
int nc=(kk+lid)&15;// rotated k index, always within 0..15
// NOTE(review): lid is declared outside this fragment. If it is
// localCol*16+localRow as in the kernels in this file, nc depends only on
// localRow, so it is NOT distinct for every work-item -- confirm.
// The sum covers the same 16 terms in a different order; mathematically
// equal, but float rounding may differ in the last bits.
read_mem_fence(CLK_LOCAL_MEM_FENCE); // orders this work-item's own loads only
tmp0=Lcache1[nc][localRow];
tmp4=Lcache2[nc][localCol];
sumPatch+=tmp0*tmp4;
tmp6=Lcache2b[nc][localCol];
sumPatcha+=tmp0*tmp6;
tmp1=Lcache1a[nc][localRow];
tmp7=Lcache2c[nc][localCol];
sumPatcha2+=tmp1*tmp7;
tmp5=Lcache2a[nc][localCol];
sumPatch2+=tmp1*tmp5;
tmp2=Lcache1b[nc][localRow];
sumPatchb+=tmp2*tmp4; // tmp4 and tmp6 are reused from the loads above
sumPatchc+=tmp2*tmp6;
tmp3=Lcache1c[nc][localRow];
sumPatchb2+=tmp3*tmp5;
sumPatchc2+=tmp3*tmp7;
read_mem_fence(CLK_LOCAL_MEM_FENCE);// author's observation: moving this fence did not change the run time
}
/*
 * squareGpuMatrixMul - single-precision C = A * B for 1024x1024 matrices.
 *
 * v1  A, row-major, row stride 1024.
 * v2  B stored transposed: row c of v2 is dotted with row r of v1.
 * v3  C, row-major, row stride 1024.
 *
 * Each work-group produces one 32x32 tile of C; each work-item produces four
 * elements of it, one per 16x16 quadrant. The tile buffers are filled and
 * indexed directly by the local ids, so the kernel only works with a 16x16
 * local size. The matrix size (1024) is hard-coded.
 *
 * Synchronization: the __local tiles are written by one work-item and read
 * by others, so work-group barriers are required around the loads. A
 * mem_fence() is not enough: it only orders the calling work-item's own
 * memory accesses and does not wait for any other work-item.
 */
__kernel void squareGpuMatrixMul(__global float * v1, __global float * v2, __global float * v3)
{
    int localRow = get_local_id(0);
    int localCol = get_local_id(1);
    int selectRowFromA = get_group_id(0)*32;   /* first row of v1 / v3 handled by this group */
    int selectColFromB = get_group_id(1)*32;   /* first row of v2 (column of v3) handled by this group */

    /*
     * 32x32 slices of v1 and v2, split into four 16x16 tiles each:
     *   (none) rows 0..15,  k 0..15      a: rows 0..15,  k 16..31
     *   b:     rows 16..31, k 0..15      c: rows 16..31, k 16..31
     * Both are stored as [k][row] so the inner loop reads [kk][local id].
     */
    __local float Lcache1[16][16];
    __local float Lcache2[16][16];
    __local float Lcache1a[16][16];
    __local float Lcache2a[16][16];
    __local float Lcache1b[16][16];
    __local float Lcache2b[16][16];
    __local float Lcache1c[16][16];
    __local float Lcache2c[16][16];

    /*
     * Running results for this work-item's four output elements. Only this
     * work-item touches them, so they live in private variables.
     */
    float total   = 0.0f;   /* C[row     ][col     ] */
    float total_a = 0.0f;   /* C[row     ][col + 16] */
    float total_b = 0.0f;   /* C[row + 16][col     ] */
    float total_c = 0.0f;   /* C[row + 16][col + 16] */

    for (int i = 0; i < 1024; i += 32)   /* walk the shared k dimension in slices of 32 */
    {
        /*
         * Work-items in other wavefronts may still be reading the previous
         * slice; do not overwrite the tiles until all of them are done.
         */
        barrier(CLK_LOCAL_MEM_FENCE);

        Lcache1[localCol][localRow]  = v1[selectRowFromA*1024 + i + localCol + localRow*1024];
        Lcache2[localRow][localCol]  = v2[selectColFromB*1024 + i + localRow + localCol*1024];
        Lcache1a[localCol][localRow] = v1[selectRowFromA*1024 + i + localCol + localRow*1024 + 16];
        Lcache2a[localRow][localCol] = v2[selectColFromB*1024 + i + localRow + localCol*1024 + 16];
        Lcache1b[localCol][localRow] = v1[selectRowFromA*1024 + i + localCol + localRow*1024 + 16*1024];
        Lcache2b[localRow][localCol] = v2[selectColFromB*1024 + i + localRow + localCol*1024 + 16*1024];
        Lcache1c[localCol][localRow] = v1[selectRowFromA*1024 + i + localCol + localRow*1024 + 16 + 16*1024];
        Lcache2c[localRow][localCol] = v2[selectColFromB*1024 + i + localRow + localCol*1024 + 16 + 16*1024];

        /* Wait until every work-item's stores to the tiles are visible. */
        barrier(CLK_LOCAL_MEM_FENCE);

        /* Two partial sums per output element: one for k 0..15, one for k 16..31. */
        float sumPatch  = 0.0f, sumPatch2  = 0.0f;
        float sumPatcha = 0.0f, sumPatcha2 = 0.0f;
        float sumPatchb = 0.0f, sumPatchb2 = 0.0f;
        float sumPatchc = 0.0f, sumPatchc2 = 0.0f;

        for (int kk = 0; kk < 16; kk++)
        {
            float tmp0 = Lcache1[kk][localRow];
            float tmp1 = Lcache1a[kk][localRow];
            float tmp2 = Lcache1b[kk][localRow];
            float tmp3 = Lcache1c[kk][localRow];
            float tmp4 = Lcache2[kk][localCol];
            float tmp5 = Lcache2a[kk][localCol];
            float tmp6 = Lcache2b[kk][localCol];
            float tmp7 = Lcache2c[kk][localCol];

            sumPatch   += tmp0*tmp4;
            sumPatcha  += tmp0*tmp6;
            sumPatchb  += tmp2*tmp4;
            sumPatchc  += tmp2*tmp6;
            sumPatch2  += tmp1*tmp5;
            sumPatcha2 += tmp1*tmp7;
            sumPatchb2 += tmp3*tmp5;
            sumPatchc2 += tmp3*tmp7;
        }

        total   += sumPatch  + sumPatch2;
        total_a += sumPatcha + sumPatcha2;
        total_b += sumPatchb + sumPatchb2;
        total_c += sumPatchc + sumPatchc2;
    }

    v3[selectRowFromA*1024 + selectColFromB + localCol + localRow*1024]                = total;
    v3[selectRowFromA*1024 + selectColFromB + localCol + localRow*1024 + 16]           = total_a;
    v3[selectRowFromA*1024 + selectColFromB + localCol + localRow*1024 + 16*1024]      = total_b;
    v3[selectRowFromA*1024 + selectColFromB + localCol + localRow*1024 + 16 + 16*1024] = total_c;
}