GPU上的矩阵乘法优化：内存库（bank）冲突与延迟隐藏

标签：optimization、opencl、gpgpu、matrix-multiplication、flops

编辑:随着时间的推移所取得的成就列在这个问题的末尾(~1Tflops/s)

我用C语言编写了一个数学库（封装为C++ DLL），其中使用OpenCL在GPU上运行，并针对单精度方阵乘法做了优化（便于学习，也便于以后在神经网络程序中重用）。下面的内核代码把一维数组v1当作matrix1（1024x1024）的行，把一维数组v2当作matrix2（1024x1024，已做转置优化）的列，并把结果作为matrix3（1024x1024）的行写入一维数组v3。

目前,对于HD7870,1024x1024平方矩阵乘法的内核执行时间为3.6毫秒

已完成优化:

  • 第二个矩阵的转置(改进的时间)
  • 使用32x32子矩阵在本地内存中进行计算(4x 16x16,因为我的HD7870上的最大工作组大小为256,gpu不接受超过24kB的本地内存,但由于某些原因,在线来源称为64kB?)(无论如何,时间大大缩短了)
  • 在将结果写入本地和全局变量之前,增加数据与私有变量的重复使用。(缩短了时间)
  • 在最内层循环中访问本地2D数组的列主数组。(缩短时间)
  • 每个补丁共享两个累加器寄存器的加法。(时间缩短,数值稳定性降低)
  • 循环展开最里面的循环并没有改善时间(在第四次展开后甚至变得更糟)(因此必须放松整数alu)
问题:我无法完成一些优化,例如消除所有本地(lds)库冲突和指令重新排序以隐藏内存延迟如何改进此数学函数的性能?

这个内核显然受本地内存带宽（bank冲突）限制。一次乘法耗时约3.6毫秒：

(1024*1024*1024*(1次加法 + 1次乘法 = 2次浮点运算))/0.0036秒 = 每秒596x10^9次浮点运算（596 GFlops）。我在网上看到GTX680上一些CUDA的基准测试已经突破了1 TFlops——可能因为它每个计算单元有更多的本地内存，或者有更多的核心，或者两者兼有。

(1024*1024*1024*(2个浮点读取)*(每个浮点4字节)/0.0036秒)=每秒2386x10^9字节 但是这个内核读取8个浮点数并使用它们16次,每个浮点数重复使用2次数据

2386x10^9字节/重复使用(2)=1193 GB/s

HD7870的理论最大值为:

计算能力=每秒2560千兆浮点运算,LDS带宽=2560 GB/s,寄存器访问带宽=15360 GB/s

以下是内核:

// C = A * B for 1024x1024 single-precision matrices (question version).
// v1 = rows of A (row-major), v2 = B stored transposed (so one column of B
// is contiguous in memory), v3 = rows of C (row-major).
// Launch geometry implied by the indexing: 16x16 work-groups, each group
// computing one 32x32 tile of C as four 16x16 quadrants (suffixes a/b/c).
// Offsets in the index math: "+ 16" skips 16 columns, "+16384" (= 16*1024)
// skips 16 rows.
// NOTE(review): read_mem_fence() inside the kk-loop only orders loads
// within a SINGLE work-item; the barrier() calls around the tile loads are
// what actually synchronize the work-group.
__kernel void squareGpuMatrixMul(__global float * v1, __global float * v2, __global float * v3) 
{
    int localRow = get_local_id(0);   // 0..15 within the 16x16 group
    int localCol = get_local_id(1);   // 0..15 within the 16x16 group
    int selectRowFromA = get_group_id(0)*32;   // first A-row of this 32x32 tile
    int selectColFromB = get_group_id(1)*32;   // first B-column of this tile
    int lid= localCol*16+localRow;    // flat local id (unused in this version)
    // Lcache1* hold A sub-tiles, Lcache2* hold B sub-tiles,
    // Lcache3* accumulate the four 16x16 quadrants of the output tile.
    __local float Lcache1[ 16][ 16]; 
    __local float Lcache2[ 16][ 16]; 
    __local float Lcache3[ 16][ 16]; 

    __local float Lcache1a[ 16][ 16]; 
    __local float Lcache2a[ 16][ 16]; 
    __local float Lcache3a[ 16][ 16]; 

    __local float Lcache1b[ 16][ 16]; 
    __local float Lcache2b[ 16][ 16]; 
    __local float Lcache3b[ 16][ 16]; 

    __local float Lcache1c[ 16][ 16]; 
    __local float Lcache2c[ 16][ 16]; 
    __local float Lcache3c[ 16][ 16]; 

    // Private copies of tile elements, reused across 8 multiply-adds each.
    float tmp0=0.0f; 
    float tmp1=0.0f; 
    float tmp2=0.0f; 
    float tmp3=0.0f; 

    float tmp4=0.0f; 
    float tmp5=0.0f; 
    float tmp6=0.0f; 
    float tmp7=0.0f; 

    // Two accumulators per output quadrant (sumPatchX + sumPatchX2),
    // merged at the end of each i-iteration.
    float sumPatch=0.0f; 
    float sumPatcha=0.0f; 
    float sumPatchb=0.0f; 
    float sumPatchc=0.0f; 
    float sumPatch2=0.0f; 
    float sumPatcha2=0.0f; 
    float sumPatchb2=0.0f; 
    float sumPatchc2=0.0f; 

    // Zero the output accumulators; each work-item touches only its own
    // [localRow][localCol] element.
    barrier(CLK_LOCAL_MEM_FENCE); 
    Lcache3[localRow][localCol]=0.0f; 
    Lcache3a[localRow][localCol]=0.0f; 
    Lcache3b[localRow][localCol]=0.0f; 
    Lcache3c[localRow][localCol]=0.0f; 
    barrier(CLK_LOCAL_MEM_FENCE); 
    for(int i=0;i<1024;i+=32)  // this is A's row and B's column parsed by sub-matrices
    { 
        // Group-wide sync: previous tiles fully consumed before overwrite.
        barrier(CLK_LOCAL_MEM_FENCE); 
        // Load four 16x16 sub-tiles of A (transposed into Lcache1*) and
        // four of B; each work-item loads one element of each tile.
        Lcache1[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024];
        Lcache2[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024];
        Lcache1a[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+ 16];
        Lcache2a[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+ 16];
        Lcache1b[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+16384];
        Lcache2b[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+16384];
        Lcache1c[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+ 16+16384];
        Lcache2c[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+ 16+16384];
        // Group-wide sync: all tile stores visible before any reads below.
        barrier(CLK_LOCAL_MEM_FENCE); 
        sumPatch=0.0f; 
        sumPatcha=0.0f; 
        sumPatchb=0.0f; 
        sumPatchc=0.0f; 
        sumPatch2=0.0f; 
        sumPatcha2=0.0f; 
        sumPatchb2=0.0f; 
        sumPatchc2=0.0f; 
        for(int kk=0;kk< 16;kk++) //this is sub-matrix multiplication
        {   
            // NOTE(review): fences here order this work-item's own loads
            // only; they provide no cross-work-item synchronization.
            read_mem_fence(CLK_LOCAL_MEM_FENCE); 
            tmp0=Lcache1[kk][localRow];  // row-major
            tmp1=Lcache1a[kk][localRow]; // accesses
            tmp2=Lcache1b[kk][localRow]; //to local memory
            tmp3=Lcache1c[kk][localRow]; 
            tmp4=Lcache2[kk][localCol]; 
            tmp5=Lcache2a[kk][localCol]; 
            tmp6=Lcache2b[kk][localCol]; 
            tmp7=Lcache2c[kk][localCol]; 
            read_mem_fence(CLK_LOCAL_MEM_FENCE); 
            // 8 loads feed 8 multiply-adds: each tmp is reused twice.
            sumPatch+=tmp0*tmp4; 
            sumPatcha+=tmp0*tmp6; 
            sumPatchb+=tmp2*tmp4; 
            sumPatchc+=tmp2*tmp6; 
            sumPatch2+=tmp1*tmp5; 
            sumPatcha2+=tmp1*tmp7; 
            sumPatchb2+=tmp3*tmp5; 
            sumPatchc2+=tmp3*tmp7; 
        } 
        // Merge the paired accumulators into the per-quadrant LDS result.
        Lcache3[localRow][localCol]+=sumPatch+sumPatch2; 
        Lcache3a[localRow][localCol]+=sumPatcha+sumPatcha2; 
        Lcache3b[localRow][localCol]+=sumPatchb+sumPatchb2; 
        Lcache3c[localRow][localCol]+=sumPatchc+sumPatchc2; 
    } 
    barrier(CLK_LOCAL_MEM_FENCE); 
    // Write back: each work-item stores the four quadrant elements it owns.
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024]=Lcache3[localRow][localCol];                   
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+ 16]=Lcache3a[localRow][localCol];              
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+16384]=Lcache3b[localRow][localCol];     
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+ 16+16384]=Lcache3c[localRow][localCol];     
    barrier(CLK_LOCAL_MEM_FENCE); 
}
（上面的内核代码在原文中被机器翻译损坏并重复了一遍，此处不再赘述，完整代码见上。）

回答：为什么有这么多围栏（fence）？事实上，我认为你根本不需要它们。只有当一个work-item写入本地内存、而其他work-item要读取它时才需要同步；同一个work-item自己读写本地内存时并不需要。

顺便说一句,围栏比屏障要好得多。在屏障中,你会强制线程同步。在某些情况下,这会降低性能

我认为,通过更改内存访问模型,您可以重写代码以获得相当多的速度

如果这样做效果更好,您可以试试(我做了许多明显的优化,甚至不知道您的代码在做什么):

（回答中给出的修改版内核在原文中同样被机器翻译损坏，且在 for(int i=0;i<1024;i+=32) 处被截断；其完整、干净的版本见下文最终内核。截断处残留的"Compiler将已经"疑为后续关于编译器是否已自动优化的讨论片段。）
// Variant of the inner 16-step product loop: nc = (kk + lid) & 15 rotates
// the k index per work-item so the 256 work-items do not all address the
// same LDS row in the same cycle — an attempt to spread local-memory bank
// accesses (bank-conflict avoidance).  Each work-item still sums the same
// 16 terms, just starting at a different offset (float reordering may
// change low-order bits).
for(int kk=0;kk< 16;kk++) 
{   
    int nc=(kk+lid)&15;//different for all local threads
                       //but does not exceed 0-15 range
                       //summation order is not important
                       //0.+1.+...15. or 14.+15.+0.+..13.
                       //gives correct answer
    read_mem_fence(CLK_LOCAL_MEM_FENCE); 
    tmp0=Lcache1[nc][localRow]; 
    tmp1=Lcache1a[nc][localRow]; 
    tmp2=Lcache1b[nc][localRow]; 
    tmp3=Lcache1c[nc][localRow]; 
    tmp4=Lcache2[nc][localCol]; 
    tmp5=Lcache2a[nc][localCol]; 
    tmp6=Lcache2b[nc][localCol]; 
    tmp7=Lcache2c[nc][localCol]; 
    read_mem_fence(CLK_LOCAL_MEM_FENCE);
    sumPatch+=tmp0*tmp4;
    sumPatcha+=tmp0*tmp6;
    sumPatchb+=tmp2*tmp4;
    sumPatchc+=tmp2*tmp6;
    sumPatch2+=tmp1*tmp5;
    sumPatcha2+=tmp1*tmp7;
    sumPatchb2+=tmp3*tmp5;
    sumPatchc2+=tmp3*tmp7;
} 
// Second variant: same rotated-index (nc) loop, but the LDS loads and the
// dependent multiply-adds are manually interleaved, attempting to hide
// local-memory latency behind arithmetic.  Per the author's own comment
// below, moving the trailing fence did not change the measured time.
for(int kk=0;kk< 16;kk++) 
{   
    int nc=(kk+lid)&15;//different for all local threads
                       //but does not exceed 0-15 range
                       //summation order is not important
                       //0.+1.+...15. or 14.+15.+0.+..13.
                       //gives correct answer
    read_mem_fence(CLK_LOCAL_MEM_FENCE); 
    tmp0=Lcache1[nc][localRow]; 
    tmp4=Lcache2[nc][localCol];
    sumPatch+=tmp0*tmp4; 
    tmp6=Lcache2b[nc][localCol];
    sumPatcha+=tmp0*tmp6; 
    tmp1=Lcache1a[nc][localRow];
    tmp7=Lcache2c[nc][localCol]; 
    sumPatcha2+=tmp1*tmp7; 
    tmp5=Lcache2a[nc][localCol];
    sumPatch2+=tmp1*tmp5; 
    tmp2=Lcache1b[nc][localRow]; 
    sumPatchb+=tmp2*tmp4;
    sumPatchc+=tmp2*tmp6; 
    tmp3=Lcache1c[nc][localRow]; 
    sumPatchb2+=tmp3*tmp5;
    sumPatchc2+=tmp3*tmp7;  
    read_mem_fence(CLK_LOCAL_MEM_FENCE);//this lines' position does not change time 
}
// Final kernel: C = A * B for 1024x1024 single-precision matrices.
// v1 = rows of A (row-major), v2 = B stored transposed (one column of B is
// contiguous), v3 = rows of C.  Each 16x16 work-group computes a 32x32
// output tile as four 16x16 quadrants (suffixes a/b/c); "+ 16" in the
// index math skips 16 columns, "+16384" (= 16*1024) skips 16 rows.
//
// FIX(review): the previous revision synchronized the shared tiles with
// mem_fence(CLK_LOCAL_MEM_FENCE).  Per the OpenCL spec, mem_fence only
// orders memory operations within a SINGLE work-item; it does not make
// one work-item's LDS stores visible to the others.  Because every
// work-item reads tile elements stored by OTHER work-items, the two sync
// points inside the i-loop must be barrier(CLK_LOCAL_MEM_FENCE).
__kernel void squareGpuMatrixMul(__global float * v1, __global float * v2, __global float * v3) 
{
    int localRow = get_local_id(0);   // 0..15 within the 16x16 group
    int localCol = get_local_id(1);   // 0..15 within the 16x16 group
    int selectRowFromA = get_group_id(0)*32;   // first A-row of this tile
    int selectColFromB = get_group_id(1)*32;   // first B-column of this tile

    // Lcache1* hold A sub-tiles, Lcache2* hold B sub-tiles,
    // Lcache3* accumulate the four quadrants of the output tile.
    __local float Lcache1[ 16][ 16]; 
    __local float Lcache2[ 16][ 16]; 
    __local float Lcache3[ 16][ 16]; 

    __local float Lcache1a[ 16][ 16]; 
    __local float Lcache2a[ 16][ 16]; 
    __local float Lcache3a[ 16][ 16]; 

    __local float Lcache1b[ 16][ 16]; 
    __local float Lcache2b[ 16][ 16]; 
    __local float Lcache3b[ 16][ 16]; 

    __local float Lcache1c[ 16][ 16]; 
    __local float Lcache2c[ 16][ 16]; 
    __local float Lcache3c[ 16][ 16]; 

    // Private copies of tile elements; each is reused in two multiply-adds.
    float tmp0=0.0f; 
    float tmp1=0.0f; 
    float tmp2=0.0f; 
    float tmp3=0.0f; 

    float tmp4=0.0f; 
    float tmp5=0.0f; 
    float tmp6=0.0f; 
    float tmp7=0.0f; 

    // Two accumulators per quadrant, merged after each 32-wide step.
    float sumPatch=0.0f; 
    float sumPatcha=0.0f; 
    float sumPatchb=0.0f; 
    float sumPatchc=0.0f; 
    float sumPatch2=0.0f; 
    float sumPatcha2=0.0f; 
    float sumPatchb2=0.0f; 
    float sumPatchc2=0.0f; 

    // Each work-item zeroes only its own accumulator elements, which it
    // alone updates and reads back, so no sync is needed here.
    Lcache3[localRow][localCol]=0.0f; 
    Lcache3a[localRow][localCol]=0.0f; 
    Lcache3b[localRow][localCol]=0.0f; 
    Lcache3c[localRow][localCol]=0.0f; 
    for(int i=0;i<1024;i+=32)  // this is A's row and B's column parsed by sub-matrices
    { 
        // Wait until every work-item has finished reading the previous
        // tiles before they are overwritten below.
        barrier(CLK_LOCAL_MEM_FENCE); 
        Lcache1[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024];
        Lcache2[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024];
        Lcache1a[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+ 16];
        Lcache2a[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+ 16];
        Lcache1b[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+16384];
        Lcache2b[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+16384];
        Lcache1c[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+ 16+16384];
        Lcache2c[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+ 16+16384];
        // Make all tile stores visible to the whole group before any
        // work-item reads them (a mem_fence is NOT sufficient here).
        barrier(CLK_LOCAL_MEM_FENCE);  
        sumPatch=0.0f; 
        sumPatcha=0.0f; 
        sumPatchb=0.0f; 
        sumPatchc=0.0f; 
        sumPatch2=0.0f; 
        sumPatcha2=0.0f; 
        sumPatchb2=0.0f; 
        sumPatchc2=0.0f; 
        for(int kk=0;kk< 16;kk++) //this is sub-matrix multiplication
        {   
            tmp0=Lcache1[kk][localRow];  // row-major
            tmp1=Lcache1a[kk][localRow]; // accesses
            tmp2=Lcache1b[kk][localRow]; //to local memory
            tmp3=Lcache1c[kk][localRow]; 
            tmp4=Lcache2[kk][localCol]; 
            tmp5=Lcache2a[kk][localCol]; 
            tmp6=Lcache2b[kk][localCol]; 
            tmp7=Lcache2c[kk][localCol]; 
            // 8 loads feed 8 multiply-adds; each tmp is used twice.
            sumPatch+=tmp0*tmp4; 
            sumPatcha+=tmp0*tmp6; 
            sumPatchb+=tmp2*tmp4; 
            sumPatchc+=tmp2*tmp6; 
            sumPatch2+=tmp1*tmp5; 
            sumPatcha2+=tmp1*tmp7; 
            sumPatchb2+=tmp3*tmp5; 
            sumPatchc2+=tmp3*tmp7; 
        } 
        Lcache3[localRow][localCol]+=sumPatch+sumPatch2; 
        Lcache3a[localRow][localCol]+=sumPatcha+sumPatcha2; 
        Lcache3b[localRow][localCol]+=sumPatchb+sumPatchb2; 
        Lcache3c[localRow][localCol]+=sumPatchc+sumPatchc2; 
    } 
    // No synchronization needed: each work-item reads back only the
    // Lcache3 elements it wrote itself.
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024]=Lcache3[localRow][localCol];                   
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+ 16]=Lcache3a[localRow][localCol];              
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+16384]=Lcache3b[localRow][localCol];     
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+ 16+16384]=Lcache3c[localRow][localCol];     

}