矩阵乘法的性能无法超过理论峰值的50%
问题 我正在学习HPC和代码优化。我试图复制Goto的开创性矩阵乘法论文()中的结果。尽管我尽了最大努力,但我无法获得超过50%的理论CPU性能 背景 请参阅此处的相关问题(),包括有关我的硬件的信息 我所尝试的 这篇相关的论文()很好地描述了Goto的算法结构。我在下面提供了我的源代码 我的问题 我请求一般帮助。我在这方面工作的时间太长了,已经尝试了许多不同的算法,内联汇编,各种大小的内核(2x2,4x4,2x8,…,m和n大的mxn),但是我似乎无法打破50%的CPU Gflops。这纯粹是为了教育目的,而不是家庭作业 源代码 希望这是可以理解的。如果没有,请询问。我设置了宏结构(用于循环),如上面第二篇文章所述。我将这两篇文章中讨论的矩阵打包,如图11所示()。我的内部内核计算2x8块,因为这似乎是Nehalem体系结构的最佳计算(参见GotoBLAS源代码-内核)。内部内核基于计算秩1更新的概念,如下所述()Can';t矩阵乘法的最大理论性能不能超过50%,c,optimization,matrix,openmp,sse,C,Optimization,Matrix,Openmp,Sse,问题 我正在学习HPC和代码优化。我试图复制Goto的开创性矩阵乘法论文()中的结果。尽管我尽了最大努力,但我无法获得超过50%的理论CPU性能 背景 请参阅此处的相关问题(),包括有关我的硬件的信息 我所尝试的 这篇相关的论文()很好地描述了Goto的算法结构。我在下面提供了我的源代码 我的问题 我请求一般帮助。我在这方面工作的时间太长了,已经尝试了许多不同的算法,内联汇编,各种大小的内核(2x2,4x4,2x8,…,m和n大的mxn),但是我似乎无法打破50%的CPU Gflops。这纯粹是
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
//定义一些预取函数
#定义预取NTA(地址,nRofBytesHead)\
_预取(((字符*)(地址))+nRofBytesHead,_-mm_-HINT_-NTA)
#定义预取T0(地址,nRofBytesHead)\
_预取(((字符*)(地址))+nRofBytesHead,mm_HINT_T0)
#定义预取T1(地址,nRofBytesHead)\
_mm_预取(((char*)(addr))+nROFBytesHead,_mm_HINT_T1)
#定义预取T2(地址,nRofBytesHead)\
_预取(((字符*)(地址))+nRofBytesHead,mm_HINT_T2)
//定义一个min函数
#ifndef min
#定义最小值(a,b)((a)<(b))?(a):(b))
#恩迪夫
//矩阵归零
void zeromat(双*C,整数n)
{
int i=n;
而(我--){
int j=n;
而(j--){
*(C+i*n+j)=0.0;
}
}
}
//从(2 x kc)x(kc x 8)矩阵计算2x8块
内联空隙
__属性(gnu内联)
__属性(对齐(64))dgemm_2x8_sse(
int k,
常数双*限制a1,常数整数C_a,
常数双*限制b1,常数整数R_b,
双*限制c11,常数int r\u c
)
{
寄存器xmm128d xmm1,xmm4//
r8、r9、r10、r11、r12、r13、r14、r15;//蓄能器
//这里申报了10份登记册
r8=_-mm\u-xor\u-pd(r8,r8);//ab
r9=_-mm_-xor_-pd(r9,r9);
r10=_-mm\u-xor\u-pd(r10,r10);
r11=_-mm\u-xor\u-pd(r11,r11);
r12=_-mm\u-xor\u-pd(r12,r12);//ab+8
r13=_-mm_-xor_-pd(r13,r13);
r14=_-mm_-xor_-pd(r14,r14);
r15=_-mm_-xor_-pd(r15,r15);
//预取t2(b1,0);
//预取t2(b1,64);
//int l=k;
而(k--){
//预取0(a1,0);//从a1中提取64字节
//i=0
xmm1=_mm_load1_pd(a1);
xmm4=毫米负载(b1);
xmm4=_mm_mul_pd(xmm1,xmm4);
r8=_mm_add_pd(r8,xmm4);
xmm4=毫米负载(b1+2);
xmm4=_mm_mul_pd(xmm1,xmm4);
r9=_mm_add_pd(r9,xmm4);
xmm4=毫米负载(b1+4);
xmm4=_mm_mul_pd(xmm1,xmm4);
r10=_mm_add_pd(r10,xmm4);
xmm4=毫米负载(b1+6);
xmm4=_mm_mul_pd(xmm1,xmm4);
r11=_mm_add_pd(r11,xmm4);
//
//i=1
xmm1=_mm_load1_pd(a1+1);
xmm4=毫米负载(b1);
xmm4=_mm_mul_pd(xmm1,xmm4);
r12=_mm_add_pd(r12,xmm4);
xmm4=毫米负载(b1+2);
xmm4=_mm_mul_pd(xmm1,xmm4);
r13=_mm_add_pd(r13,xmm4);
xmm4=毫米负载(b1+4);
xmm4=_mm_mul_pd(xmm1,xmm4);
r14=_mm_add_pd(r14,xmm4);
xmm4=毫米负载(b1+6);
xmm4=_mm_mul_pd(xmm1,xmm4);
r15=_mm_add_pd(r15,xmm4);
a1+=cs_a;
b1+=rs_b;
//预取t2(b1,0);
//预取t2(b1,64);
}
//将结果复制到C中
预取t0(c11,0);
xmm1=_-mm_-load_-pd(c11);
xmm1=_mm_add_pd(xmm1,r8);
_mm_-store_-pd(c11,xmm1);
xmm1=_mm_负载_pd(c11+2);
xmm1=_mm_add_pd(xmm1,r9);
_mm_-store_-pd(c11+2,xmm1);
xmm1=_mm_负载_pd(c11+4);
xmm1=_mm_add_pd(xmm1,r10);
_mm_-store_-pd(c11+4,xmm1);
xmm1=_mm_载荷_pd(c11+6);
xmm1=_mm_add_pd(xmm1,r11);
_mm_-store_-pd(c11+6,xmm1);
c11+=rs_c;
预取t0(c11,0);
xmm1=_-mm_-load_-pd(c11);
xmm1=_mm_add_pd(xmm1,r12);
_mm_-store_-pd(c11,xmm1);
xmm1=_mm_负载_pd(c11+2);
xmm1=_mm_add_pd(xmm1,r13);
_mm_-store_-pd(c11+2,xmm1);
xmm1=_mm_负载_pd(c11+4);
xmm1=_mm_add_pd(xmm1,r14);
_mm_-store_-pd(c11+4,xmm1);
xmm1=_mm_载荷_pd(c11+6);
xmm1=_mm_add_pd(xmm1,r15);
_mm_-store_-pd(c11+6,xmm1);
}
//将矩阵打包成一行一行的条子
内联空隙
__属性(gnu内联)
__属性(对齐(64))rpack(双*限制dst,
const double*restrict src,
常数内部kc、常数内部mc、常数内部mr、常数内部n)
{
双tmp[mc*kc]uuuuuu属性(对齐(64));
双*限制ptr=&tmp[0];
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>
#include <math.h>
#include <omp.h>
#include <stdint.h>
// define some prefetch functions
#define PREFETCHNTA(addr,nrOfBytesAhead) \
_mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_NTA)
#define PREFETCHT0(addr,nrOfBytesAhead) \
_mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0)
#define PREFETCHT1(addr,nrOfBytesAhead) \
_mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T1)
#define PREFETCHT2(addr,nrOfBytesAhead) \
_mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T2)
// define a min function
#ifndef min
#define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
#endif
// zero a matrix
void zeromat(double *C, int n)
{
int i = n;
while (i--) {
int j = n;
while (j--) {
*(C + i*n + j) = 0.0;
}
}
}
// compute a 2x8 block from (2 x kc) x (kc x 8) matrices
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) dgemm_2x8_sse(
int k,
const double* restrict a1, const int cs_a,
const double* restrict b1, const int rs_b,
double* restrict c11, const int rs_c
)
{
register __m128d xmm1, xmm4, //
r8, r9, r10, r11, r12, r13, r14, r15; // accumulators
// 10 registers declared here
r8 = _mm_xor_pd(r8,r8); // ab
r9 = _mm_xor_pd(r9,r9);
r10 = _mm_xor_pd(r10,r10);
r11 = _mm_xor_pd(r11,r11);
r12 = _mm_xor_pd(r12,r12); // ab + 8
r13 = _mm_xor_pd(r13,r13);
r14 = _mm_xor_pd(r14,r14);
r15 = _mm_xor_pd(r15,r15);
// PREFETCHT2(b1,0);
// PREFETCHT2(b1,64);
//int l = k;
while (k--) {
//PREFETCHT0(a1,0); // fetch 64 bytes from a1
// i = 0
xmm1 = _mm_load1_pd(a1);
xmm4 = _mm_load_pd(b1);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r8 = _mm_add_pd(r8,xmm4);
xmm4 = _mm_load_pd(b1 + 2);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r9 = _mm_add_pd(r9,xmm4);
xmm4 = _mm_load_pd(b1 + 4);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r10 = _mm_add_pd(r10,xmm4);
xmm4 = _mm_load_pd(b1 + 6);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r11 = _mm_add_pd(r11,xmm4);
//
// i = 1
xmm1 = _mm_load1_pd(a1 + 1);
xmm4 = _mm_load_pd(b1);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r12 = _mm_add_pd(r12,xmm4);
xmm4 = _mm_load_pd(b1 + 2);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r13 = _mm_add_pd(r13,xmm4);
xmm4 = _mm_load_pd(b1 + 4);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r14 = _mm_add_pd(r14,xmm4);
xmm4 = _mm_load_pd(b1 + 6);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r15 = _mm_add_pd(r15,xmm4);
a1 += cs_a;
b1 += rs_b;
//PREFETCHT2(b1,0);
//PREFETCHT2(b1,64);
}
// copy result into C
PREFETCHT0(c11,0);
xmm1 = _mm_load_pd(c11);
xmm1 = _mm_add_pd(xmm1,r8);
_mm_store_pd(c11,xmm1);
xmm1 = _mm_load_pd(c11 + 2);
xmm1 = _mm_add_pd(xmm1,r9);
_mm_store_pd(c11 + 2,xmm1);
xmm1 = _mm_load_pd(c11 + 4);
xmm1 = _mm_add_pd(xmm1,r10);
_mm_store_pd(c11 + 4,xmm1);
xmm1 = _mm_load_pd(c11 + 6);
xmm1 = _mm_add_pd(xmm1,r11);
_mm_store_pd(c11 + 6,xmm1);
c11 += rs_c;
PREFETCHT0(c11,0);
xmm1 = _mm_load_pd(c11);
xmm1 = _mm_add_pd(xmm1,r12);
_mm_store_pd(c11,xmm1);
xmm1 = _mm_load_pd(c11 + 2);
xmm1 = _mm_add_pd(xmm1,r13);
_mm_store_pd(c11 + 2,xmm1);
xmm1 = _mm_load_pd(c11 + 4);
xmm1 = _mm_add_pd(xmm1,r14);
_mm_store_pd(c11 + 4,xmm1);
xmm1 = _mm_load_pd(c11 + 6);
xmm1 = _mm_add_pd(xmm1,r15);
_mm_store_pd(c11 + 6,xmm1);
}
// packs a matrix into rows of slivers
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) rpack( double* restrict dst,
const double* restrict src,
const int kc, const int mc, const int mr, const int n)
{
double tmp[mc*kc] __attribute__ ((aligned(64)));
double* restrict ptr = &tmp[0];
for (int i = 0; i < mc; ++i)
for (int j = 0; j < kc; ++j)
*ptr++ = *(src + i*n + j);
ptr = &tmp[0];
//const int inc_dst = mr*kc;
for (int k = 0; k < mc; k+=mr)
for (int j = 0; j < kc; ++j)
for (int i = 0; i < mr*kc; i+=kc)
*dst++ = *(ptr + k*kc + j + i);
}
// packs a matrix into columns of slivers
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) cpack(double* restrict dst,
const double* restrict src,
const int nc,
const int kc,
const int nr,
const int n)
{
double tmp[kc*nc] __attribute__ ((aligned(64)));
double* restrict ptr = &tmp[0];
for (int i = 0; i < kc; ++i)
for (int j = 0; j < nc; ++j)
*ptr++ = *(src + i*n + j);
ptr = &tmp[0];
// const int inc_k = nc/nr;
for (int k = 0; k < nc; k+=nr)
for (int j = 0; j < kc*nc; j+=nc)
for (int i = 0; i < nr; ++i)
*dst++ = *(ptr + k + i + j);
}
void blis_dgemm_ref(
const int n,
const double* restrict A,
const double* restrict B,
double* restrict C,
const int mc,
const int nc,
const int kc
)
{
int mr = 2;
int nr = 8;
double locA[mc*kc] __attribute__ ((aligned(64)));
double locB[kc*nc] __attribute__ ((aligned(64)));
int ii,jj,kk,i,j;
#pragma omp parallel num_threads(4) shared(A,B,C) private(ii,jj,kk,i,j,locA,locB)
{//use all threads in parallel
#pragma omp for
// partitions C and B into wide column panels
for ( jj = 0; jj < n; jj+=nc) {
// A and the current column of B are partitioned into col and row panels
for ( kk = 0; kk < n; kk+=kc) {
cpack(locB, B + kk*n + jj, nc, kc, nr, n);
// partition current panel of A into blocks
for ( ii = 0; ii < n; ii+=mc) {
rpack(locA, A + ii*n + kk, kc, mc, mr, n);
for ( i = 0; i < min(n-ii,mc); i+=mr) {
for ( j = 0; j < min(n-jj,nc); j+=nr) {
// inner kernel that compues 2 x 8 block
dgemm_2x8_sse( kc,
locA + i*kc , mr,
locB + j*kc , nr,
C + (i+ii)*n + (j+jj), n );
}
}
}
}
}
}
}
double compute_gflops(const double time, const int n)
{
// computes the gigaflops for a square matrix-matrix multiplication
double gflops;
gflops = (double) (2.0*n*n*n)/time/1.0e9;
return(gflops);
}
// ******* MAIN ********//
void main() {
clock_t time1, time2;
double time3;
double gflops;
const int trials = 10;
int nmax = 4096;
printf("%10s %10s\n","N","Gflops/s");
int mc = 128;
int kc = 256;
int nc = 128;
for (int n = kc; n <= nmax; n+=kc) { //assuming kc is the max dim
double *A = NULL;
double *B = NULL;
double *C = NULL;
A = _mm_malloc (n*n * sizeof(*A),64);
B = _mm_malloc (n*n * sizeof(*B),64);
C = _mm_malloc (n*n * sizeof(*C),64);
srand(time(NULL));
// Create the matrices
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
A[i*n + j] = (double) rand()/RAND_MAX;
B[i*n + j] = (double) rand()/RAND_MAX;
//D[j*n + i] = B[i*n + j]; // Transpose
C[i*n + j] = 0.0;
}
}
// warmup
zeromat(C,n);
blis_dgemm_ref(n,A,B,C,mc,nc,kc);
zeromat(C,n);
time2 = 0;
for (int count = 0; count < trials; count++){// iterations per experiment here
time1 = clock();
blis_dgemm_ref(n,A,B,C,mc,nc,kc);
time2 += clock() - time1;
zeromat(C,n);
}
time3 = (double)(time2)/CLOCKS_PER_SEC/trials;
gflops = compute_gflops(time3, n);
printf("%10d %10f\n",n,gflops);
_mm_free(A);
_mm_free(B);
_mm_free(C);
}
printf("tests are done\n");
}
// (stray duplicated line from the code above; not part of either program)
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <immintrin.h>
extern "C" void inner64(const float *a, const float *b, float *c);
void (*fp)(const float *a, const float *b, float *c) = inner64;
void reorder(float * __restrict a, float * __restrict b, int n, int bs) {
int nb = n/bs;
#pragma omp parallel for
for(int i=0; i<nb; i++) {
for(int j=0; j<nb; j++) {
for(int i2=0; i2<bs; i2++) {
for(int j2=0; j2<bs; j2++) {
b[bs*bs*(nb*i+j) + bs*i2+j2]= a[bs*(i*n+j) + i2*n + j2];
}
}
}
}
}
inline void gemm_block(float * __restrict a, float * __restrict b, float * __restrict c, int n, int n2) {
for(int i=0; i<n2; i++) {
fp(&a[i*n], b, &c[i*n]);
}
}
void gemm(float * __restrict a, float * __restrict b, float * __restrict c, int n, int bs) {
int nb = n/bs;
float *b2 = (float*)_mm_malloc(sizeof(float)*n*n,64);
reorder(b,b2,n,bs);
#pragma omp parallel for
for(int i=0; i<nb; i++) {
for(int j=0; j<nb; j++) {
for(int k=0; k<nb; k++) {
gemm_block(&a[bs*(i*n+k)],&b2[bs*bs*(k*nb+j)],&c[bs*(i*n+j)], n, bs);
}
}
}
_mm_free(b2);
}
int main() {
float peak = 1.0f*8*4*2*3.69f;
const int n = 4096;
float flop = 2.0f*n*n*n*1E-9f;
omp_set_num_threads(4);
float *a = (float*)_mm_malloc(sizeof(float)*n*n,64);
float *b = (float*)_mm_malloc(sizeof(float)*n*n,64);
float *c = (float*)_mm_malloc(sizeof(float)*n*n,64);
for(int i=0; i<n*n; i++) {
a[i] = 1.0f*rand()/RAND_MAX;
b[i] = 1.0f*rand()/RAND_MAX;
}
gemm(a,b,c,n,64); //warm OpenMP up
while(1) {
for(int i=0; i<n*n; i++) c[i] = 0;
double dtime = omp_get_wtime();
gemm(a,b,c,n,64);
dtime = omp_get_wtime() - dtime;
printf("time %.2f s, efficiency %.2f%%\n", dtime, 100*flop/dtime/peak);
}
}