C++ 为什么朴素(naïve)的C++矩阵乘法比BLAS慢100倍?

C++ 为什么朴素(naïve)的C++矩阵乘法比BLAS慢100倍?,c++,linux,matlab,c++11,matrix-multiplication,C++,Linux,Matlab,C++11,Matrix Multiplication,我正在研究大型矩阵乘法,并运行以下实验以形成基线测试: 从标准正态分布(均值为0,标准差为1)中随机生成两个4096x4096矩阵X、Y Z=X*Y 求Z所有元素的和(以确保它们都被访问)并输出 下面是朴素的C++实现: #include <iostream> #include <algorithm> using namespace std; int main() { constexpr size_t dim = 4096; float* x = new fl

我正在研究大型矩阵乘法,并运行以下实验以形成基线测试:

  • 从标准正态分布(均值为0,标准差为1)中随机生成两个4096x4096矩阵X、Y
  • Z=X*Y
  • 求Z所有元素的和(以确保它们都被访问)并输出
  • 下面是朴素的C++实现:

    #include <algorithm>
    #include <iostream>
    #include <random>
    #include <vector>
    
    using namespace std;
    
    /// Baseline benchmark: fills two dim x dim matrices with N(0, 1) samples,
    /// multiplies them with the textbook triple loop and prints the sum of all
    /// elements of the product (so every element of the result is read).
    int main()
    {
        constexpr std::size_t dim = 4096;
        constexpr std::size_t count = dim * dim;

        // Row-major storage owned by std::vector. The previous version paired
        // `new float[...]` with scalar `delete`, which is undefined behavior;
        // the vectors release their memory automatically.
        std::vector<float> x(count);
        std::vector<float> y(count);
        std::vector<float> z(count);

        std::random_device rd;
        std::mt19937 gen(rd());
        std::normal_distribution<float> dist(0, 1);

        // Draws are interleaved (x[i], then y[i]) from one shared generator.
        for (std::size_t i = 0; i < count; i++)
        {
            x[i] = dist(gen);
            y[i] = dist(gen);
        }

        // z = x * y. The inner loop walks y with a stride of dim elements
        // (y[k*dim + col]), i.e. down a column of a row-major matrix.
        for (std::size_t row = 0; row < dim; row++)
            for (std::size_t col = 0; col < dim; col++)
            {
                float acc = 0;

                for (std::size_t k = 0; k < dim; k++)
                    acc += x[row*dim + k] * y[k*dim + col];

                z[row*dim + col] = acc;
            }

        // Sum every element of the product, in storage order.
        float t = 0;

        for (const float value : z)
            t += value;

        std::cout << t << '\n';
    }
    
    以下是Octave/Matlab实现:

    X = stdnormal_rnd(4096, 4096);
    Y = stdnormal_rnd(4096, 4096);
    Z = X*Y;
    sum(sum(Z))
    
    运行:

    Octave版本耗时6秒:

    real    0m5.985s
    user    0m10.881s
    sys         0m0.144s
    
    我知道BLAS已经被优化到了极致,而朴素算法完全忽略了缓存等因素,但说真的——相差90倍?

    有人能解释这种差异吗?BLAS实现的架构到底是什么?我看到它在使用Fortran,但是CPU级别上发生了什么?它使用什么算法?它是如何使用CPU缓存的?它调用什么x86-64机器指令?(是否使用了AVX等高级CPU功能?)

    对C++算法做哪些关键优化,可以使其与BLAS版本相媲美?

    我在gdb下运行了Octave,并在计算到一半时中断了几次。它启动了第二个线程,下面是堆栈(所有中断点看起来都很相似):

    它正如预期的那样在调用BLAS的gemm。

    第一个线程似乎在等待加入(join)第二个线程,因此我不确定观察到的200%的CPU使用率是否就来自这两个线程。

    ATL_dgemm所在的libblas.so.3是哪个库,它的代码在哪里?

    $ ls -al /usr/lib/libblas.so.3
    /usr/lib/libblas.so.3 -> /etc/alternatives/libblas.so.3
    
    $ ls -al /etc/alternatives/libblas.so.3
    /etc/alternatives/libblas.so.3 -> /usr/lib/atlas-base/atlas/libblas.so.3
    
    $ ls -al /usr/lib/atlas-base/atlas/libblas.so.3
    /usr/lib/atlas-base/atlas/libblas.so.3 -> libblas.so.3.0
    
    $ ls -al /usr/lib/atlas-base/atlas/libblas.so.3.0
    /usr/lib/atlas-base/atlas/libblas.so.3.0
    
    $ dpkg -S /usr/lib/atlas-base/atlas/libblas.so.3.0
    libatlas3-base: /usr/lib/atlas-base/atlas/libblas.so.3.0
    
    $ apt-get source libatlas3-base
    
    这是ATLAS 3.8.4

    以下是我后来实施的优化:

    使用分块(tiling)方法,将X、Y和Z的64x64块预加载到单独的数组中

    更改每个块的计算,使内部循环如下所示:

    for (size_t tcol = 0; tcol < block_width; tcol++)
        bufz[trow][tcol] += B * bufy[tk][tcol];
    
    所有操作都在calc_block函数中—90%以上的时间都花在了该函数中

    新的时间是:

    real    0m17.370s
    user    0m17.213s
    sys 0m0.092s
    
    更接近

    calc_block函数的反汇编如下所示:

    0000000000401460 <_Z10calc_blockv>:
      401460:   b8 e0 21 60 00          mov    $0x6021e0,%eax
      401465:   41 b8 e0 23 61 00       mov    $0x6123e0,%r8d
      40146b:   31 ff                   xor    %edi,%edi
      40146d:   49 29 c0                sub    %rax,%r8
      401470:   49 8d 34 00             lea    (%r8,%rax,1),%rsi
      401474:   48 89 f9                mov    %rdi,%rcx
      401477:   ba e0 a1 60 00          mov    $0x60a1e0,%edx
      40147c:   48 c1 e1 09             shl    $0x9,%rcx
      401480:   48 81 c1 e0 21 61 00    add    $0x6121e0,%rcx
      401487:   66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
      40148e:   00 00 
      401490:   c4 e2 7d 19 01          vbroadcastsd (%rcx),%ymm0
      401495:   48 83 c1 08             add    $0x8,%rcx
      401499:   c5 fd 59 0a             vmulpd (%rdx),%ymm0,%ymm1
      40149d:   c5 f5 58 08             vaddpd (%rax),%ymm1,%ymm1
      4014a1:   c5 fd 29 08             vmovapd %ymm1,(%rax)
      4014a5:   c5 fd 59 4a 20          vmulpd 0x20(%rdx),%ymm0,%ymm1
      4014aa:   c5 f5 58 48 20          vaddpd 0x20(%rax),%ymm1,%ymm1
      4014af:   c5 fd 29 48 20          vmovapd %ymm1,0x20(%rax)
      4014b4:   c5 fd 59 4a 40          vmulpd 0x40(%rdx),%ymm0,%ymm1
      4014b9:   c5 f5 58 48 40          vaddpd 0x40(%rax),%ymm1,%ymm1
      4014be:   c5 fd 29 48 40          vmovapd %ymm1,0x40(%rax)
      4014c3:   c5 fd 59 4a 60          vmulpd 0x60(%rdx),%ymm0,%ymm1
      4014c8:   c5 f5 58 48 60          vaddpd 0x60(%rax),%ymm1,%ymm1
      4014cd:   c5 fd 29 48 60          vmovapd %ymm1,0x60(%rax)
      4014d2:   c5 fd 59 8a 80 00 00    vmulpd 0x80(%rdx),%ymm0,%ymm1
      4014d9:   00 
      4014da:   c5 f5 58 88 80 00 00    vaddpd 0x80(%rax),%ymm1,%ymm1
      4014e1:   00 
      4014e2:   c5 fd 29 88 80 00 00    vmovapd %ymm1,0x80(%rax)
      4014e9:   00 
      4014ea:   c5 fd 59 8a a0 00 00    vmulpd 0xa0(%rdx),%ymm0,%ymm1
      4014f1:   00 
      4014f2:   c5 f5 58 88 a0 00 00    vaddpd 0xa0(%rax),%ymm1,%ymm1
      4014f9:   00 
      4014fa:   c5 fd 29 88 a0 00 00    vmovapd %ymm1,0xa0(%rax)
      401501:   00 
      401502:   c5 fd 59 8a c0 00 00    vmulpd 0xc0(%rdx),%ymm0,%ymm1
      401509:   00 
      40150a:   c5 f5 58 88 c0 00 00    vaddpd 0xc0(%rax),%ymm1,%ymm1
      401511:   00 
      401512:   c5 fd 29 88 c0 00 00    vmovapd %ymm1,0xc0(%rax)
      401519:   00 
      40151a:   c5 fd 59 8a e0 00 00    vmulpd 0xe0(%rdx),%ymm0,%ymm1
      401521:   00 
      401522:   c5 f5 58 88 e0 00 00    vaddpd 0xe0(%rax),%ymm1,%ymm1
      401529:   00 
      40152a:   c5 fd 29 88 e0 00 00    vmovapd %ymm1,0xe0(%rax)
      401531:   00 
      401532:   c5 fd 59 8a 00 01 00    vmulpd 0x100(%rdx),%ymm0,%ymm1
      401539:   00 
      40153a:   c5 f5 58 88 00 01 00    vaddpd 0x100(%rax),%ymm1,%ymm1
      401541:   00 
      401542:   c5 fd 29 88 00 01 00    vmovapd %ymm1,0x100(%rax)
      401549:   00 
      40154a:   c5 fd 59 8a 20 01 00    vmulpd 0x120(%rdx),%ymm0,%ymm1
      401551:   00 
      401552:   c5 f5 58 88 20 01 00    vaddpd 0x120(%rax),%ymm1,%ymm1
      401559:   00 
      40155a:   c5 fd 29 88 20 01 00    vmovapd %ymm1,0x120(%rax)
      401561:   00 
      401562:   c5 fd 59 8a 40 01 00    vmulpd 0x140(%rdx),%ymm0,%ymm1
      401569:   00 
      40156a:   c5 f5 58 88 40 01 00    vaddpd 0x140(%rax),%ymm1,%ymm1
      401571:   00 
      401572:   c5 fd 29 88 40 01 00    vmovapd %ymm1,0x140(%rax)
      401579:   00 
      40157a:   c5 fd 59 8a 60 01 00    vmulpd 0x160(%rdx),%ymm0,%ymm1
      401581:   00 
      401582:   c5 f5 58 88 60 01 00    vaddpd 0x160(%rax),%ymm1,%ymm1
      401589:   00 
      40158a:   c5 fd 29 88 60 01 00    vmovapd %ymm1,0x160(%rax)
      401591:   00 
      401592:   c5 fd 59 8a 80 01 00    vmulpd 0x180(%rdx),%ymm0,%ymm1
      401599:   00 
      40159a:   c5 f5 58 88 80 01 00    vaddpd 0x180(%rax),%ymm1,%ymm1
      4015a1:   00 
      4015a2:   c5 fd 29 88 80 01 00    vmovapd %ymm1,0x180(%rax)
      4015a9:   00 
      4015aa:   c5 fd 59 8a a0 01 00    vmulpd 0x1a0(%rdx),%ymm0,%ymm1
      4015b1:   00 
      4015b2:   c5 f5 58 88 a0 01 00    vaddpd 0x1a0(%rax),%ymm1,%ymm1
      4015b9:   00 
      4015ba:   c5 fd 29 88 a0 01 00    vmovapd %ymm1,0x1a0(%rax)
      4015c1:   00 
      4015c2:   c5 fd 59 8a c0 01 00    vmulpd 0x1c0(%rdx),%ymm0,%ymm1
      4015c9:   00 
      4015ca:   c5 f5 58 88 c0 01 00    vaddpd 0x1c0(%rax),%ymm1,%ymm1
      4015d1:   00 
      4015d2:   c5 fd 29 88 c0 01 00    vmovapd %ymm1,0x1c0(%rax)
      4015d9:   00 
      4015da:   c5 fd 59 82 e0 01 00    vmulpd 0x1e0(%rdx),%ymm0,%ymm0
      4015e1:   00 
      4015e2:   c5 fd 58 80 e0 01 00    vaddpd 0x1e0(%rax),%ymm0,%ymm0
      4015e9:   00 
      4015ea:   48 81 c2 00 02 00 00    add    $0x200,%rdx
      4015f1:   48 39 ce                cmp    %rcx,%rsi
      4015f4:   c5 fd 29 80 e0 01 00    vmovapd %ymm0,0x1e0(%rax)
      4015fb:   00 
      4015fc:   0f 85 8e fe ff ff       jne    401490 <_Z10calc_blockv+0x30>
      401602:   48 83 c7 01             add    $0x1,%rdi
      401606:   48 05 00 02 00 00       add    $0x200,%rax
      40160c:   48 83 ff 40             cmp    $0x40,%rdi
      401610:   0f 85 5a fe ff ff       jne    401470 <_Z10calc_blockv+0x10>
      401616:   c5 f8 77                vzeroupper 
      401619:   c3                      retq   
      40161a:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
    
    0000000000 401460:
    401460:b8 e0 21 60 00 mov$0x6021e0,%eax
    401465:41 b8 e0 23 61 00 mov$0x6123e0,%r8d
    40146b:31 ff xor %edi,%edi
    40146d:49 29 c0 sub %rax,%r8
    401470:49 8d 34 00 lea(%r8,%rax,1),%rsi
    401474:48 89 f9 mov%rdi,%rcx
    401477:ba e0 a1 60 00 mov$0x60a1e0,%edx
    40147c:48 c1 e1 09 shl$0x9,%rcx
    401480:48 81 c1 e0 21 61 00 add $0x6121e0,%rcx
    401487:66 0f 1f 84 00 nopw 0x0(%rax,%rax,1)
    40148e:00
    401490:c4 e2 7d 19 01 vbroadcastsd(%rcx),%ymm0
    401495:48 83 c1 08 add $0x8,%rcx
    401499:c5 fd 59 0a vmulpd(%rdx),%ymm0,%ymm1
    40149d:c5 f5 58 08 vaddpd(%rax),%ymm1,%ymm1
    4014a1:c5 fd 29 08 vmovapd%ymm1,(%rax)
    4014a5:c5 fd 59 4a 20 vmulpd 0x20(%rdx),%ymm0,%ymm1
    4014aa:c5 f5 58 48 20 vaddpd 0x20(%rax),%ymm1,%ymm1
    4014af:c5 fd 29 48 20 vmovapd%ymm1,0x20(%rax)
    4014b4:c5 fd 59 4a 40 vmulpd 0x40(%rdx),%ymm0,%ymm1
    4014b9:c5 f5 58 48 40 vaddpd 0x40(%rax),%ymm1,%ymm1
    4014be:c5 fd 29 48 40 vmovapd%ymm1,0x40(%rax)
    4014c3:c5 fd 59 4a 60 vmulpd 0x60(%rdx),%ymm0,%ymm1
    4014c8:c5 f5 58 48 60 vaddpd 0x60(%rax),%ymm1,%ymm1
    4014cd:c5 fd 29 48 60 vmovapd%ymm1,0x60(%rax)
    4014d2:c5 fd 59 8a 80 00 vmulpd 0x80(%rdx),%ymm0,%ymm1
    4014d9:00
    4014da:c5 f5 58 88 80 00 vaddpd 0x80(%rax),%ymm1,%ymm1
    4014e1:00
    4014e2:c5 fd 29 88 80 00 vmovapd%ymm1,0x80(%rax)
    4014e9:00
    4014ea:c5 fd 59 8a a0 00 vmulpd 0xa0(%rdx),%ymm0,%ymm1
    4014f1:00
    4014f2:c5 f5 58 88 a0 00 vaddpd 0xa0(%rax),%ymm1,%ymm1
    4014f9:00
    4014fa:c5 fd 29 88 a0 00 00 vmovapd%ymm1,0xa0(%rax)
    401501:   00 
    401502:c5 fd 59 8a c0 00 00 vmulpd 0xc0(%rdx),%ymm0,%ymm1
    401509:   00 
    40150a:c5 f5 58 88 c0 00 vaddpd 0xc0(%rax),%ymm1,%ymm1
    401511:   00 
    401512:c5 fd 29 88 c0 00 00 vmovapd%ymm1,0xc0(%rax)
    401519:   00 
    40151a:c5 fd 59 8a e0 00 vmulpd 0xe0(%rdx),%ymm0,%ymm1
    401521:   00 
    401522:c5 f5 58 88 e0 00 vaddpd 0xe0(%rax),%ymm1,%ymm1
    401529:   00 
    40152a:c5 fd 29 88 e0 00 00 vmovapd%ymm1,0xe0(%rax)
    401531:   00 
    401532:c5 fd 59 8a 00 01 00 vmulpd 0x100(%rdx),%ymm0,%ymm1
    401539:   00 
    40153a:c5 f5 58 88 00 01 00 vaddpd 0x100(%rax),%ymm1,%ymm1
    401541:   00 
    401542:c5 fd 29 88 00 01 00 vmovapd%ymm1,0x100(%rax)
    401549:   00 
    40154a:c5 fd 59 8a 20 01 00 vmulpd 0x120(%rdx),%ymm0,%ymm1
    401551:   00 
    401552:c5 f5 58 88 20 01 00 vaddpd 0x120(%rax),%ymm1,%ymm1
    401559:   00 
    40155a:c5 fd 29 88 20 01 00 vmovapd%ymm1,0x120(%rax)
    401561:   00 
    401562:c5 fd 59 8a 40 01 00 vmulpd 0x140(%rdx),%ymm0,%ymm1
    401569:   00 
    40156a:c5 f5 58 88 40 01 00 vaddpd 0x140(%rax),%ymm1,%ymm1
    401571:   00 
    401572:c5 fd 29 88 40 01 00 vmovapd%ymm1,0x140(%rax)
    401579:   00 
    40157a:c5 fd 59 8a 60 01 00 vmulpd 0x160(%rdx),%ymm0,%ymm1
    401581:   00 
    401582:c5 f5 58 88 60 01 00 vaddpd 0x160(%rax),%ymm1,%ymm1
    401589:   00 
    40158a:c5 fd 29 88 60 01 00 vmovapd%ymm1,0x160(%rax)
    401591:   00 
    401592:c5 fd 59 8a 80 01 00 vmulpd 0x180(%rdx),%ymm0,%ymm1
    401599:   00 
    40159a:c5 f5 58 88 80 01 00 vaddpd 0x180(%rax),%ymm1,%ymm1
    4015a1:00
    4015a2:c5 fd 29 88 80 01 00 vmovapd%ymm1,0x180(%rax)
    4015a9:00
    4015aa:c5 fd 59 8a a0 01 00 vmulpd 0x1a0(%rdx),%ymm0,%ymm1
    4015b1:00
    4015b2:c5 f5 58 88 a0 01 00 vaddpd 0x1a0(%rax),%ymm1,%ymm1
    4015b9:00
    4015ba:c5 fd 29 88 a0 01 00 vmovapd%ymm1,0x1a0(%rax)
    4015c1:00
    4015c2:c5 fd 59 8a c0 0
    
    $ ls -al /usr/lib/libblas.so.3
    /usr/lib/libblas.so.3 -> /etc/alternatives/libblas.so.3
    
    $ ls -al /etc/alternatives/libblas.so.3
    /etc/alternatives/libblas.so.3 -> /usr/lib/atlas-base/atlas/libblas.so.3
    
    $ ls -al /usr/lib/atlas-base/atlas/libblas.so.3
    /usr/lib/atlas-base/atlas/libblas.so.3 -> libblas.so.3.0
    
    $ ls -al /usr/lib/atlas-base/atlas/libblas.so.3.0
    /usr/lib/atlas-base/atlas/libblas.so.3.0
    
    $ dpkg -S /usr/lib/atlas-base/atlas/libblas.so.3.0
    libatlas3-base: /usr/lib/atlas-base/atlas/libblas.so.3.0
    
    $ apt-get source libatlas3-base
    
    for (size_t tcol = 0; tcol < block_width; tcol++)
        bufz[trow][tcol] += B * bufy[tk][tcol];
    
    #include <iostream>
    #include <algorithm>
    
    using namespace std;
    
    // Side length of the square matrices X, Y and Z.
    constexpr size_t dim = 4096;
    // Side length of one square tile processed by calc_block().
    constexpr size_t block_width = 64;
    // Number of tiles along each matrix dimension (4096 / 64 = 64, no remainder).
    constexpr size_t num_blocks = dim / block_width;
    
    // Operands X, Y and product Z, stored as namespace-scope arrays.
    // Each is 4096 * 4096 doubles, so the three together occupy 384 MiB.
    double X[dim][dim], Y[dim][dim], Z[dim][dim];
    
    // Tile-sized scratch buffers: main() copies one tile of X into bufx and one
    // tile of Y into bufy, calc_block() accumulates their product into bufz,
    // and main() copies bufz back into Z.
    double bufx[block_width][block_width];
    double bufy[block_width][block_width];
    double bufz[block_width][block_width];
    
    // Accumulates the product of the two staged tiles into the output tile:
    // bufz += bufx * bufy. The innermost loop runs along a row of bufy and a
    // row of bufz, so both are accessed contiguously.
    void calc_block()
    {
        for (size_t r = 0; r != block_width; ++r)
        {
            double* const out_row = bufz[r];

            for (size_t k = 0; k != block_width; ++k)
            {
                // One element of bufx scales an entire row of bufy.
                const double scale = bufx[r][k];
                const double* const in_row = bufy[k];

                for (size_t c = 0; c != block_width; ++c)
                    out_row[c] += scale * in_row[c];
            }
        }
    }
    
    // Fills X and Y with N(0, 1) samples, computes Z = X * Y one
    // block_width x block_width tile at a time via calc_block(), then prints
    // the sum of all elements of Z.
    int main()
    {
        std::random_device seed_source;
        std::mt19937 engine(seed_source());
        std::normal_distribution<double> gauss(0, 1);

        // Draws are interleaved: X[r][c] first, then Y[r][c].
        for (size_t r = 0; r != dim; ++r)
            for (size_t c = 0; c != dim; ++c)
            {
                X[r][c] = gauss(engine);
                Y[r][c] = gauss(engine);
                Z[r][c] = 0;
            }

        for (size_t bi = 0; bi != num_blocks; ++bi)
        {
            const size_t row0 = bi * block_width;

            for (size_t bj = 0; bj != num_blocks; ++bj)
            {
                const size_t col0 = bj * block_width;

                // Each output tile starts from zero.
                for (auto& tile_row : bufz)
                    for (double& cell : tile_row)
                        cell = 0;

                for (size_t bk = 0; bk != num_blocks; ++bk)
                {
                    const size_t k0 = bk * block_width;

                    // Stage the matching tiles of X and Y in the scratch buffers.
                    for (size_t r = 0; r != block_width; ++r)
                        for (size_t c = 0; c != block_width; ++c)
                        {
                            bufx[r][c] = X[row0 + r][k0 + c];
                            bufy[r][c] = Y[k0 + r][col0 + c];
                        }

                    calc_block();
                }

                // Write the finished tile back into Z.
                for (size_t r = 0; r != block_width; ++r)
                    for (size_t c = 0; c != block_width; ++c)
                        Z[row0 + r][col0 + c] = bufz[r][c];
            }
        }

        // Sum Z in row-major order.
        double total = 0;

        for (const auto& z_row : Z)
            for (const double value : z_row)
                total += value;

        std::cout << total << std::endl;
    }
    
    real    0m17.370s
    user    0m17.213s
    sys 0m0.092s
    
    0000000000401460 <_Z10calc_blockv>:
      401460:   b8 e0 21 60 00          mov    $0x6021e0,%eax
      401465:   41 b8 e0 23 61 00       mov    $0x6123e0,%r8d
      40146b:   31 ff                   xor    %edi,%edi
      40146d:   49 29 c0                sub    %rax,%r8
      401470:   49 8d 34 00             lea    (%r8,%rax,1),%rsi
      401474:   48 89 f9                mov    %rdi,%rcx
      401477:   ba e0 a1 60 00          mov    $0x60a1e0,%edx
      40147c:   48 c1 e1 09             shl    $0x9,%rcx
      401480:   48 81 c1 e0 21 61 00    add    $0x6121e0,%rcx
      401487:   66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
      40148e:   00 00 
      401490:   c4 e2 7d 19 01          vbroadcastsd (%rcx),%ymm0
      401495:   48 83 c1 08             add    $0x8,%rcx
      401499:   c5 fd 59 0a             vmulpd (%rdx),%ymm0,%ymm1
      40149d:   c5 f5 58 08             vaddpd (%rax),%ymm1,%ymm1
      4014a1:   c5 fd 29 08             vmovapd %ymm1,(%rax)
      4014a5:   c5 fd 59 4a 20          vmulpd 0x20(%rdx),%ymm0,%ymm1
      4014aa:   c5 f5 58 48 20          vaddpd 0x20(%rax),%ymm1,%ymm1
      4014af:   c5 fd 29 48 20          vmovapd %ymm1,0x20(%rax)
      4014b4:   c5 fd 59 4a 40          vmulpd 0x40(%rdx),%ymm0,%ymm1
      4014b9:   c5 f5 58 48 40          vaddpd 0x40(%rax),%ymm1,%ymm1
      4014be:   c5 fd 29 48 40          vmovapd %ymm1,0x40(%rax)
      4014c3:   c5 fd 59 4a 60          vmulpd 0x60(%rdx),%ymm0,%ymm1
      4014c8:   c5 f5 58 48 60          vaddpd 0x60(%rax),%ymm1,%ymm1
      4014cd:   c5 fd 29 48 60          vmovapd %ymm1,0x60(%rax)
      4014d2:   c5 fd 59 8a 80 00 00    vmulpd 0x80(%rdx),%ymm0,%ymm1
      4014d9:   00 
      4014da:   c5 f5 58 88 80 00 00    vaddpd 0x80(%rax),%ymm1,%ymm1
      4014e1:   00 
      4014e2:   c5 fd 29 88 80 00 00    vmovapd %ymm1,0x80(%rax)
      4014e9:   00 
      4014ea:   c5 fd 59 8a a0 00 00    vmulpd 0xa0(%rdx),%ymm0,%ymm1
      4014f1:   00 
      4014f2:   c5 f5 58 88 a0 00 00    vaddpd 0xa0(%rax),%ymm1,%ymm1
      4014f9:   00 
      4014fa:   c5 fd 29 88 a0 00 00    vmovapd %ymm1,0xa0(%rax)
      401501:   00 
      401502:   c5 fd 59 8a c0 00 00    vmulpd 0xc0(%rdx),%ymm0,%ymm1
      401509:   00 
      40150a:   c5 f5 58 88 c0 00 00    vaddpd 0xc0(%rax),%ymm1,%ymm1
      401511:   00 
      401512:   c5 fd 29 88 c0 00 00    vmovapd %ymm1,0xc0(%rax)
      401519:   00 
      40151a:   c5 fd 59 8a e0 00 00    vmulpd 0xe0(%rdx),%ymm0,%ymm1
      401521:   00 
      401522:   c5 f5 58 88 e0 00 00    vaddpd 0xe0(%rax),%ymm1,%ymm1
      401529:   00 
      40152a:   c5 fd 29 88 e0 00 00    vmovapd %ymm1,0xe0(%rax)
      401531:   00 
      401532:   c5 fd 59 8a 00 01 00    vmulpd 0x100(%rdx),%ymm0,%ymm1
      401539:   00 
      40153a:   c5 f5 58 88 00 01 00    vaddpd 0x100(%rax),%ymm1,%ymm1
      401541:   00 
      401542:   c5 fd 29 88 00 01 00    vmovapd %ymm1,0x100(%rax)
      401549:   00 
      40154a:   c5 fd 59 8a 20 01 00    vmulpd 0x120(%rdx),%ymm0,%ymm1
      401551:   00 
      401552:   c5 f5 58 88 20 01 00    vaddpd 0x120(%rax),%ymm1,%ymm1
      401559:   00 
      40155a:   c5 fd 29 88 20 01 00    vmovapd %ymm1,0x120(%rax)
      401561:   00 
      401562:   c5 fd 59 8a 40 01 00    vmulpd 0x140(%rdx),%ymm0,%ymm1
      401569:   00 
      40156a:   c5 f5 58 88 40 01 00    vaddpd 0x140(%rax),%ymm1,%ymm1
      401571:   00 
      401572:   c5 fd 29 88 40 01 00    vmovapd %ymm1,0x140(%rax)
      401579:   00 
      40157a:   c5 fd 59 8a 60 01 00    vmulpd 0x160(%rdx),%ymm0,%ymm1
      401581:   00 
      401582:   c5 f5 58 88 60 01 00    vaddpd 0x160(%rax),%ymm1,%ymm1
      401589:   00 
      40158a:   c5 fd 29 88 60 01 00    vmovapd %ymm1,0x160(%rax)
      401591:   00 
      401592:   c5 fd 59 8a 80 01 00    vmulpd 0x180(%rdx),%ymm0,%ymm1
      401599:   00 
      40159a:   c5 f5 58 88 80 01 00    vaddpd 0x180(%rax),%ymm1,%ymm1
      4015a1:   00 
      4015a2:   c5 fd 29 88 80 01 00    vmovapd %ymm1,0x180(%rax)
      4015a9:   00 
      4015aa:   c5 fd 59 8a a0 01 00    vmulpd 0x1a0(%rdx),%ymm0,%ymm1
      4015b1:   00 
      4015b2:   c5 f5 58 88 a0 01 00    vaddpd 0x1a0(%rax),%ymm1,%ymm1
      4015b9:   00 
      4015ba:   c5 fd 29 88 a0 01 00    vmovapd %ymm1,0x1a0(%rax)
      4015c1:   00 
      4015c2:   c5 fd 59 8a c0 01 00    vmulpd 0x1c0(%rdx),%ymm0,%ymm1
      4015c9:   00 
      4015ca:   c5 f5 58 88 c0 01 00    vaddpd 0x1c0(%rax),%ymm1,%ymm1
      4015d1:   00 
      4015d2:   c5 fd 29 88 c0 01 00    vmovapd %ymm1,0x1c0(%rax)
      4015d9:   00 
      4015da:   c5 fd 59 82 e0 01 00    vmulpd 0x1e0(%rdx),%ymm0,%ymm0
      4015e1:   00 
      4015e2:   c5 fd 58 80 e0 01 00    vaddpd 0x1e0(%rax),%ymm0,%ymm0
      4015e9:   00 
      4015ea:   48 81 c2 00 02 00 00    add    $0x200,%rdx
      4015f1:   48 39 ce                cmp    %rcx,%rsi
      4015f4:   c5 fd 29 80 e0 01 00    vmovapd %ymm0,0x1e0(%rax)
      4015fb:   00 
      4015fc:   0f 85 8e fe ff ff       jne    401490 <_Z10calc_blockv+0x30>
      401602:   48 83 c7 01             add    $0x1,%rdi
      401606:   48 05 00 02 00 00       add    $0x200,%rax
      40160c:   48 83 ff 40             cmp    $0x40,%rdi
      401610:   0f 85 5a fe ff ff       jne    401470 <_Z10calc_blockv+0x10>
      401616:   c5 f8 77                vzeroupper 
      401619:   c3                      retq   
      40161a:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
    
    bufz[trow][tcol] += B * bufy[tk][tcol];