C++:为什么朴素(naïve)的 C++ 矩阵乘法比 BLAS 慢 100 倍?
我正在研究大型矩阵乘法,并运行以下实验以形成基线测试:C++ 为什么朴素(naïve)的 C++ 矩阵乘法比 BLAS 慢 100 倍?,c++,linux,matlab,c++11,matrix-multiplication,C++,Linux,Matlab,C++11,Matrix Multiplication,我正在研究大型矩阵乘法,并运行以下实验以形成基线测试: 从标准正态分布(均值为 0,标准差为 1)中随机生成两个 4096x4096 矩阵 X、Y;计算 Z=X*Y;对 Z 的元素求和(以确保它们被访问)并输出。 以下是朴素(naïve)的 C++ 实现: #include <iostream> #include <algorithm> using namespace std; int main() { constexpr size_t dim = 4096; float* x = new fl
以下是朴素(naïve)的 C++ 实现:
#include <algorithm>
#include <iostream>
#include <random>
#include <vector>
using namespace std;
/**
 * Baseline benchmark: fills two dim x dim single-precision matrices with
 * standard-normal samples, multiplies them with the textbook triple loop,
 * and prints the sum of the product's elements so the result is actually used.
 *
 * All matrices are stored as flat row-major arrays: element (r, c) lives at
 * index r * dim + c.
 */
int main()
{
    constexpr size_t dim = 4096;
    constexpr size_t count = dim * dim;

    // std::vector owns the storage and frees it automatically. The previous
    // version allocated with new[] but released with scalar delete, which is
    // undefined behaviour.
    vector<float> x(count);
    vector<float> y(count);
    vector<float> z(count);

    // Mersenne Twister seeded from random_device, feeding a normal
    // distribution with mean 0 and standard deviation 1.
    random_device rd;
    mt19937 gen(rd());
    normal_distribution<float> dist(0, 1);

    for (size_t i = 0; i < count; i++)
    {
        x[i] = dist(gen);
        y[i] = dist(gen);
    }

    // z = x * y. The inner loop walks a row of x contiguously but strides
    // through y by dim elements per step (a column of a row-major matrix).
    for (size_t row = 0; row < dim; row++)
        for (size_t col = 0; col < dim; col++)
        {
            float acc = 0;
            for (size_t k = 0; k < dim; k++)
                acc += x[row*dim + k] * y[k*dim + col];
            z[row*dim + col] = acc;
        }

    // Reduce the product to a single value and print it.
    float t = 0;
    for (size_t i = 0; i < count; i++)
        t += z[i];

    cout << t << endl;
}
以下是 Octave/MATLAB 实现:
% Baseline experiment: multiply two random 4096x4096 matrices and print the
% sum of the product's elements.
% Draw the two operands from the standard normal distribution.
X = stdnormal_rnd(4096, 4096);
Y = stdnormal_rnd(4096, 4096);
% Matrix product (not element-wise).
Z = X*Y;
% Sum each column, then sum those sums; there is no trailing semicolon, so
% the result is displayed.
sum(sum(Z))
运行:
Octave 版本耗时 6 秒:
real 0m5.985s
user 0m10.881s
sys 0m0.144s
我知道 BLAS 已经被优化到了极致,而朴素算法完全忽略了缓存等因素,但说真的——相差 90 倍?
有人能解释这种差异吗?BLAS实现的架构到底是什么?我看到它在使用Fortran,但是CPU级别上发生了什么?它使用什么算法?它是如何使用CPU缓存的?它调用什么x86-64机器指令?(是否使用了AVX等高级CPU功能?)
对 C++ 算法做哪些关键优化,可以使其与 BLAS 版本相媲美?
我在 gdb 下运行了 Octave,并在计算到一半时多次中断它。它启动了第二个线程,下面是堆栈(所有停止点看起来都很相似): 它正在按预期调用 BLAS 的 gemm
第一个线程似乎要加入第二个线程,因此我不确定这两个线程是否占观察到的200%的CPU使用率
ATL_dgemm 所在的 libblas.so.3 是哪个库,它的代码在哪里?
$ ls -al /usr/lib/libblas.so.3
/usr/lib/libblas.so.3 -> /etc/alternatives/libblas.so.3
$ ls -al /etc/alternatives/libblas.so.3
/etc/alternatives/libblas.so.3 -> /usr/lib/atlas-base/atlas/libblas.so.3
$ ls -al /usr/lib/atlas-base/atlas/libblas.so.3
/usr/lib/atlas-base/atlas/libblas.so.3 -> libblas.so.3.0
$ ls -al /usr/lib/atlas-base/atlas/libblas.so.3.0
/usr/lib/atlas-base/atlas/libblas.so.3.0
$ dpkg -S /usr/lib/atlas-base/atlas/libblas.so.3.0
libatlas3-base: /usr/lib/atlas-base/atlas/libblas.so.3.0
$ apt-get source libatlas3-base
这是 ATLAS 3.8.4
以下是我后来实施的优化:
使用分块(tiling)方法,将 X、Y 和 Z 的 64x64 块预加载到单独的数组中
更改每个块的计算,使内部循环如下所示:
for (size_t tcol = 0; tcol < block_width; tcol++)
bufz[trow][tcol] += B * bufy[tk][tcol];
所有操作都在calc_block函数中—90%以上的时间都花在了该函数中
新的时间是:
real 0m17.370s
user 0m17.213s
sys 0m0.092s
更接近
calc_block 函数的反汇编如下所示:
0000000000401460 <_Z10calc_blockv>:
401460: b8 e0 21 60 00 mov $0x6021e0,%eax
401465: 41 b8 e0 23 61 00 mov $0x6123e0,%r8d
40146b: 31 ff xor %edi,%edi
40146d: 49 29 c0 sub %rax,%r8
401470: 49 8d 34 00 lea (%r8,%rax,1),%rsi
401474: 48 89 f9 mov %rdi,%rcx
401477: ba e0 a1 60 00 mov $0x60a1e0,%edx
40147c: 48 c1 e1 09 shl $0x9,%rcx
401480: 48 81 c1 e0 21 61 00 add $0x6121e0,%rcx
401487: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
40148e: 00 00
401490: c4 e2 7d 19 01 vbroadcastsd (%rcx),%ymm0
401495: 48 83 c1 08 add $0x8,%rcx
401499: c5 fd 59 0a vmulpd (%rdx),%ymm0,%ymm1
40149d: c5 f5 58 08 vaddpd (%rax),%ymm1,%ymm1
4014a1: c5 fd 29 08 vmovapd %ymm1,(%rax)
4014a5: c5 fd 59 4a 20 vmulpd 0x20(%rdx),%ymm0,%ymm1
4014aa: c5 f5 58 48 20 vaddpd 0x20(%rax),%ymm1,%ymm1
4014af: c5 fd 29 48 20 vmovapd %ymm1,0x20(%rax)
4014b4: c5 fd 59 4a 40 vmulpd 0x40(%rdx),%ymm0,%ymm1
4014b9: c5 f5 58 48 40 vaddpd 0x40(%rax),%ymm1,%ymm1
4014be: c5 fd 29 48 40 vmovapd %ymm1,0x40(%rax)
4014c3: c5 fd 59 4a 60 vmulpd 0x60(%rdx),%ymm0,%ymm1
4014c8: c5 f5 58 48 60 vaddpd 0x60(%rax),%ymm1,%ymm1
4014cd: c5 fd 29 48 60 vmovapd %ymm1,0x60(%rax)
4014d2: c5 fd 59 8a 80 00 00 vmulpd 0x80(%rdx),%ymm0,%ymm1
4014d9: 00
4014da: c5 f5 58 88 80 00 00 vaddpd 0x80(%rax),%ymm1,%ymm1
4014e1: 00
4014e2: c5 fd 29 88 80 00 00 vmovapd %ymm1,0x80(%rax)
4014e9: 00
4014ea: c5 fd 59 8a a0 00 00 vmulpd 0xa0(%rdx),%ymm0,%ymm1
4014f1: 00
4014f2: c5 f5 58 88 a0 00 00 vaddpd 0xa0(%rax),%ymm1,%ymm1
4014f9: 00
4014fa: c5 fd 29 88 a0 00 00 vmovapd %ymm1,0xa0(%rax)
401501: 00
401502: c5 fd 59 8a c0 00 00 vmulpd 0xc0(%rdx),%ymm0,%ymm1
401509: 00
40150a: c5 f5 58 88 c0 00 00 vaddpd 0xc0(%rax),%ymm1,%ymm1
401511: 00
401512: c5 fd 29 88 c0 00 00 vmovapd %ymm1,0xc0(%rax)
401519: 00
40151a: c5 fd 59 8a e0 00 00 vmulpd 0xe0(%rdx),%ymm0,%ymm1
401521: 00
401522: c5 f5 58 88 e0 00 00 vaddpd 0xe0(%rax),%ymm1,%ymm1
401529: 00
40152a: c5 fd 29 88 e0 00 00 vmovapd %ymm1,0xe0(%rax)
401531: 00
401532: c5 fd 59 8a 00 01 00 vmulpd 0x100(%rdx),%ymm0,%ymm1
401539: 00
40153a: c5 f5 58 88 00 01 00 vaddpd 0x100(%rax),%ymm1,%ymm1
401541: 00
401542: c5 fd 29 88 00 01 00 vmovapd %ymm1,0x100(%rax)
401549: 00
40154a: c5 fd 59 8a 20 01 00 vmulpd 0x120(%rdx),%ymm0,%ymm1
401551: 00
401552: c5 f5 58 88 20 01 00 vaddpd 0x120(%rax),%ymm1,%ymm1
401559: 00
40155a: c5 fd 29 88 20 01 00 vmovapd %ymm1,0x120(%rax)
401561: 00
401562: c5 fd 59 8a 40 01 00 vmulpd 0x140(%rdx),%ymm0,%ymm1
401569: 00
40156a: c5 f5 58 88 40 01 00 vaddpd 0x140(%rax),%ymm1,%ymm1
401571: 00
401572: c5 fd 29 88 40 01 00 vmovapd %ymm1,0x140(%rax)
401579: 00
40157a: c5 fd 59 8a 60 01 00 vmulpd 0x160(%rdx),%ymm0,%ymm1
401581: 00
401582: c5 f5 58 88 60 01 00 vaddpd 0x160(%rax),%ymm1,%ymm1
401589: 00
40158a: c5 fd 29 88 60 01 00 vmovapd %ymm1,0x160(%rax)
401591: 00
401592: c5 fd 59 8a 80 01 00 vmulpd 0x180(%rdx),%ymm0,%ymm1
401599: 00
40159a: c5 f5 58 88 80 01 00 vaddpd 0x180(%rax),%ymm1,%ymm1
4015a1: 00
4015a2: c5 fd 29 88 80 01 00 vmovapd %ymm1,0x180(%rax)
4015a9: 00
4015aa: c5 fd 59 8a a0 01 00 vmulpd 0x1a0(%rdx),%ymm0,%ymm1
4015b1: 00
4015b2: c5 f5 58 88 a0 01 00 vaddpd 0x1a0(%rax),%ymm1,%ymm1
4015b9: 00
4015ba: c5 fd 29 88 a0 01 00 vmovapd %ymm1,0x1a0(%rax)
4015c1: 00
4015c2: c5 fd 59 8a c0 01 00 vmulpd 0x1c0(%rdx),%ymm0,%ymm1
4015c9: 00
4015ca: c5 f5 58 88 c0 01 00 vaddpd 0x1c0(%rax),%ymm1,%ymm1
4015d1: 00
4015d2: c5 fd 29 88 c0 01 00 vmovapd %ymm1,0x1c0(%rax)
4015d9: 00
4015da: c5 fd 59 82 e0 01 00 vmulpd 0x1e0(%rdx),%ymm0,%ymm0
4015e1: 00
4015e2: c5 fd 58 80 e0 01 00 vaddpd 0x1e0(%rax),%ymm0,%ymm0
4015e9: 00
4015ea: 48 81 c2 00 02 00 00 add $0x200,%rdx
4015f1: 48 39 ce cmp %rcx,%rsi
4015f4: c5 fd 29 80 e0 01 00 vmovapd %ymm0,0x1e0(%rax)
4015fb: 00
4015fc: 0f 85 8e fe ff ff jne 401490 <_Z10calc_blockv+0x30>
401602: 48 83 c7 01 add $0x1,%rdi
401606: 48 05 00 02 00 00 add $0x200,%rax
40160c: 48 83 ff 40 cmp $0x40,%rdi
401610: 0f 85 5a fe ff ff jne 401470 <_Z10calc_blockv+0x10>
401616: c5 f8 77 vzeroupper
401619: c3 retq
40161a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
0000000000401460 <_Z10calc_blockv>:
401460: b8 e0 21 60 00 mov $0x6021e0,%eax
401465: 41 b8 e0 23 61 00 mov $0x6123e0,%r8d
40146b: 31 ff xor %edi,%edi
40146d: 49 29 c0 sub %rax,%r8
401470: 49 8d 34 00 lea (%r8,%rax,1),%rsi
401474: 48 89 f9 mov %rdi,%rcx
401477: ba e0 a1 60 00 mov $0x60a1e0,%edx
40147c: 48 c1 e1 09 shl $0x9,%rcx
401480: 48 81 c1 e0 21 61 00 add $0x6121e0,%rcx
401487: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
40148e: 00 00
401490: c4 e2 7d 19 01 vbroadcastsd (%rcx),%ymm0
401495: 48 83 c1 08 add $0x8,%rcx
401499: c5 fd 59 0a vmulpd (%rdx),%ymm0,%ymm1
40149d: c5 f5 58 08 vaddpd (%rax),%ymm1,%ymm1
4014a1: c5 fd 29 08 vmovapd %ymm1,(%rax)
4014a5: c5 fd 59 4a 20 vmulpd 0x20(%rdx),%ymm0,%ymm1
4014aa: c5 f5 58 48 20 vaddpd 0x20(%rax),%ymm1,%ymm1
4014af: c5 fd 29 48 20 vmovapd %ymm1,0x20(%rax)
4014b4: c5 fd 59 4a 40 vmulpd 0x40(%rdx),%ymm0,%ymm1
4014b9: c5 f5 58 48 40 vaddpd 0x40(%rax),%ymm1,%ymm1
4014be: c5 fd 29 48 40 vmovapd %ymm1,0x40(%rax)
4014c3: c5 fd 59 4a 60 vmulpd 0x60(%rdx),%ymm0,%ymm1
4014c8: c5 f5 58 48 60 vaddpd 0x60(%rax),%ymm1,%ymm1
4014cd: c5 fd 29 48 60 vmovapd %ymm1,0x60(%rax)
4014d2: c5 fd 59 8a 80 00 00 vmulpd 0x80(%rdx),%ymm0,%ymm1
4014d9: 00
4014da: c5 f5 58 88 80 00 00 vaddpd 0x80(%rax),%ymm1,%ymm1
4014e1: 00
4014e2: c5 fd 29 88 80 00 00 vmovapd %ymm1,0x80(%rax)
4014e9: 00
4014ea: c5 fd 59 8a a0 00 00 vmulpd 0xa0(%rdx),%ymm0,%ymm1
4014f1: 00
4014f2: c5 f5 58 88 a0 00 00 vaddpd 0xa0(%rax),%ymm1,%ymm1
4014f9: 00
4014fa: c5 fd 29 88 a0 00 00 vmovapd %ymm1,0xa0(%rax)
401501: 00
401502: c5 fd 59 8a c0 00 00 vmulpd 0xc0(%rdx),%ymm0,%ymm1
401509: 00
40150a: c5 f5 58 88 c0 00 00 vaddpd 0xc0(%rax),%ymm1,%ymm1
401511: 00
401512: c5 fd 29 88 c0 00 00 vmovapd %ymm1,0xc0(%rax)
401519: 00
40151a: c5 fd 59 8a e0 00 00 vmulpd 0xe0(%rdx),%ymm0,%ymm1
401521: 00
401522: c5 f5 58 88 e0 00 00 vaddpd 0xe0(%rax),%ymm1,%ymm1
401529: 00
40152a: c5 fd 29 88 e0 00 00 vmovapd %ymm1,0xe0(%rax)
401531: 00
401532: c5 fd 59 8a 00 01 00 vmulpd 0x100(%rdx),%ymm0,%ymm1
401539: 00
40153a: c5 f5 58 88 00 01 00 vaddpd 0x100(%rax),%ymm1,%ymm1
401541: 00
401542: c5 fd 29 88 00 01 00 vmovapd %ymm1,0x100(%rax)
401549: 00
40154a: c5 fd 59 8a 20 01 00 vmulpd 0x120(%rdx),%ymm0,%ymm1
401551: 00
401552: c5 f5 58 88 20 01 00 vaddpd 0x120(%rax),%ymm1,%ymm1
401559: 00
40155a: c5 fd 29 88 20 01 00 vmovapd %ymm1,0x120(%rax)
401561: 00
401562: c5 fd 59 8a 40 01 00 vmulpd 0x140(%rdx),%ymm0,%ymm1
401569: 00
40156a: c5 f5 58 88 40 01 00 vaddpd 0x140(%rax),%ymm1,%ymm1
401571: 00
401572: c5 fd 29 88 40 01 00 vmovapd %ymm1,0x140(%rax)
401579: 00
40157a: c5 fd 59 8a 60 01 00 vmulpd 0x160(%rdx),%ymm0,%ymm1
401581: 00
401582: c5 f5 58 88 60 01 00 vaddpd 0x160(%rax),%ymm1,%ymm1
401589: 00
40158a: c5 fd 29 88 60 01 00 vmovapd %ymm1,0x160(%rax)
401591: 00
401592: c5 fd 59 8a 80 01 00 vmulpd 0x180(%rdx),%ymm0,%ymm1
401599: 00
40159a: c5 f5 58 88 80 01 00 vaddpd 0x180(%rax),%ymm1,%ymm1
4015a1: 00
4015a2: c5 fd 29 88 80 01 00 vmovapd %ymm1,0x180(%rax)
4015a9: 00
4015aa: c5 fd 59 8a a0 01 00 vmulpd 0x1a0(%rdx),%ymm0,%ymm1
4015b1: 00
4015b2: c5 f5 58 88 a0 01 00 vaddpd 0x1a0(%rax),%ymm1,%ymm1
4015b9: 00
4015ba: c5 fd 29 88 a0 01 00 vmovapd %ymm1,0x1a0(%rax)
4015c1: 00
4015c2: c5 fd 59 8a c0 01 00 vmulpd 0x1c0(%rdx),%ymm0,%ymm1
$ ls -al /usr/lib/libblas.so.3
/usr/lib/libblas.so.3 -> /etc/alternatives/libblas.so.3
$ ls -al /etc/alternatives/libblas.so.3
/etc/alternatives/libblas.so.3 -> /usr/lib/atlas-base/atlas/libblas.so.3
$ ls -al /usr/lib/atlas-base/atlas/libblas.so.3
/usr/lib/atlas-base/atlas/libblas.so.3 -> libblas.so.3.0
$ ls -al /usr/lib/atlas-base/atlas/libblas.so.3.0
/usr/lib/atlas-base/atlas/libblas.so.3.0
$ dpkg -S /usr/lib/atlas-base/atlas/libblas.so.3.0
libatlas3-base: /usr/lib/atlas-base/atlas/libblas.so.3.0
$ apt-get source libatlas3-base
for (size_t tcol = 0; tcol < block_width; tcol++)
bufz[trow][tcol] += B * bufy[tk][tcol];
#include <iostream>
#include <algorithm>
using namespace std;
// Edge length of the square matrices X, Y and Z.
constexpr size_t dim{4096};
// Edge length of one square tile; dim is an exact multiple of it.
constexpr size_t block_width{64};
// Number of tiles along each matrix edge.
constexpr size_t num_blocks{dim / block_width};

// Operands (X, Y) and result (Z), held in static storage.
double X[dim][dim];
double Y[dim][dim];
double Z[dim][dim];

// Scratch tiles: one tile each of X, Y and Z.
double bufx[block_width][block_width];
double bufy[block_width][block_width];
double bufz[block_width][block_width];
/**
 * Accumulates the product of the two input tiles into the output tile:
 * bufz += bufx * bufy, where all three are block_width x block_width.
 *
 * The innermost loop runs along a row of bufy and a row of bufz, so both
 * are accessed with unit stride.
 */
void calc_block()
{
    for (size_t i = 0; i < block_width; ++i)
    {
        double* const out_row = bufz[i];
        for (size_t k = 0; k < block_width; ++k)
        {
            // One element of bufx scales an entire row of bufy.
            const double scale = bufx[i][k];
            const double* const in_row = bufy[k];
            for (size_t j = 0; j < block_width; ++j)
                out_row[j] += scale * in_row[j];
        }
    }
}
/**
 * Tiled benchmark: fills X and Y with standard-normal samples, computes
 * Z = X * Y one block_width x block_width tile at a time via calc_block(),
 * and prints the sum of Z's elements.
 */
int main()
{
    // Mersenne Twister seeded from random_device, feeding a normal
    // distribution with mean 0 and standard deviation 1.
    random_device seed_source;
    mt19937 engine(seed_source());
    normal_distribution<double> gauss(0, 1);

    // Per element: one sample for X, then one for Y; Z starts at zero.
    for (size_t r = 0; r < dim; ++r)
    {
        for (size_t c = 0; c < dim; ++c)
        {
            X[r][c] = gauss(engine);
            Y[r][c] = gauss(engine);
            Z[r][c] = 0;
        }
    }

    for (size_t bi = 0; bi < num_blocks; ++bi)
    {
        const size_t row0 = bi * block_width;
        for (size_t bj = 0; bj < num_blocks; ++bj)
        {
            const size_t col0 = bj * block_width;

            // Start the output tile from zero.
            for (size_t i = 0; i < block_width; ++i)
                for (size_t j = 0; j < block_width; ++j)
                    bufz[i][j] = 0;

            // Sweep along the shared dimension, one tile pair per step.
            for (size_t bk = 0; bk < num_blocks; ++bk)
            {
                const size_t k0 = bk * block_width;

                // Copy the current tiles of X and Y into the scratch buffers.
                for (size_t i = 0; i < block_width; ++i)
                {
                    for (size_t j = 0; j < block_width; ++j)
                    {
                        bufx[i][j] = X[row0 + i][k0 + j];
                        bufy[i][j] = Y[k0 + i][col0 + j];
                    }
                }

                calc_block();
            }

            // Write the finished tile back into Z.
            for (size_t i = 0; i < block_width; ++i)
                for (size_t j = 0; j < block_width; ++j)
                    Z[row0 + i][col0 + j] = bufz[i][j];
        }
    }

    // Reduce Z to a single value in row-major order and print it.
    double total = 0;
    for (size_t r = 0; r < dim; ++r)
        for (size_t c = 0; c < dim; ++c)
            total += Z[r][c];

    cout << total << endl;
}
real 0m17.370s
user 0m17.213s
sys 0m0.092s
0000000000401460 <_Z10calc_blockv>:
401460: b8 e0 21 60 00 mov $0x6021e0,%eax
401465: 41 b8 e0 23 61 00 mov $0x6123e0,%r8d
40146b: 31 ff xor %edi,%edi
40146d: 49 29 c0 sub %rax,%r8
401470: 49 8d 34 00 lea (%r8,%rax,1),%rsi
401474: 48 89 f9 mov %rdi,%rcx
401477: ba e0 a1 60 00 mov $0x60a1e0,%edx
40147c: 48 c1 e1 09 shl $0x9,%rcx
401480: 48 81 c1 e0 21 61 00 add $0x6121e0,%rcx
401487: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
40148e: 00 00
401490: c4 e2 7d 19 01 vbroadcastsd (%rcx),%ymm0
401495: 48 83 c1 08 add $0x8,%rcx
401499: c5 fd 59 0a vmulpd (%rdx),%ymm0,%ymm1
40149d: c5 f5 58 08 vaddpd (%rax),%ymm1,%ymm1
4014a1: c5 fd 29 08 vmovapd %ymm1,(%rax)
4014a5: c5 fd 59 4a 20 vmulpd 0x20(%rdx),%ymm0,%ymm1
4014aa: c5 f5 58 48 20 vaddpd 0x20(%rax),%ymm1,%ymm1
4014af: c5 fd 29 48 20 vmovapd %ymm1,0x20(%rax)
4014b4: c5 fd 59 4a 40 vmulpd 0x40(%rdx),%ymm0,%ymm1
4014b9: c5 f5 58 48 40 vaddpd 0x40(%rax),%ymm1,%ymm1
4014be: c5 fd 29 48 40 vmovapd %ymm1,0x40(%rax)
4014c3: c5 fd 59 4a 60 vmulpd 0x60(%rdx),%ymm0,%ymm1
4014c8: c5 f5 58 48 60 vaddpd 0x60(%rax),%ymm1,%ymm1
4014cd: c5 fd 29 48 60 vmovapd %ymm1,0x60(%rax)
4014d2: c5 fd 59 8a 80 00 00 vmulpd 0x80(%rdx),%ymm0,%ymm1
4014d9: 00
4014da: c5 f5 58 88 80 00 00 vaddpd 0x80(%rax),%ymm1,%ymm1
4014e1: 00
4014e2: c5 fd 29 88 80 00 00 vmovapd %ymm1,0x80(%rax)
4014e9: 00
4014ea: c5 fd 59 8a a0 00 00 vmulpd 0xa0(%rdx),%ymm0,%ymm1
4014f1: 00
4014f2: c5 f5 58 88 a0 00 00 vaddpd 0xa0(%rax),%ymm1,%ymm1
4014f9: 00
4014fa: c5 fd 29 88 a0 00 00 vmovapd %ymm1,0xa0(%rax)
401501: 00
401502: c5 fd 59 8a c0 00 00 vmulpd 0xc0(%rdx),%ymm0,%ymm1
401509: 00
40150a: c5 f5 58 88 c0 00 00 vaddpd 0xc0(%rax),%ymm1,%ymm1
401511: 00
401512: c5 fd 29 88 c0 00 00 vmovapd %ymm1,0xc0(%rax)
401519: 00
40151a: c5 fd 59 8a e0 00 00 vmulpd 0xe0(%rdx),%ymm0,%ymm1
401521: 00
401522: c5 f5 58 88 e0 00 00 vaddpd 0xe0(%rax),%ymm1,%ymm1
401529: 00
40152a: c5 fd 29 88 e0 00 00 vmovapd %ymm1,0xe0(%rax)
401531: 00
401532: c5 fd 59 8a 00 01 00 vmulpd 0x100(%rdx),%ymm0,%ymm1
401539: 00
40153a: c5 f5 58 88 00 01 00 vaddpd 0x100(%rax),%ymm1,%ymm1
401541: 00
401542: c5 fd 29 88 00 01 00 vmovapd %ymm1,0x100(%rax)
401549: 00
40154a: c5 fd 59 8a 20 01 00 vmulpd 0x120(%rdx),%ymm0,%ymm1
401551: 00
401552: c5 f5 58 88 20 01 00 vaddpd 0x120(%rax),%ymm1,%ymm1
401559: 00
40155a: c5 fd 29 88 20 01 00 vmovapd %ymm1,0x120(%rax)
401561: 00
401562: c5 fd 59 8a 40 01 00 vmulpd 0x140(%rdx),%ymm0,%ymm1
401569: 00
40156a: c5 f5 58 88 40 01 00 vaddpd 0x140(%rax),%ymm1,%ymm1
401571: 00
401572: c5 fd 29 88 40 01 00 vmovapd %ymm1,0x140(%rax)
401579: 00
40157a: c5 fd 59 8a 60 01 00 vmulpd 0x160(%rdx),%ymm0,%ymm1
401581: 00
401582: c5 f5 58 88 60 01 00 vaddpd 0x160(%rax),%ymm1,%ymm1
401589: 00
40158a: c5 fd 29 88 60 01 00 vmovapd %ymm1,0x160(%rax)
401591: 00
401592: c5 fd 59 8a 80 01 00 vmulpd 0x180(%rdx),%ymm0,%ymm1
401599: 00
40159a: c5 f5 58 88 80 01 00 vaddpd 0x180(%rax),%ymm1,%ymm1
4015a1: 00
4015a2: c5 fd 29 88 80 01 00 vmovapd %ymm1,0x180(%rax)
4015a9: 00
4015aa: c5 fd 59 8a a0 01 00 vmulpd 0x1a0(%rdx),%ymm0,%ymm1
4015b1: 00
4015b2: c5 f5 58 88 a0 01 00 vaddpd 0x1a0(%rax),%ymm1,%ymm1
4015b9: 00
4015ba: c5 fd 29 88 a0 01 00 vmovapd %ymm1,0x1a0(%rax)
4015c1: 00
4015c2: c5 fd 59 8a c0 01 00 vmulpd 0x1c0(%rdx),%ymm0,%ymm1
4015c9: 00
4015ca: c5 f5 58 88 c0 01 00 vaddpd 0x1c0(%rax),%ymm1,%ymm1
4015d1: 00
4015d2: c5 fd 29 88 c0 01 00 vmovapd %ymm1,0x1c0(%rax)
4015d9: 00
4015da: c5 fd 59 82 e0 01 00 vmulpd 0x1e0(%rdx),%ymm0,%ymm0
4015e1: 00
4015e2: c5 fd 58 80 e0 01 00 vaddpd 0x1e0(%rax),%ymm0,%ymm0
4015e9: 00
4015ea: 48 81 c2 00 02 00 00 add $0x200,%rdx
4015f1: 48 39 ce cmp %rcx,%rsi
4015f4: c5 fd 29 80 e0 01 00 vmovapd %ymm0,0x1e0(%rax)
4015fb: 00
4015fc: 0f 85 8e fe ff ff jne 401490 <_Z10calc_blockv+0x30>
401602: 48 83 c7 01 add $0x1,%rdi
401606: 48 05 00 02 00 00 add $0x200,%rax
40160c: 48 83 ff 40 cmp $0x40,%rdi
401610: 0f 85 5a fe ff ff jne 401470 <_Z10calc_blockv+0x10>
401616: c5 f8 77 vzeroupper
401619: c3 retq
40161a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
bufz[trow][tcol] += B * bufy[tk][tcol];