Loops 时间性能SIMD汇编程序:更长的循环执行速度更快
我最近学习了汇编语言(x86_64)中的 SIMD,并得到了一些意想不到的结果。归结起来是这样的:我有两个程序,各自把一个循环运行很多次。第一个程序的循环执行 4 条 SIMD 指令;第二个程序的循环与之完全相同,只是多了一条指令。代码如下所示。
标签:loops, assembly, time, simd
第一个程序:
; Benchmark 1: a loop body of four SSE2 instructions (two register copies,
; one packed divide, one packed add) executed ITERATIONS times.
; Syntax: NASM (Intel operand order). Entry point: bare _start, exits via
; the Linux x86-64 `syscall` interface.

ITERATIONS      equ     (1 << 31)       ; loop trip count
SYS_EXIT        equ     60              ; syscall number placed in rax before exiting
EXIT_SUCCESS    equ     0               ; exit status passed in rdi

section .bss
doublestorage:  resq    1               ; 8 bytes: holds two packed 32-bit integers

section .text
global _start

_start:
        ; Store the integer pair {1, 1} and convert it to a pair of doubles
        ; in each of xmm1..xmm7, so every register starts as {1.0, 1.0}.
        mov     rax, 0x0000000100000001
        mov     [doublestorage], rax
        cvtpi2pd xmm1, [doublestorage]
        cvtpi2pd xmm2, [doublestorage]
        cvtpi2pd xmm3, [doublestorage]
        cvtpi2pd xmm4, [doublestorage]
        cvtpi2pd xmm5, [doublestorage]
        cvtpi2pd xmm6, [doublestorage]  ; xmm6/xmm7 are initialised but not
        cvtpi2pd xmm7, [doublestorage]  ; touched by this variant's loop

        mov     rax, ITERATIONS         ; rax = iterations remaining

.loop:
        movupd  xmm1, xmm3              ; fresh dividend (divpd overwrites xmm1)
        movupd  xmm2, xmm5              ; fresh divisor
        divpd   xmm1, xmm2              ; xmm1 = xmm3 / xmm5
        addpd   xmm4, xmm1              ; accumulate the quotient in xmm4
        dec     rax
        jnz     .loop                   ; loop until the counter reaches zero

        mov     rax, SYS_EXIT           ; exit(EXIT_SUCCESS)
        mov     rdi, EXIT_SUCCESS
        syscall
exec/instr4: file format elf64-x86-64
Disassembly of section .text:
00000000004000b0 <.text>:
4000b0: 48 b8 01 00 00 00 01 movabs $0x100000001,%rax
4000b7: 00 00 00
4000ba: 48 89 04 25 28 01 60 mov %rax,0x600128
4000c1: 00
4000c2: 66 0f 2a 0c 25 28 01 cvtpi2pd 0x600128,%xmm1
4000c9: 60 00
4000cb: 66 0f 2a 14 25 28 01 cvtpi2pd 0x600128,%xmm2
4000d2: 60 00
4000d4: 66 0f 2a 1c 25 28 01 cvtpi2pd 0x600128,%xmm3
4000db: 60 00
4000dd: 66 0f 2a 24 25 28 01 cvtpi2pd 0x600128,%xmm4
4000e4: 60 00
4000e6: 66 0f 2a 2c 25 28 01 cvtpi2pd 0x600128,%xmm5
4000ed: 60 00
4000ef: 66 0f 2a 34 25 28 01 cvtpi2pd 0x600128,%xmm6
4000f6: 60 00
4000f8: 66 0f 2a 3c 25 28 01 cvtpi2pd 0x600128,%xmm7
4000ff: 60 00
400101: b8 00 00 00 80 mov $0x80000000,%eax
400106: 66 0f 10 cb movupd %xmm3,%xmm1
40010a: 66 0f 10 d5 movupd %xmm5,%xmm2
40010e: 66 0f 5e ca divpd %xmm2,%xmm1
400112: 66 0f 58 e1 addpd %xmm1,%xmm4
400116: 48 ff c8 dec %rax
400119: 75 eb jne 0x400106
40011b: b8 3c 00 00 00 mov $0x3c,%eax
400120: bf 00 00 00 00 mov $0x0,%edi
400125: 0f 05 syscall
我得到的结果是:第二个程序(循环体更长的那个)运行得明显更快。这个结果对我来说违反直觉,所以我想知道造成这种行为的原因是什么。
为完整起见:我使用的是 Ubuntu 15.10 和 NASM 汇编器(-f elf64),CPU 为 Intel Core i7-5600。此外,我检查了反汇编结果,汇编器没有做任何优化:
第一个程序的Objdump:
; Benchmark 1 (repeated listing): a loop body of four SSE2 instructions
; (two register copies, one packed divide, one packed add) executed
; ITERATIONS times.
; Syntax: NASM (Intel operand order). Entry point: bare _start, exits via
; the Linux x86-64 `syscall` interface.

ITERATIONS      equ     (1 << 31)       ; loop trip count
SYS_EXIT        equ     60              ; syscall number placed in rax before exiting
EXIT_SUCCESS    equ     0               ; exit status passed in rdi

section .bss
doublestorage:  resq    1               ; 8 bytes: holds two packed 32-bit integers

section .text
global _start

_start:
        ; Store the integer pair {1, 1} and convert it to a pair of doubles
        ; in each of xmm1..xmm7, so every register starts as {1.0, 1.0}.
        mov     rax, 0x0000000100000001
        mov     [doublestorage], rax
        cvtpi2pd xmm1, [doublestorage]
        cvtpi2pd xmm2, [doublestorage]
        cvtpi2pd xmm3, [doublestorage]
        cvtpi2pd xmm4, [doublestorage]
        cvtpi2pd xmm5, [doublestorage]
        cvtpi2pd xmm6, [doublestorage]  ; xmm6/xmm7 are initialised but not
        cvtpi2pd xmm7, [doublestorage]  ; touched by this variant's loop

        mov     rax, ITERATIONS         ; rax = iterations remaining

.loop:
        movupd  xmm1, xmm3              ; fresh dividend (divpd overwrites xmm1)
        movupd  xmm2, xmm5              ; fresh divisor
        divpd   xmm1, xmm2              ; xmm1 = xmm3 / xmm5
        addpd   xmm4, xmm1              ; accumulate the quotient in xmm4
        dec     rax
        jnz     .loop                   ; loop until the counter reaches zero

        mov     rax, SYS_EXIT           ; exit(EXIT_SUCCESS)
        mov     rdi, EXIT_SUCCESS
        syscall
exec/instr4: file format elf64-x86-64
Disassembly of section .text:
00000000004000b0 <.text>:
4000b0: 48 b8 01 00 00 00 01 movabs $0x100000001,%rax
4000b7: 00 00 00
4000ba: 48 89 04 25 28 01 60 mov %rax,0x600128
4000c1: 00
4000c2: 66 0f 2a 0c 25 28 01 cvtpi2pd 0x600128,%xmm1
4000c9: 60 00
4000cb: 66 0f 2a 14 25 28 01 cvtpi2pd 0x600128,%xmm2
4000d2: 60 00
4000d4: 66 0f 2a 1c 25 28 01 cvtpi2pd 0x600128,%xmm3
4000db: 60 00
4000dd: 66 0f 2a 24 25 28 01 cvtpi2pd 0x600128,%xmm4
4000e4: 60 00
4000e6: 66 0f 2a 2c 25 28 01 cvtpi2pd 0x600128,%xmm5
4000ed: 60 00
4000ef: 66 0f 2a 34 25 28 01 cvtpi2pd 0x600128,%xmm6
4000f6: 60 00
4000f8: 66 0f 2a 3c 25 28 01 cvtpi2pd 0x600128,%xmm7
4000ff: 60 00
400101: b8 00 00 00 80 mov $0x80000000,%eax
400106: 66 0f 10 cb movupd %xmm3,%xmm1
40010a: 66 0f 10 d5 movupd %xmm5,%xmm2
40010e: 66 0f 5e ca divpd %xmm2,%xmm1
400112: 66 0f 58 e1 addpd %xmm1,%xmm4
400116: 48 ff c8 dec %rax
400119: 75 eb jne 0x400106
40011b: b8 3c 00 00 00 mov $0x3c,%eax
400120: bf 00 00 00 00 mov $0x0,%edi
400125: 0f 05 syscall
exec/instr4:文件格式 elf64-x86-64
.text 节的反汇编:
00000000004000b0 <.text>:
4000b0: 48 b8 01 00 00 00 01 movabs $0x100000001,%rax
4000b7: 00 00 00
4000ba: 48 89 04 25 28 01 60 mov %rax,0x600128
4000c1: 00
4000c2: 66 0f 2a 0c 25 28 01 cvtpi2pd 0x600128,%xmm1
4000c9: 60 00
4000cb: 66 0f 2a 14 25 28 01 cvtpi2pd 0x600128,%xmm2
4000d2: 60 00
4000d4: 66 0f 2a 1c 25 28 01 cvtpi2pd 0x600128,%xmm3
4000db: 60 00
4000dd: 66 0f 2a 24 25 28 01 cvtpi2pd 0x600128,%xmm4
4000e4: 60 00
4000e6: 66 0f 2a 2c 25 28 01 cvtpi2pd 0x600128,%xmm5
4000ed: 60 00
4000ef: 66 0f 2a 34 25 28 01 cvtpi2pd 0x600128,%xmm6
4000f6: 60 00
4000f8: 66 0f 2a 3c 25 28 01 cvtpi2pd 0x600128,%xmm7
4000ff: 60 00
400101: b8 00 00 00 80 mov $0x80000000,%eax
400106: 66 0f 10 cb movupd %xmm3,%xmm1
40010a: 66 0f 10 d5 movupd %xmm5,%xmm2
40010e: 66 0f 5e ca divpd %xmm2,%xmm1
400112: 66 0f 58 e1 addpd %xmm1,%xmm4
400116: 48 ff c8 dec %rax
400119: 75 eb jne 0x400106
40011b: b8 3c 00 00 00 mov $0x3c,%eax
400120: bf 00 00 00 00 mov $0x0,%edi
400125: 0f 05 syscall
第二个程序的Objdump:
; Benchmark 2: same loop as the four-instruction variant plus one extra
; register copy (movupd xmm6, xmm7), executed ITERATIONS times.
; Syntax: NASM (Intel operand order). Entry point: bare _start, exits via
; the Linux x86-64 `syscall` interface.

ITERATIONS      equ     (1 << 31)       ; loop trip count
SYS_EXIT        equ     60              ; syscall number placed in rax before exiting
EXIT_SUCCESS    equ     0               ; exit status passed in rdi

section .bss
doublestorage:  resq    1               ; 8 bytes: holds two packed 32-bit integers

section .text
global _start

_start:
        ; Store the integer pair {1, 1} and convert it to a pair of doubles
        ; in each of xmm1..xmm7, so every register starts as {1.0, 1.0}.
        mov     rax, 0x0000000100000001
        mov     [doublestorage], rax
        cvtpi2pd xmm1, [doublestorage]
        cvtpi2pd xmm2, [doublestorage]
        cvtpi2pd xmm3, [doublestorage]
        cvtpi2pd xmm4, [doublestorage]
        cvtpi2pd xmm5, [doublestorage]
        cvtpi2pd xmm6, [doublestorage]
        cvtpi2pd xmm7, [doublestorage]

        mov     rax, ITERATIONS         ; rax = iterations remaining

.loop:
        movupd  xmm1, xmm3              ; fresh dividend (divpd overwrites xmm1)
        movupd  xmm2, xmm5              ; fresh divisor
        divpd   xmm1, xmm2              ; xmm1 = xmm3 / xmm5
        addpd   xmm4, xmm1              ; accumulate the quotient in xmm4
        movupd  xmm6, xmm7              ; the extra instruction under test;
                                        ; its result is not read inside the loop
        dec     rax
        jnz     .loop                   ; loop until the counter reaches zero

        mov     rax, SYS_EXIT           ; exit(EXIT_SUCCESS)
        mov     rdi, EXIT_SUCCESS
        syscall
exec/instr5: file format elf64-x86-64
Disassembly of section .text:
00000000004000b0 <.text>:
4000b0: 48 b8 01 00 00 00 01 movabs $0x100000001,%rax
4000b7: 00 00 00
4000ba: 48 89 04 25 2c 01 60 mov %rax,0x60012c
4000c1: 00
4000c2: 66 0f 2a 0c 25 2c 01 cvtpi2pd 0x60012c,%xmm1
4000c9: 60 00
4000cb: 66 0f 2a 14 25 2c 01 cvtpi2pd 0x60012c,%xmm2
4000d2: 60 00
4000d4: 66 0f 2a 1c 25 2c 01 cvtpi2pd 0x60012c,%xmm3
4000db: 60 00
4000dd: 66 0f 2a 24 25 2c 01 cvtpi2pd 0x60012c,%xmm4
4000e4: 60 00
4000e6: 66 0f 2a 2c 25 2c 01 cvtpi2pd 0x60012c,%xmm5
4000ed: 60 00
4000ef: 66 0f 2a 34 25 2c 01 cvtpi2pd 0x60012c,%xmm6
4000f6: 60 00
4000f8: 66 0f 2a 3c 25 2c 01 cvtpi2pd 0x60012c,%xmm7
4000ff: 60 00
400101: b8 00 00 00 80 mov $0x80000000,%eax
400106: 66 0f 10 cb movupd %xmm3,%xmm1
40010a: 66 0f 10 d5 movupd %xmm5,%xmm2
40010e: 66 0f 5e ca divpd %xmm2,%xmm1
400112: 66 0f 58 e1 addpd %xmm1,%xmm4
400116: 66 0f 10 f7 movupd %xmm7,%xmm6
40011a: 48 ff c8 dec %rax
40011d: 75 e7 jne 0x400106
40011f: b8 3c 00 00 00 mov $0x3c,%eax
400124: bf 00 00 00 00 mov $0x0,%edi
400129: 0f 05 syscall
exec/instr5:文件格式 elf64-x86-64
.text 节的反汇编:
00000000004000b0 <.text>:
4000b0: 48 b8 01 00 00 00 01 movabs $0x100000001,%rax
4000b7: 00 00 00
4000ba: 48 89 04 25 2c 01 60 mov %rax,0x60012c
4000c1: 00
4000c2: 66 0f 2a 0c 25 2c 01 cvtpi2pd 0x60012c,%xmm1
4000c9: 60 00
4000cb: 66 0f 2a 14 25 2c 01 cvtpi2pd 0x60012c,%xmm2
4000d2: 60 00
4000d4: 66 0f 2a 1c 25 2c 01 cvtpi2pd 0x60012c,%xmm3
4000db: 60 00
4000dd: 66 0f 2a 24 25 2c 01 cvtpi2pd 0x60012c,%xmm4
4000e4: 60 00
4000e6: 66 0f 2a 2c 25 2c 01 cvtpi2pd 0x60012c,%xmm5
4000ed: 60 00
4000ef: 66 0f 2a 34 25 2c 01 cvtpi2pd 0x60012c,%xmm6
4000f6: 60 00
4000f8: 66 0f 2a 3c 25 2c 01 cvtpi2pd 0x60012c,%xmm7
4000ff: 60 00
400101: b8 00 00 00 80 mov $0x80000000,%eax
400106: 66 0f 10 cb movupd %xmm3,%xmm1
40010a: 66 0f 10 d5 movupd %xmm5,%xmm2
40010e: 66 0f 5e ca divpd %xmm2,%xmm1
400112: 66 0f 58 e1 addpd %xmm1,%xmm4
400116: 66 0f 10 f7 movupd %xmm7,%xmm6
40011a: 48 ff c8 dec %rax
40011d: 75 e7 jne 0x400106
40011f: b8 3c 00 00 00 mov $0x3c,%eax
400124: bf 00 00 00 00 mov $0x0,%edi
400129: 0f 05 syscall
并没有 i7-5600 这个型号。我想你指的是 i7-5600U,这是一款低功耗(15W TDP)CPU,基础频率 2.6GHz,睿频 3.2GHz。
你能再确认一下这个结果是否可以复现吗?请确保两次测试中 CPU 的时钟频率保持一致,因为在除法单元一直繁忙的情况下,你的低功耗 CPU 可能无法以全速运行。
使用性能计数器(例如 `perf stat ./a.out`)进行测试、测量核心时钟周期,可能也会有帮助。(不是“参考”周期,你要统计的是时钟实际运行的真实周期数。)
IACA 仅支持到 Haswell。对于这两个循环,它给出的结论都只是每次迭代 14c,瓶颈在除法器(divider)吞吐量上。(Agner Fog 的指令表中列有 `divpd` 吞吐量的实测值。)
有一个类似的问题,但那个问题讨论的是前端饱和的情况。
这个循环应该完全受限于 `divpd` 的吞吐量。如果这个效应是真实的,我唯一能想到的解释是:其中一条 `movupd` 指令并不总是被消除(mov-elimination),有时会从 `divpd` 那里抢走端口 p0 的一个周期。
循环中的三个未融合域(unfused-domain)uop 都在不同的端口上执行,因此它们不会互相拖延。(`divpd` 在 p0 上,`addpd` 在 p1 上,被预测为跳转的 `cmp/jcc` 在 p6 上。)
事实上,就连这个理论也站不住脚。在 Broadwell 上,未被消除的 `movaps xmm,xmm` 使用的是端口 5。我猜 `movupd xmm,xmm` 这种奇怪的选择同样会解码为一个端口 5 的 uop。(Agner Fog 甚至没有为 `movups`/`movupd` 的寄存器到寄存器形式列出条目,因为大家总是使用 `movaps`;或者使用 `movapd`——如果他们喜欢让指令类型与数据类型保持一致的话,尽管它要长一个字节,而且现有的微架构在旁路延迟上都不区分 s 和 d,只区分用于 float/double 的 `movaps` 和用于整数的 `movdqa`。)
有趣的是,我的 2.4GHz E6600(Conroe/Merom 微架构)运行你的循环只用了 4.5 秒。Agner Fog 的表格列出 Merom 上 `divpd` 的吞吐量为每 5–31c 一条;`1.0/1.0` 大概只需要 5c。Sandybridge 的最佳情况除法速度明显慢于 Nehalem,直到 Skylake,最佳情况吞吐量才重新回到与 Merom 一样快的水平。(通过
; Region markers: each expands to `mov ebx, <id>` followed by the three raw
; bytes 0x64 0x67 0x90. IACA_start uses id 111, IACA_end uses id 222.
; Clobbers: ebx.
; NOTE(review): presumably the byte patterns Intel's IACA analyzer scans
; for to delimit the analysed region — confirm against the IACA headers.
%macro IACA_marker 1
        mov     ebx, %1                 ; marker id
        db      0x64, 0x67, 0x90        ; fixed marker byte sequence
%endmacro

%macro IACA_start 0
        IACA_marker 111
%endmacro

%macro IACA_end 0
        IACA_marker 222
%endmacro
; Loop body with a single register copy per pass: only xmm1, the register
; that divpd overwrites, is reloaded; xmm2 is used directly as the divisor.
.loop:
movapd xmm1, xmm3 ; replace the only operand that div writes
divpd xmm1, xmm2 ; xmm1 = xmm1 / xmm2 (xmm2 is only read)
addpd xmm4, xmm1 ; xmm4 += quotient
dec eax ; count down the loop counter held in eax
jnz .loop ; repeat until the counter reaches zero
; AVX three-operand form: the quotient is written to a separate destination
; register, so neither source is overwritten and no register copy is needed.
vdivpd xmm0, xmm1, xmm2 ; xmm0 = xmm1 / xmm2
vaddpd xmm4, xmm4, xmm0 ; xmm4 = xmm4 + xmm0
cmp/jcc