Why do identical copies of the same C loop in the same program take significantly but consistently different times to execute?

Tags: c, loops, assembly, intel, memory-alignment

I've hopefully reduced my problem to a simple and reproducible test case. The source consists of 10 copies of an identical simple loop. Each loop has the form:

#define COUNT (1000 * 1000 * 1000)
volatile uint64_t counter = 0;

void loopN(void) {
  for (int j = COUNT; j != 0; j--) {
    uint64_t val = counter;
    val = val + 1;
    counter = val;
  }
  return;
}
The 'volatile' on the variable is important, since it forces the value to be read from and written to memory on every iteration. Each loop is aligned to 64 bytes with '-falign-loops=64' and produces identical assembly, apart from the relative offset to the global counter:

   400880:       48 8b 15 c1 07 20 00    mov    0x2007c1(%rip),%rdx  # 601048 <counter>
   400887:       48 83 c2 01             add    $0x1,%rdx
   40088b:       83 e8 01                sub    $0x1,%eax
   40088e:       48 89 15 b3 07 20 00    mov    %rdx,0x2007b3(%rip)  # 601048 <counter>
   400895:       75 e9                   jne    400880 <loop8+0x20>
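
As an aside on the volatile qualifier (a hedged illustration of my own, not part of the original test case): without volatile, gcc at -O3 is allowed to keep the counter in a register or collapse the whole loop into a single addition, leaving nothing to measure:

#include <stdint.h>

#define COUNT (1000 * 1000 * 1000)

uint64_t plain_counter = 0;   /* note: no volatile */

void loop_plain(void) {
  /* Without volatile the compiler may legally rewrite this loop as
     "plain_counter += COUNT;", i.e. one add and no per-iteration
     load/store, so the memory traffic being timed would vanish. */
  for (int j = COUNT; j != 0; j--) {
    plain_counter = plain_counter + 1;
  }
}
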
I'm also using __attribute__((noinline)) in the source to keep the assembly cleaner, but that isn't required to observe the problem. I find the fastest and slowest functions with a shell loop:

for n in 0 1 2 3 4 5 6 7 8 9; 
do echo same-function ${n}:; 
/usr/bin/time -f "%e seconds" same-function ${n}; 
/usr/bin/time -f "%e seconds" same-function ${n}; 
/usr/bin/time -f "%e seconds" same-function ${n}; 
done
It produces results that are consistent to about 1% from run to run, with the exact numbers of the fastest and slowest functions varying depending on the exact binary layout:

same-function 0:
2.08 seconds
2.04 seconds
2.06 seconds
same-function 1:
2.12 seconds
2.12 seconds
2.12 seconds
same-function 2:
2.10 seconds
2.14 seconds
2.11 seconds
same-function 3:
2.04 seconds
2.04 seconds
2.05 seconds
same-function 4:
2.05 seconds
2.00 seconds
2.03 seconds
same-function 5:
2.07 seconds
2.07 seconds
1.98 seconds
same-function 6:
1.83 seconds
1.83 seconds
1.83 seconds
same-function 7:
1.95 seconds
1.98 seconds
1.95 seconds
same-function 8:
1.86 seconds
1.88 seconds
1.86 seconds
same-function 9:
2.04 seconds
2.04 seconds
2.02 seconds
In this case we see that loop2() is one of the slowest to execute and loop6() one of the fastest, with a difference of just over 10% between them. We confirm this again by testing the two cases repeatedly with a different method:

nate@haswell$ N=2; for i in {1..10}; do perf stat same-function $N 2>&1 | grep GHz; done
     7,180,104,866 cycles                    #    3.391 GHz
     7,169,930,711 cycles                    #    3.391 GHz
     7,150,190,394 cycles                    #    3.391 GHz
     7,188,959,096 cycles                    #    3.391 GHz
     7,177,272,608 cycles                    #    3.391 GHz
     7,093,246,955 cycles                    #    3.391 GHz
     7,210,636,865 cycles                    #    3.391 GHz
     7,239,838,211 cycles                    #    3.391 GHz
     7,172,716,779 cycles                    #    3.391 GHz
     7,223,252,964 cycles                    #    3.391 GHz

nate@haswell$ N=6; for i in {1..10}; do perf stat same-function $N 2>&1 | grep GHz; done
     6,234,770,361 cycles                    #    3.391 GHz
     6,199,096,296 cycles                    #    3.391 GHz
     6,213,348,126 cycles                    #    3.391 GHz
     6,217,971,263 cycles                    #    3.391 GHz
     6,224,779,686 cycles                    #    3.391 GHz
     6,194,117,897 cycles                    #    3.391 GHz
     6,225,259,274 cycles                    #    3.391 GHz
     6,244,391,509 cycles                    #    3.391 GHz
     6,189,972,381 cycles                    #    3.391 GHz
     6,205,556,306 cycles                    #    3.391 GHz
Taking this into consideration, we reread every word of every Intel architecture manual, sifted through every page on the entire web that mentions 'computer' or 'programming', and meditated alone on a mountaintop for 6 years. Failing to achieve any form of enlightenment, we come down to civilization, shave our beard, bathe, and ask the experts:

What can possibly be happening here?

Edit: With Benjamin's help (see his answer below) I've come up with an even more minimal test case. It's a standalone 20-line assembly program. Changing from SUB to SBB causes a 15% performance difference, even though the result stays the same and the same number of instructions is executed. Explanations? I think I'm getting closer.

; Minimal example, see also http://stackoverflow.com/q/26266953/3766665
; To build (Linux):
;   nasm -felf64 func.asm
;   ld func.o
; Then run:
;   perf stat -r10 ./a.out
; On Haswell and Sandy Bridge, observed runtime varies 
; ~15% depending on whether sub or sbb is used in the loop
section .text
global _start
_start:
  push qword 0h       ; put counter variable on stack
  jmp loop            ; jump to function
align 64              ; function alignment.
loop:
  mov rcx, 1000000000
align 64              ; loop alignment.
l:
  mov rax, [rsp]
  add rax, 1h
  mov [rsp], rax
; sbb rcx, 1h         ; which is faster: sbb or sub?
  sub rcx, 1h         ; switch, time it, and find out
  jne l               ; (rot13 spoiler: foo vf snfgre ol 15%)
fin:                  ; If that was too easy, explain why.
  mov eax, 60
  xor edi, edi        ; End of program. Exit with code 0
  syscall
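
If you would rather experiment with this from C instead of nasm/ld, here is a rough equivalent of my own using GCC extended inline asm (a hedged sketch, not from the original post; the local variable stands in for the stack slot):

/* Build with e.g. "gcc -O2 this-file.c" and time with "perf stat -r10 ./a.out". */
#include <stdint.h>

int main(void) {
  uint64_t counter = 0;            /* the memory location being hammered */
  uint64_t n = 1000000000;         /* iteration count, as in the .asm */
  __asm__ volatile(
      ".p2align 6          \n\t"   /* 64-byte align the loop */
      "1:                  \n\t"
      "mov  %[c], %%rax    \n\t"   /* load counter  */
      "add  $1, %%rax      \n\t"   /* increment     */
      "mov  %%rax, %[c]    \n\t"   /* store counter */
      "sub  $1, %[n]       \n\t"   /* swap for "sbb  $1, %[n]" to compare */
      "jne  1b             \n\t"
      : [c] "+m"(counter), [n] "+r"(n)
      :
      : "rax", "cc");
  return 0;
}
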

A few years ago I would have told you to check for any difference in the internal state of the CPU when it reaches each of these loops; this was known to have a profound impact on the out-of-order prediction process (or something like that). For example, the performance of the very same loop could change by up to 15-20% depending on what the CPU was doing just before entering it, and merely jumping to it from two different points could be enough to change the execution speed.

In your case this is easy to test: all you have to do is change the order of the calls in the switch block; for example, replace the following:

  switch (firstLetter) {
  case '0': loop0(); break;
  case '1': loop1(); break;
  case '2': loop2(); break;
  case '3': loop3(); break;
  case '4': loop4(); break;
  case '5': loop5(); break;
  case '6': loop6(); break;
  case '7': loop7(); break;
  case '8': loop8(); break;
  case '9': loop9(); break;
  default: goto die_usage;
  }
with:

or any arbitrary order. Of course, you should check the generated assembly to make sure the compiler has not reordered the calls itself.


Also, since the loops live in individual functions, you should make sure that the functions themselves are aligned on 64-byte boundaries.
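
As a hedged aside of my own: instead of the global -falign-functions flag, GCC also lets you request the alignment per function with an attribute, which can be handy when experimenting on a single loop (sketch below; loop0_aligned is a hypothetical name, and the extern declaration refers to the global counter from same-function.c):

#include <stdint.h>

extern volatile uint64_t counter;        /* the global from same-function.c */

__attribute__((noinline, aligned(64)))   /* align this function's first instruction to 64 bytes */
void loop0_aligned(void) {
  for (int j = 1000 * 1000 * 1000; j != 0; j--) {
    counter = counter + 1;               /* volatile: one load and one store per iteration */
  }
}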

Looking at the full perf stat output, you will see that it is not the instruction count that varies but the number of stalled cycles.

Looking at the disassembly, I found two things:

  • The offset of the counter variable differs from function to function. Making the counter local to each function does not make the behavior go away, though.
  • The functions do not lie on 64-byte boundaries, so they may span different numbers of cache lines. Compiling with -falign-functions=64 does make the difference almost completely disappear.
  • Testing on my machine after the above change then yields:

    $ for f in $(seq 7); do perf stat -e cycles -r3 ./same-function $f 2>&1; done | grep cycles
         6,070,933,420      cycles                     ( +-  0.11% )
         6,052,771,142      cycles                     ( +-  0.06% )
         6,099,676,333      cycles                     ( +-  0.07% )
         6,092,962,697      cycles                     ( +-  0.16% )
         6,151,861,993      cycles                     ( +-  0.69% )
         6,074,323,033      cycles                     ( +-  0.36% )
         6,174,434,653      cycles                     ( +-  0.65% )

    I don't have much insight into the nature of the stalls you found, though.

    Edit: I made the counter a volatile variable local to each function, tested different compilations on my i7-3537U, and found '-falign-loops=64' to actually be the slowest:

    $ gcc  -std=gnu99 -O3 -Wall -Wextra same-function.c -o same-function
    $ gcc -falign-loops=64 -std=gnu99 -O3 -Wall -Wextra same-function.c -o same-function-l64
    $ gcc -falign-functions=64 -std=gnu99 -O3 -Wall -Wextra same-function.c -o same-function-f64
    $ for prog in same-function{,-l64,-f64}; do echo $prog; for f in $(seq 7); do perf stat -e cycles -r10 ./$prog $f 2>&1; done|grep cycl; done
    same-function
         6,079,966,292      cycles                     ( +-  0.19% )
         7,419,053,569      cycles                     ( +-  0.07% )
         6,136,061,105      cycles                     ( +-  0.27% )
         7,282,434,896      cycles                     ( +-  0.74% )
         6,104,866,406      cycles                     ( +-  0.16% )
         7,342,985,942      cycles                     ( +-  0.52% )
         6,208,373,040      cycles                     ( +-  0.50% )
    same-function-l64
         7,336,838,175      cycles                     ( +-  0.46% )
         7,358,913,923      cycles                     ( +-  0.52% )
         7,412,570,515      cycles                     ( +-  0.38% )
         7,435,048,756      cycles                     ( +-  0.10% )
         7,404,834,458      cycles                     ( +-  0.34% )
         7,291,095,582      cycles                     ( +-  0.99% )
         7,312,052,598      cycles                     ( +-  0.95% )
    same-function-f64
         6,103,059,996      cycles                     ( +-  0.12% )
         6,116,601,533      cycles                     ( +-  0.29% )
         6,120,841,824      cycles                     ( +-  0.18% )
         6,114,278,098      cycles                     ( +-  0.09% )
         6,105,938,586      cycles                     ( +-  0.14% )
         6,101,672,717      cycles                     ( +-  0.19% )
         6,121,339,944      cycles                     ( +-  0.11% )
    
    More details on aligned loops vs. aligned functions:

    $ for prog in same-function{-l64,-f64}; do sudo perf stat -d -r10 ./$prog 0; done
    
     Performance counter stats for './same-function-l64 0' (10 runs):
    
           2396.608194      task-clock:HG (msec)      #    1.001 CPUs utilized            ( +-  0.64% )
                    56      context-switches:HG       #    0.024 K/sec                    ( +-  5.51% )
                     1      cpu-migrations:HG         #    0.000 K/sec                    ( +- 74.78% )
                    46      page-faults:HG            #    0.019 K/sec                    ( +-  0.63% )
         7,331,450,530      cycles:HG                 #    3.059 GHz                      ( +-  0.51% ) [85.68%]
         5,332,248,218      stalled-cycles-frontend:HG #   72.73% frontend cycles idle     ( +-  0.71% ) [71.42%]
       <not supported>      stalled-cycles-backend:HG
         5,000,800,933      instructions:HG           #    0.68  insns per cycle
                                                      #    1.07  stalled cycles per insn  ( +-  0.04% ) [85.73%]
         1,000,446,303      branches:HG               #  417.443 M/sec                    ( +-  0.04% ) [85.75%]
                 8,461      branch-misses:HG          #    0.00% of all branches          ( +-  6.05% ) [85.76%]
       <not supported>      L1-dcache-loads:HG
                45,593      L1-dcache-load-misses:HG  #    0.00% of all L1-dcache hits    ( +-  3.61% ) [85.77%]
                 6,148      LLC-loads:HG              #    0.003 M/sec                    ( +-  8.80% ) [71.36%]
       <not supported>      LLC-load-misses:HG
    
           2.394456699 seconds time elapsed                                          ( +-  0.64% )
    
    
     Performance counter stats for './same-function-f64 0' (10 runs):
    
           1998.936383      task-clock:HG (msec)      #    1.001 CPUs utilized            ( +-  0.61% )
                    60      context-switches:HG       #    0.030 K/sec                    ( +- 17.77% )
                     1      cpu-migrations:HG         #    0.001 K/sec                    ( +- 47.86% )
                    46      page-faults:HG            #    0.023 K/sec                    ( +-  0.68% )
         6,107,877,836      cycles:HG                 #    3.056 GHz                      ( +-  0.34% ) [85.63%]
         4,112,602,649      stalled-cycles-frontend:HG #   67.33% frontend cycles idle     ( +-  0.52% ) [71.41%]
       <not supported>      stalled-cycles-backend:HG
         5,000,910,172      instructions:HG           #    0.82  insns per cycle
                                                      #    0.82  stalled cycles per insn  ( +-  0.01% ) [85.72%]
         1,000,423,026      branches:HG               #  500.478 M/sec                    ( +-  0.02% ) [85.77%]
                10,660      branch-misses:HG          #    0.00% of all branches          ( +- 13.23% ) [85.80%]
       <not supported>      L1-dcache-loads:HG
                47,492      L1-dcache-load-misses:HG  #    0.00% of all L1-dcache hits    ( +- 14.82% ) [85.80%]
                11,719      LLC-loads:HG              #    0.006 M/sec                    ( +- 42.44% ) [71.28%]
       <not supported>      LLC-load-misses:HG
    
           1.997319759 seconds time elapsed                                          ( +-  0.62% )
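
    A quick back-of-the-envelope from these two runs (my arithmetic, not in the original answer): both builds retire about 5.0e9 instructions, i.e. 5 per iteration over 1e9 iterations, so the entire gap is in cycles per iteration:

        7,331,450,530 cycles / 1e9 iterations ≈ 7.33 cycles/iteration   (same-function-l64)
        6,107,877,836 cycles / 1e9 iterations ≈ 6.11 cycles/iteration   (same-function-f64)
        difference ≈ 1.22 cycles/iteration ≈ (5.33e9 - 4.11e9) extra stalled-cycles-frontend / 1e9

    In other words, essentially all of the slowdown shows up as roughly one extra frontend stall cycle per iteration.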
    

    But I haven't found the culprit. Still, I figured it might help to note it down here for you.

    Edit 2: Here is my patch to same-function.c:

    $ git diff -u -U0
    diff --git a/same-function.c b/same-function.c
    index f78449e..78a5772 100644
    --- a/same-function.c
    +++ b/same-function.c
    @@ -20 +20 @@ done
    -volatile uint64_t counter = 0;
    +//volatile uint64_t counter = 0;
    @@ -22,0 +23 @@ COMPILER_NO_INLINE void loop0(void) {
    +volatile uint64_t counter = 0;
    @@ -31,0 +33 @@ COMPILER_NO_INLINE void loop1(void) {
    +volatile uint64_t counter = 0;
    @@ -40,0 +43 @@ COMPILER_NO_INLINE void loop2(void) {
    +volatile uint64_t counter = 0;
    @@ -49,0 +53 @@ COMPILER_NO_INLINE void loop3(void) {
    +volatile uint64_t counter = 0;
    @@ -58,0 +63 @@ COMPILER_NO_INLINE void loop4(void) {
    +volatile uint64_t counter = 0;
    @@ -67,0 +73 @@ COMPILER_NO_INLINE void loop5(void) {
    +volatile uint64_t counter = 0;
    @@ -76,0 +83 @@ COMPILER_NO_INLINE void loop6(void) {
    +volatile uint64_t counter = 0;
    @@ -85,0 +93 @@ COMPILER_NO_INLINE void loop7(void) {
    +volatile uint64_t counter = 0;
    @@ -94,0 +103 @@ COMPILER_NO_INLINE void loop8(void) {
    +volatile uint64_t counter = 0;
    @@ -103,0 +113 @@ COMPILER_NO_INLINE void loop9(void) {
    +volatile uint64_t counter = 0;
    @@ -135 +145 @@ int main(int argc, char** argv) {
    -}
    \ No newline at end of file
    +}
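
    For reference, here is a sketch of what loop0 presumably looks like after this patch (my reconstruction, assuming COMPILER_NO_INLINE expands to __attribute__((noinline)) as described in the question):

    COMPILER_NO_INLINE void loop0(void) {
      volatile uint64_t counter = 0;   /* now local to the function */
      for (int j = COUNT; j != 0; j--) {
        uint64_t val = counter;
        val = val + 1;
        counter = val;
      }
      return;
    }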
    
    Edit 3: An even simpler example of the same thing:

    ; Minimal example, see also http://stackoverflow.com/q/26266953/3766665 
    ; To build (Linux):
    ;   nasm -felf64 func.asm
    ;   ld func.o
    ; Then run:
    ;   perf stat -r10 ./a.out
    ; Runtime varies ~10% depending on whether the align directives are commented out
    section .text
    global _start
    _start:
      push qword 0h       ; put counter variable on stack
      jmp loop            ; jump to function
    ;align 64             ; function alignment. Try commenting this
    loop:
      mov rcx, 1000000000
    ;align 64             ; loop alignment. Try commenting this
    l:
      mov rax, [rsp]
      add rax, 1h
      mov [rsp], rax
      sub rcx, 1h
      jne l
    fin:                  ; End of program. Exit with code 0
      mov eax, 60
      xor edi, edi
      syscall
    
    The same effect shows up here, too. Interesting.

    Cheers,
    Benjamin

    How do you account for background processes interrupting your program? (I'm not too familiar with Linux, but on Windows this is a big problem when trying to time simple but long-running code.) When a thread is put to sleep, it spends an unknown interval not running while the timer may keep ticking.
    This doesn't look like a processor-contention problem. That could explain differences between runs with the same N, but not the consistent, statistically significant differences between runs with different N. For example, what are the address alignments of loop6 and loop1? Are they both 32-byte aligned? Do they have the same overall placement? (See the small helper sketched after the perf commands below.)
    @NathanKurz Have you also disabled turbo boost? You need to make sure the CPU is not scaling its frequency dynamically. Also, have you tried changing the order in which the loops are executed? If you run the process 10 times (i.e., the program runs one loop per invocation), the answer is cache state. If you repeat the loop inside
    $ for prog in same-function{-l64,-f64}; do sudo perf stat -eL1-{d,i}cache-load-misses,L1-dcache-store-misses,cs,cycles,instructions -r10 ./$prog 0; done
    
    $ sudo perf record -F25000 -e'{cycles:pp,stalled-cycles-frontend}' ./same-function-l64 0
    [ perf record: Woken up 28 times to write data ]
    [ perf record: Captured and wrote 6.771 MB perf.data (~295841 samples) ]
    $ sudo perf report --group -Sloop0 -n --show-total-period --stdio
    $ sudo perf annotate --group -sloop0  --stdio
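
    To answer the comment above about the address alignment of loop6 vs. loop1, here is a small helper of my own (a hedged sketch: print_alignment is a hypothetical function to drop into same-function.c; objdump -d would show the same information):

    #include <stdio.h>
    #include <stdint.h>

    /* Call from main() in same-function.c, e.g.
       print_alignment("loop6", loop6); print_alignment("loop1", loop1); */
    void print_alignment(const char *name, void (*fn)(void)) {
      uintptr_t a = (uintptr_t)fn;
      /* a & -a isolates the lowest set bit of the address, i.e. the
         largest power of two dividing it -- its natural alignment. */
      printf("%s is at %p, aligned to %lu bytes\n",
             name, (void *)a, (unsigned long)(a & -a));
    }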
    