Warning: file_get_contents(/data/phpspider/zhask/data//catemap/0/performance/5.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
英特尔编译器生成的代码比MSVC慢68%(提供了完整示例) < >我有一个C++代码处理三个连续的值从一个1800个元素数组。ICC 14.0编译的代码比MSVC生成的代码慢约68%(1600对2700个CPU周期)。我不明白为什么。有人能帮忙吗?即使我设置英特尔编译器-O3开关,它也不会改变计时。CPU是常春藤桥 #include <iostream> int main(){ int data[1200]; //Dummy-populate data for(int y=0; y<1200; y++){ data[y] = y/2 + 7; } int counter = 0; //Just to repeat the test while(counter < 10000){ int Accum = 0; long long start = 0; long long end = 0; int p = 0; start = __rdtsc(); while(p < 1200){ unsigned int level1 = data[p]; unsigned int factor = data[p + 1]; Accum += (level1 * factor); p = p + 2; } end = __rdtsc(); std::cout << (end - start) << " " << Accum << std::endl; counter++; } } #包括 int main(){ 国际数据[1200]; //虚拟填充数据 对于(int y=0;y_C++_Performance_Optimization_Cpu_Icc - Fatal编程技术网

英特尔编译器生成的代码比MSVC慢68%(提供了完整示例) < >我有一个C++代码处理三个连续的值从一个1800个元素数组。ICC 14.0编译的代码比MSVC生成的代码慢约68%(1600对2700个CPU周期)。我不明白为什么。有人能帮忙吗?即使我设置英特尔编译器-O3开关,它也不会改变计时。CPU是常春藤桥 #include <iostream> int main(){ int data[1200]; //Dummy-populate data for(int y=0; y<1200; y++){ data[y] = y/2 + 7; } int counter = 0; //Just to repeat the test while(counter < 10000){ int Accum = 0; long long start = 0; long long end = 0; int p = 0; start = __rdtsc(); while(p < 1200){ unsigned int level1 = data[p]; unsigned int factor = data[p + 1]; Accum += (level1 * factor); p = p + 2; } end = __rdtsc(); std::cout << (end - start) << " " << Accum << std::endl; counter++; } } #包括 int main(){ 国际数据[1200]; //虚拟填充数据 对于(int y=0;y

英特尔编译器生成的代码比MSVC慢68%(提供了完整示例) < >我有一个C++代码处理三个连续的值从一个1800个元素数组。ICC 14.0编译的代码比MSVC生成的代码慢约68%(1600对2700个CPU周期)。我不明白为什么。有人能帮忙吗?即使我设置英特尔编译器-O3开关,它也不会改变计时。CPU是常春藤桥 #include <iostream> int main(){ int data[1200]; //Dummy-populate data for(int y=0; y<1200; y++){ data[y] = y/2 + 7; } int counter = 0; //Just to repeat the test while(counter < 10000){ int Accum = 0; long long start = 0; long long end = 0; int p = 0; start = __rdtsc(); while(p < 1200){ unsigned int level1 = data[p]; unsigned int factor = data[p + 1]; Accum += (level1 * factor); p = p + 2; } end = __rdtsc(); std::cout << (end - start) << " " << Accum << std::endl; counter++; } } #包括 int main(){ 国际数据[1200]; //虚拟填充数据 对于(int y=0;y,c++,performance,optimization,cpu,icc,C++,Performance,Optimization,Cpu,Icc,ICC在这里很糟糕,因为它正在计算每个data[n]access alamov edi,dword ptr[rsp+rax*4+44h]…所有这些运行时乘法都很昂贵。您应该能够通过重新编码来避免它,因此索引是常量(也可以使用*p_data++三次,但这会导致排序问题,可能会对性能产生不利影响) for(无符号*p_数据=&data[0],*p_end=data+1800;p_数据

ICC在这里表现很糟糕,因为它要为每次 data[n] 访问单独计算地址,例如 mov edi,dword ptr [rsp+rax*4+44h] ……所有这些运行时乘法的开销都很大。您应该可以通过改写代码来避免这一点,让索引成为常量(也可以连续使用三次 *p_data++,但这会引入求值顺序上的依赖,可能对性能产生不利影响):

for (unsigned* p_data = &data[0], *p_end = data + 1800; p_data < …(其余代码在抓取时被截断)
user997112,我测试了您的新代码(只有 level1 和 Accum 的版本),使用了 -O3 选项(-march=native -mtune=native 可能也会对您有帮助)。我把 Core 2 Q6600 的频率固定在 2.4 GHz,最佳结果是 gcc 为 1800,icc 为 1900。

这是我的测试版本(rdtsc() 用 GNU 内联汇编重新定义,每次运行的耗时保存在数组中,只打印最小(最佳)运行时间):

$ cat my.cc
#include <iostream>

#if 1
// my cpu has no rdtscp, so use asm
// Reads the CPU time-stamp counter and returns it as a single 64-bit value.
// This first line is a forward declaration carrying the GNU always_inline
// attribute; the definition follows immediately.
inline unsigned long long rdtsc() __attribute__((always_inline));
inline unsigned long long rdtsc() {
  unsigned int lo, hi;
  // cpuid is executed first with EAX = 0 (the "a"(0) input). It is a
  // serializing instruction per the Intel SDM, presumably used here so that
  // earlier instructions cannot drift past the timestamp read.
  // rdtsc then leaves the low 32 bits of the counter in EAX and the high
  // 32 bits in EDX, which the output constraints bind to lo and hi.
  // cpuid also overwrites EBX and ECX, hence the clobber list.
  asm volatile (
     "cpuid \n"
     "rdtsc" 
   : "=a"(lo), "=d"(hi) /* outputs */
   : "a"(0)             /* inputs */
   : "%ebx", "%ecx");     /* clobbers*/
  // Combine the halves: hi supplies bits 63..32, lo supplies bits 31..0.
  return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}
#else
#define rdtsc __rdtsc
#endif

// Benchmark driver. Repeats the timed multiply-accumulate loop kRuns times,
// records the rdtsc() cycle delta and the loop result of every run, then
// prints each new best (smallest) delta with its accumulator value, followed
// by the overall best delta on the last line.
int main(){
        const int kDataSize = 1200;   // number of ints in the input array
        const int kRuns = 10000;      // how many times the measurement is repeated

        int data[kDataSize];
        int dummy[kRuns];             // Accum of every run; printed below so the result is used
        long long stats[kRuns] = {};  // per-run cycle delta, kept at full 64-bit width

        //Dummy-populate data
        for(int y=0; y<kDataSize; y++){
            data[y] = y/2 + 7;
        }

        //Just to repeat the test
        for(int counter=0; counter<kRuns; counter++){
            int Accum = 0;

            const long long start = rdtsc();

            // Measured loop, kept exactly as in the code being benchmarked:
            // elements are read pairwise as unsigned values, multiplied, and
            // added to the signed accumulator.
            for(int p=0; p<kDataSize; p+=2){
                unsigned int level1 = data[p];
                unsigned int factor = data[p + 1];
                Accum += (level1 * factor);
            }

            const long long end = rdtsc();

            // Store the delta without narrowing it to int.
            stats[counter] = end - start;
            dummy[counter] = Accum;
        }

        // Seed the minimum from the first sample instead of a fixed sentinel,
        // so the reported best time is always a value that was really measured.
        long long min = stats[0];
        for(int y=0; y<kRuns; y++){
            if(y == 0 || stats[y] < min){
                min = stats[y];
                std::cout << min << '\n';
                std::cout << "accum " << dummy[y] << '\n';
            }
        }
        std::cout << min << std::endl;
}
结果(已禁用CPU变频,频率固定在 2.4 GHz;进程通过 taskset 绑定到单个核心,并用 Linux PMU 访问工具 perf 测量):

因此,我们可以看到,gcc需要更多的指令来处理相同数量的数据,但它也实现了更好的IPC(每时钟指令数)速率

gcc中的内部循环有一个简单的汇编代码:

  4009b9:       45 31 c0                xor    %r8d,%r8d
  4009bc:       45 31 c9                xor    %r9d,%r9d
  4009bf:       90                      nop
  4009c0:       44 89 c8                mov    %r9d,%eax
  4009c3:       0f a2                   cpuid  
  4009c5:       0f 31                   rdtsc  
  4009c7:       49 89 d2                mov    %rdx,%r10
  4009ca:       89 c0                   mov    %eax,%eax
  4009cc:       48 89 e2                mov    %rsp,%rdx
  4009cf:       49 c1 e2 20             shl    $0x20,%r10
  4009d3:       31 ff                   xor    %edi,%edi
  4009d5:       49 09 c2                or     %rax,%r10
  4009d8:       0f 1f 84 00 00 00 00    nopl   0x0(%rax,%rax,1)
  4009df:       00 

   vvvv
  4009e0:       8b 4a 04                mov    0x4(%rdx),%ecx
  4009e3:       48 83 c2 08             add    $0x8,%rdx
  4009e7:       0f af 4a f8             imul   -0x8(%rdx),%ecx
  4009eb:       48 39 d5                cmp    %rdx,%rbp
  4009ee:       8d 34 39                lea    (%rcx,%rdi,1),%esi
  4009f1:       89 f7                   mov    %esi,%edi
  4009f3:       75 eb                   jne    4009e0 <main+0x90>
   ^^^^

  4009f5:       44 89 c8                mov    %r9d,%eax
  4009f8:       0f a2                   cpuid  
  4009fa:       0f 31                   rdtsc  
  4009b9:       45 31 c0                xor    %r8d,%r8d
  4009bc:       45 31 c9                xor    %r9d,%r9d
  4009bf:       90                      nop
  4009c0:       44 89 c8                mov    %r9d,%eax
  4009c3:       0f a2                   cpuid  
  4009c5:       0f 31                   rdtsc  
  4009c7:       49 89 d2                mov    %rdx,%r10
  4009ca:       89 c0                   mov    %eax,%eax
  4009cc:       48 89 e2                mov    %rsp,%rdx
  4009cf:       49 c1 e2 20             shl    $0x20,%r10
  4009d3:       31 ff                   xor    %edi,%edi
  4009d5:       49 09 c2                or     %rax,%r10
  4009d8:       0f 1f 84 00 00 00 00    nopl   0x0(%rax,%rax,1)
  4009df:       00 

   vvvv
  4009e0:       8b 4a 04                mov    0x4(%rdx),%ecx
  4009e3:       48 83 c2 08             add    $0x8,%rdx
  4009e7:       0f af 4a f8             imul   -0x8(%rdx),%ecx
  4009eb:       48 39 d5                cmp    %rdx,%rbp
  4009ee:       8d 34 39                lea    (%rcx,%rdi,1),%esi
  4009f1:       89 f7                   mov    %esi,%edi
  4009f3:       75 eb                   jne    4009e0 <main+0x90>
   ^^^^

  4009f5:       44 89 c8                mov    %r9d,%eax
  4009f8:       0f a2                   cpuid  
  4009fa:       0f 31                   rdtsc  
来自icc的重SSE2/展开(循环的一部分,1184次迭代,被矢量化,尾部在循环后处理):

  400e4c:       33 c9                   xor    %ecx,%ecx
  400e4e:       49 89 cd                mov    %rcx,%r13
  400e51:       33 c0                   xor    %eax,%eax
  400e53:       0f a2                   cpuid  
  400e55:       0f 31                   rdtsc  
  400e57:       66 0f ef c9             pxor   %xmm1,%xmm1
  400e5b:       66 0f 6f 05 7d 2f 00    movdqa 0x2f7d(%rip),%xmm0         
  400e62:       00 
  400e63:       41 89 c4                mov    %eax,%r12d
  400e66:       33 c0                   xor    %eax,%eax

   vvvv
  400e68:       66 0f 6f 9c c4 80 38    movdqa 0x13880(%rsp,%rax,8),%xmm3
  400e6f:       01 00 
  400e71:       66 0f 6f 94 c4 90 38    movdqa 0x13890(%rsp,%rax,8),%xmm2
  400e78:       01 00 
  400e7a:       66 0f 6f f3             movdqa %xmm3,%xmm6
  400e7e:       66 0f 62 f2             punpckldq %xmm2,%xmm6
  400e82:       66 0f 6a da             punpckhdq %xmm2,%xmm3
  400e86:       66 0f 6f fe             movdqa %xmm6,%xmm7
  400e8a:       66 0f 62 fb             punpckldq %xmm3,%xmm7
  400e8e:       66 0f 6f ac c4 a0 38    movdqa 0x138a0(%rsp,%rax,8),%xmm5
  400e95:       01 00 
  400e97:       66 44 0f 6f d7          movdqa %xmm7,%xmm10
  400e9c:       66 0f 6a f3             punpckhdq %xmm3,%xmm6
  400ea0:       66 44 0f 6f c5          movdqa %xmm5,%xmm8
  400ea5:       66 0f 6f a4 c4 b0 38    movdqa 0x138b0(%rsp,%rax,8),%xmm4
  400eac:       01 00 
  400eae:       66 0f 73 d7 20          psrlq  $0x20,%xmm7
  400eb3:       66 44 0f f4 d6          pmuludq %xmm6,%xmm10
  400eb8:       66 0f 73 d6 20          psrlq  $0x20,%xmm6
  400ebd:       66 0f f4 fe             pmuludq %xmm6,%xmm7
  400ec1:       66 44 0f 6f ac c4 c0    movdqa 0x138c0(%rsp,%rax,8),%xmm13
  400ec8:       38 01 00 
  400ecb:       66 44 0f db d0          pand   %xmm0,%xmm10
  400ed0:       66 44 0f 62 c4          punpckldq %xmm4,%xmm8
  400ed5:       66 45 0f 6f f5          movdqa %xmm13,%xmm14
  400eda:       66 44 0f 6f a4 c4 d0    movdqa 0x138d0(%rsp,%rax,8),%xmm12
  400ee1:       38 01 00 
  400ee4:       66 45 0f 6f c8          movdqa %xmm8,%xmm9
  400ee9:       66 0f 6a ec             punpckhdq %xmm4,%xmm5
  400eed:       66 0f 73 f7 20          psllq  $0x20,%xmm7
  400ef2:       66 0f 6f a4 c4 e0 38    movdqa 0x138e0(%rsp,%rax,8),%xmm4
  400ef9:       01 00 
  400efb:       66 44 0f eb d7          por    %xmm7,%xmm10
  400f00:       66 0f 6f 9c c4 f0 38    movdqa 0x138f0(%rsp,%rax,8),%xmm3
  400f07:       01 00 
  400f09:       66 41 0f fe ca          paddd  %xmm10,%xmm1
$ g++ my.cc -o mygccO3t -O3 -march=native -mtune=native
$ icc my.cc -o myiccO3t -O3 -march=native -mtune=native
$ taskset -c 3 perf stat -e cycles:u,instructions:u ./myiccO3t |tail -n 1 
 Performance counter stats for './myiccO3t':
    23 875 260 cycles:u                 
    28 866 440 instructions:u            #    1,21  insns per cycle        

   0,011297567 seconds time elapsed
1899

$ taskset -c 3 perf stat -e cycles:u,instructions:u ./mygccO3t |tail -n 1 
 Performance counter stats for './mygccO3t':
    22 389 238 cycles:u                 
    43 551 129 instructions:u            #    1,95  insns per cycle        

   0,010683920 seconds time elapsed
1800
  4009b9:       45 31 c0                xor    %r8d,%r8d
  4009bc:       45 31 c9                xor    %r9d,%r9d
  4009bf:       90                      nop
  4009c0:       44 89 c8                mov    %r9d,%eax
  4009c3:       0f a2                   cpuid  
  4009c5:       0f 31                   rdtsc  
  4009c7:       49 89 d2                mov    %rdx,%r10
  4009ca:       89 c0                   mov    %eax,%eax
  4009cc:       48 89 e2                mov    %rsp,%rdx
  4009cf:       49 c1 e2 20             shl    $0x20,%r10
  4009d3:       31 ff                   xor    %edi,%edi
  4009d5:       49 09 c2                or     %rax,%r10
  4009d8:       0f 1f 84 00 00 00 00    nopl   0x0(%rax,%rax,1)
  4009df:       00 

   vvvv
  4009e0:       8b 4a 04                mov    0x4(%rdx),%ecx
  4009e3:       48 83 c2 08             add    $0x8,%rdx
  4009e7:       0f af 4a f8             imul   -0x8(%rdx),%ecx
  4009eb:       48 39 d5                cmp    %rdx,%rbp
  4009ee:       8d 34 39                lea    (%rcx,%rdi,1),%esi
  4009f1:       89 f7                   mov    %esi,%edi
  4009f3:       75 eb                   jne    4009e0 <main+0x90>
   ^^^^

  4009f5:       44 89 c8                mov    %r9d,%eax
  4009f8:       0f a2                   cpuid  
  4009fa:       0f 31                   rdtsc  
  400e4c:       33 c9                   xor    %ecx,%ecx
  400e4e:       49 89 cd                mov    %rcx,%r13
  400e51:       33 c0                   xor    %eax,%eax
  400e53:       0f a2                   cpuid  
  400e55:       0f 31                   rdtsc  
  400e57:       66 0f ef c9             pxor   %xmm1,%xmm1
  400e5b:       66 0f 6f 05 7d 2f 00    movdqa 0x2f7d(%rip),%xmm0         
  400e62:       00 
  400e63:       41 89 c4                mov    %eax,%r12d
  400e66:       33 c0                   xor    %eax,%eax

   vvvv
  400e68:       66 0f 6f 9c c4 80 38    movdqa 0x13880(%rsp,%rax,8),%xmm3
  400e6f:       01 00 
  400e71:       66 0f 6f 94 c4 90 38    movdqa 0x13890(%rsp,%rax,8),%xmm2
  400e78:       01 00 
  400e7a:       66 0f 6f f3             movdqa %xmm3,%xmm6
  400e7e:       66 0f 62 f2             punpckldq %xmm2,%xmm6
  400e82:       66 0f 6a da             punpckhdq %xmm2,%xmm3
  400e86:       66 0f 6f fe             movdqa %xmm6,%xmm7
  400e8a:       66 0f 62 fb             punpckldq %xmm3,%xmm7
  400e8e:       66 0f 6f ac c4 a0 38    movdqa 0x138a0(%rsp,%rax,8),%xmm5
  400e95:       01 00 
  400e97:       66 44 0f 6f d7          movdqa %xmm7,%xmm10
  400e9c:       66 0f 6a f3             punpckhdq %xmm3,%xmm6
  400ea0:       66 44 0f 6f c5          movdqa %xmm5,%xmm8
  400ea5:       66 0f 6f a4 c4 b0 38    movdqa 0x138b0(%rsp,%rax,8),%xmm4
  400eac:       01 00 
  400eae:       66 0f 73 d7 20          psrlq  $0x20,%xmm7
  400eb3:       66 44 0f f4 d6          pmuludq %xmm6,%xmm10
  400eb8:       66 0f 73 d6 20          psrlq  $0x20,%xmm6
  400ebd:       66 0f f4 fe             pmuludq %xmm6,%xmm7
  400ec1:       66 44 0f 6f ac c4 c0    movdqa 0x138c0(%rsp,%rax,8),%xmm13
  400ec8:       38 01 00 
  400ecb:       66 44 0f db d0          pand   %xmm0,%xmm10
  400ed0:       66 44 0f 62 c4          punpckldq %xmm4,%xmm8
  400ed5:       66 45 0f 6f f5          movdqa %xmm13,%xmm14
  400eda:       66 44 0f 6f a4 c4 d0    movdqa 0x138d0(%rsp,%rax,8),%xmm12
  400ee1:       38 01 00 
  400ee4:       66 45 0f 6f c8          movdqa %xmm8,%xmm9
  400ee9:       66 0f 6a ec             punpckhdq %xmm4,%xmm5
  400eed:       66 0f 73 f7 20          psllq  $0x20,%xmm7
  400ef2:       66 0f 6f a4 c4 e0 38    movdqa 0x138e0(%rsp,%rax,8),%xmm4
  400ef9:       01 00 
  400efb:       66 44 0f eb d7          por    %xmm7,%xmm10
  400f00:       66 0f 6f 9c c4 f0 38    movdqa 0x138f0(%rsp,%rax,8),%xmm3
  400f07:       01 00 
  400f09:       66 41 0f fe ca          paddd  %xmm10,%xmm1
  400f0e:       66 44 0f 62 cd          punpckldq %xmm5,%xmm9
  400f13:       48 83 c0 10             add    $0x10,%rax
  400f17:       66 44 0f 6a c5          punpckhdq %xmm5,%xmm8
  400f1c:       66 0f 6f ec             movdqa %xmm4,%xmm5
  400f20:       66 45 0f 62 f4          punpckldq %xmm12,%xmm14
  400f25:       66 45 0f 6f d9          movdqa %xmm9,%xmm11
  400f2a:       66 45 0f 6a ec          punpckhdq %xmm12,%xmm13
  400f2f:       66 45 0f 6f fe          movdqa %xmm14,%xmm15
  400f34:       66 0f 62 eb             punpckldq %xmm3,%xmm5
  400f38:       66 41 0f 73 d1 20       psrlq  $0x20,%xmm9
  400f3e:       66 45 0f 62 fd          punpckldq %xmm13,%xmm15
  400f43:       66 0f 6f f5             movdqa %xmm5,%xmm6
  400f47:       66 0f 6a e3             punpckhdq %xmm3,%xmm4
  400f4b:       66 41 0f 6f d7          movdqa %xmm15,%xmm2
  400f50:       66 45 0f f4 d8          pmuludq %xmm8,%xmm11
  400f55:       66 41 0f 73 d0 20       psrlq  $0x20,%xmm8
  400f5b:       66 45 0f f4 c8          pmuludq %xmm8,%xmm9
  400f60:       66 45 0f 6a f5          punpckhdq %xmm13,%xmm14
  400f65:       66 41 0f 73 d7 20       psrlq  $0x20,%xmm15
  400f6b:       66 0f 62 f4             punpckldq %xmm4,%xmm6
  400f6f:       66 44 0f db d8          pand   %xmm0,%xmm11
  400f74:       66 41 0f f4 d6          pmuludq %xmm14,%xmm2
  400f79:       66 41 0f 73 d6 20       psrlq  $0x20,%xmm14
  400f7f:       66 45 0f f4 fe          pmuludq %xmm14,%xmm15
  400f84:       66 0f 6a ec             punpckhdq %xmm4,%xmm5
  400f88:       66 0f 6f fe             movdqa %xmm6,%xmm7
  400f8c:       66 0f f4 fd             pmuludq %xmm5,%xmm7
  400f90:       66 0f 73 d6 20          psrlq  $0x20,%xmm6
  400f95:       66 0f 73 d5 20          psrlq  $0x20,%xmm5
  400f9a:       66 41 0f 73 f1 20       psllq  $0x20,%xmm9
  400fa0:       66 0f f4 f5             pmuludq %xmm5,%xmm6
  400fa4:       66 45 0f eb d9          por    %xmm9,%xmm11
  400fa9:       66 0f db d0             pand   %xmm0,%xmm2
  400fad:       66 41 0f 73 f7 20       psllq  $0x20,%xmm15
  400fb3:       66 41 0f fe cb          paddd  %xmm11,%xmm1
  400fb8:       66 41 0f eb d7          por    %xmm15,%xmm2
  400fbd:       66 0f db f8             pand   %xmm0,%xmm7
  400fc1:       66 0f 73 f6 20          psllq  $0x20,%xmm6
  400fc6:       66 0f fe ca             paddd  %xmm2,%xmm1
  400fca:       66 0f eb fe             por    %xmm6,%xmm7
  400fce:       66 0f fe cf             paddd  %xmm7,%xmm1
  400fd2:       48 3d 50 02 00 00       cmp    $0x250,%rax
  400fd8:       0f 82 8a fe ff ff       jb     400e68 <main+0xe8>
   ^^^^

  400fde:       66 0f 6f c1             movdqa %xmm1,%xmm0
  400fe2:       66 0f 73 d8 08          psrldq $0x8,%xmm0
  400fe7:       66 0f fe c8             paddd  %xmm0,%xmm1
  400feb:       66 0f 6f d1             movdqa %xmm1,%xmm2
  400fef:       8b 84 24 00 4b 01 00    mov    0x14b00(%rsp),%eax
  400ff6:       66 0f 73 d2 20          psrlq  $0x20,%xmm2
  400ffb:       0f af 84 24 04 4b 01    imul   0x14b04(%rsp),%eax
  401002:       00 
  401003:       66 0f fe ca             paddd  %xmm2,%xmm1
  401007:       66 0f 7e cb             movd   %xmm1,%ebx
  40100b:       8b 94 24 08 4b 01 00    mov    0x14b08(%rsp),%edx
  401012:       03 d8                   add    %eax,%ebx
  401014:       0f af 94 24 0c 4b 01    imul   0x14b0c(%rsp),%edx
  40101b:       00 
  40101c:       8b b4 24 10 4b 01 00    mov    0x14b10(%rsp),%esi
  401023:       03 da                   add    %edx,%ebx
  401025:       0f af b4 24 14 4b 01    imul   0x14b14(%rsp),%esi
  40102c:       00 
  40102d:       8b bc 24 18 4b 01 00    mov    0x14b18(%rsp),%edi
  401034:       03 de                   add    %esi,%ebx
  401036:       0f af bc 24 1c 4b 01    imul   0x14b1c(%rsp),%edi
  40103d:       00 
  40103e:       44 8b 84 24 20 4b 01    mov    0x14b20(%rsp),%r8d
  401045:       00 
  401046:       03 df                   add    %edi,%ebx
  401048:       44 0f af 84 24 24 4b    imul   0x14b24(%rsp),%r8d
  40104f:       01 00 
  401051:       44 8b 8c 24 28 4b 01    mov    0x14b28(%rsp),%r9d
  401058:       00 
  401059:       41 03 d8                add    %r8d,%ebx
  40105c:       44 0f af 8c 24 2c 4b    imul   0x14b2c(%rsp),%r9d
  401063:       01 00 
  401065:       44 8b 94 24 30 4b 01    mov    0x14b30(%rsp),%r10d
  40106c:       00 
  40106d:       41 03 d9                add    %r9d,%ebx
  401070:       44 0f af 94 24 34 4b    imul   0x14b34(%rsp),%r10d
  401077:       01 00 
  401079:       44 8b 9c 24 38 4b 01    mov    0x14b38(%rsp),%r11d
  401080:       00 
  401081:       41 03 da                add    %r10d,%ebx
  401084:       44 0f af 9c 24 3c 4b    imul   0x14b3c(%rsp),%r11d
  40108b:       01 00 
  40108d:       41 03 db                add    %r11d,%ebx
  401090:       e8 eb 00 00 00          callq  401180 <_Z5rdtscv>