C++ 将32位循环计数器替换为64位会导致英特尔CPU上的_mm_popcnt_u64出现疯狂的性能偏差
我在寻找C++ 将32位循环计数器替换为64位会导致英特尔CPU上的_mm_popcnt_u64出现疯狂的性能偏差,c++,performance,assembly,x86,compiler-optimization,C++,Performance,Assembly,X86,Compiler Optimization,我在寻找popcount大型数据数组的最快方法。我遇到了一个非常奇怪的效果:将循环变量从unsigned更改为uint64\u t使我的电脑性能下降了50% 基准 以下是我在3.50GHz的CPU上运行test1(1MB随机数据)的结果: 未签名41959360000 0.401554秒26.113GB/s uint64_t 41959360000 0.759822秒13.8003GB/s 如您所见,uint64\u t版本的吞吐量只有未签名版本的一半!问题似乎是生成了不同的程序集,但为什么
popcount
大型数据数组的最快方法。我遇到了一个非常奇怪的效果:将循环变量从unsigned
更改为uint64\u t
使我的电脑性能下降了50%
基准
以下是我在3.50GHz的CPU上运行test1
(1MB随机数据)的结果:
- 未签名41959360000 0.401554秒26.113GB/s
- uint64_t 41959360000 0.759822秒13.8003GB/s
uint64\u t
版本的吞吐量只有未签名版本的一半!问题似乎是生成了不同的程序集,但为什么?首先,我想到了一个编译器错误,所以我尝试了clang++
(Ubuntu版本3.4-1ubuntu3):
结果:测试1
- 未签名41959360000 0.398293秒26.3267 GB/s
- uint64_t 41959360000 0.680954秒15.3986 GB/s
因此,这几乎是相同的结果,仍然很奇怪。但现在它变得非常奇怪。我将从输入读取的缓冲区大小替换为常量1
,因此我更改:
uint64_t size = atol(argv[1]) << 20;
以下是我在g++中的结果:
- 未签名41959360000 0.396728秒26.4306 GB/s
- uint64_t 41959360000 0.509484秒20.5811 GB/s
是的,还有另一种选择。我们仍然有快速的26GB/s和u32
,但我们设法使u64
至少从13GB/s版本升级到20GB/s版本在我的同事的PC上,u64
版本比u32
版本速度更快,产生了最快的结果。遗憾的是,这只适用于g++
,clang++
似乎并不关心静态
我的问题
你能解释一下这些结果吗?特别是:
u32
和u64
之间怎么会有这样的区别
- 用一个恒定的缓冲区大小替换一个非常量如何触发更少的优化代码
- 插入
static
关键字如何使u64
循环更快?比我同事电脑上的原始代码还要快李>
我知道优化是一个棘手的领域,但是,我从来没有想到如此小的变化会导致执行时间100%的差异,而像恒定缓冲区大小这样的小因素会再次完全混合结果。当然,我一直希望有一个版本,能够PopCount26GB/s。我能想到的唯一可靠的方法是复制粘贴本例中的程序集并使用内联程序集。这是我摆脱编译器的唯一方法,这些编译器似乎对一些小的改动很着迷。你怎么认为?有没有其他方法可以可靠地获得性能最好的代码
拆卸
以下是各种结果的分解:
来自g++/u32/non-const bufsize的26GB/s版本
0x400af8:
lea 0x1(%rdx),%eax
popcnt (%rbx,%rax,8),%r9
lea 0x2(%rdx),%edi
popcnt (%rbx,%rcx,8),%rax
lea 0x3(%rdx),%esi
add %r9,%rax
popcnt (%rbx,%rdi,8),%rcx
add $0x4,%edx
add %rcx,%rax
popcnt (%rbx,%rsi,8),%rcx
add %rcx,%rax
mov %edx,%ecx
add %rax,%r14
cmp %rbp,%rcx
jb 0x400af8
0x400c00:
popcnt 0x8(%rbx,%rdx,8),%rcx
popcnt (%rbx,%rdx,8),%rax
add %rcx,%rax
popcnt 0x10(%rbx,%rdx,8),%rcx
add %rcx,%rax
popcnt 0x18(%rbx,%rdx,8),%rcx
add $0x4,%rdx
add %rcx,%rax
add %rax,%r12
cmp %rbp,%rdx
jb 0x400c00
0x400e50:
popcnt (%r15,%rcx,8),%rdx
add %rbx,%rdx
popcnt 0x8(%r15,%rcx,8),%rsi
add %rdx,%rsi
popcnt 0x10(%r15,%rcx,8),%rdx
add %rsi,%rdx
popcnt 0x18(%r15,%rcx,8),%rbx
add %rdx,%rbx
add $0x4,%rcx
cmp %rbp,%rcx
jb 0x400e50
0x400a68:
popcnt (%rbx,%rdx,1),%rax
popcnt 0x8(%rbx,%rdx,1),%rcx
add %rax,%rcx
popcnt 0x10(%rbx,%rdx,1),%rax
add %rax,%rcx
popcnt 0x18(%rbx,%rdx,1),%rsi
add $0x20,%rdx
add %rsi,%rcx
add %rcx,%rbp
cmp $0x100000,%rdx
jne 0x400a68
0x400dd0:
popcnt (%r14,%rcx,8),%rdx
add %rbx,%rdx
popcnt 0x8(%r14,%rcx,8),%rsi
add %rdx,%rsi
popcnt 0x10(%r14,%rcx,8),%rdx
add %rsi,%rdx
popcnt 0x18(%r14,%rcx,8),%rbx
add %rdx,%rbx
add $0x4,%rcx
cmp $0x20000,%rcx
jb 0x400dd0
来自g++/u64/non-const bufsize的13GB/s版本:
0x400af8:
lea 0x1(%rdx),%eax
popcnt (%rbx,%rax,8),%r9
lea 0x2(%rdx),%edi
popcnt (%rbx,%rcx,8),%rax
lea 0x3(%rdx),%esi
add %r9,%rax
popcnt (%rbx,%rdi,8),%rcx
add $0x4,%edx
add %rcx,%rax
popcnt (%rbx,%rsi,8),%rcx
add %rcx,%rax
mov %edx,%ecx
add %rax,%r14
cmp %rbp,%rcx
jb 0x400af8
0x400c00:
popcnt 0x8(%rbx,%rdx,8),%rcx
popcnt (%rbx,%rdx,8),%rax
add %rcx,%rax
popcnt 0x10(%rbx,%rdx,8),%rcx
add %rcx,%rax
popcnt 0x18(%rbx,%rdx,8),%rcx
add $0x4,%rdx
add %rcx,%rax
add %rax,%r12
cmp %rbp,%rdx
jb 0x400c00
0x400e50:
popcnt (%r15,%rcx,8),%rdx
add %rbx,%rdx
popcnt 0x8(%r15,%rcx,8),%rsi
add %rdx,%rsi
popcnt 0x10(%r15,%rcx,8),%rdx
add %rsi,%rdx
popcnt 0x18(%r15,%rcx,8),%rbx
add %rdx,%rbx
add $0x4,%rcx
cmp %rbp,%rcx
jb 0x400e50
0x400a68:
popcnt (%rbx,%rdx,1),%rax
popcnt 0x8(%rbx,%rdx,1),%rcx
add %rax,%rcx
popcnt 0x10(%rbx,%rdx,1),%rax
add %rax,%rcx
popcnt 0x18(%rbx,%rdx,1),%rsi
add $0x20,%rdx
add %rsi,%rcx
add %rcx,%rbp
cmp $0x100000,%rdx
jne 0x400a68
0x400dd0:
popcnt (%r14,%rcx,8),%rdx
add %rbx,%rdx
popcnt 0x8(%r14,%rcx,8),%rsi
add %rdx,%rsi
popcnt 0x10(%r14,%rcx,8),%rdx
add %rsi,%rdx
popcnt 0x18(%r14,%rcx,8),%rbx
add %rdx,%rbx
add $0x4,%rcx
cmp $0x20000,%rcx
jb 0x400dd0
来自clang++/u64/non-const bufsize的15GB/s版本
0x400af8:
lea 0x1(%rdx),%eax
popcnt (%rbx,%rax,8),%r9
lea 0x2(%rdx),%edi
popcnt (%rbx,%rcx,8),%rax
lea 0x3(%rdx),%esi
add %r9,%rax
popcnt (%rbx,%rdi,8),%rcx
add $0x4,%edx
add %rcx,%rax
popcnt (%rbx,%rsi,8),%rcx
add %rcx,%rax
mov %edx,%ecx
add %rax,%r14
cmp %rbp,%rcx
jb 0x400af8
0x400c00:
popcnt 0x8(%rbx,%rdx,8),%rcx
popcnt (%rbx,%rdx,8),%rax
add %rcx,%rax
popcnt 0x10(%rbx,%rdx,8),%rcx
add %rcx,%rax
popcnt 0x18(%rbx,%rdx,8),%rcx
add $0x4,%rdx
add %rcx,%rax
add %rax,%r12
cmp %rbp,%rdx
jb 0x400c00
0x400e50:
popcnt (%r15,%rcx,8),%rdx
add %rbx,%rdx
popcnt 0x8(%r15,%rcx,8),%rsi
add %rdx,%rsi
popcnt 0x10(%r15,%rcx,8),%rdx
add %rsi,%rdx
popcnt 0x18(%r15,%rcx,8),%rbx
add %rdx,%rbx
add $0x4,%rcx
cmp %rbp,%rcx
jb 0x400e50
0x400a68:
popcnt (%rbx,%rdx,1),%rax
popcnt 0x8(%rbx,%rdx,1),%rcx
add %rax,%rcx
popcnt 0x10(%rbx,%rdx,1),%rax
add %rax,%rcx
popcnt 0x18(%rbx,%rdx,1),%rsi
add $0x20,%rdx
add %rsi,%rcx
add %rcx,%rbp
cmp $0x100000,%rdx
jne 0x400a68
0x400dd0:
popcnt (%r14,%rcx,8),%rdx
add %rbx,%rdx
popcnt 0x8(%r14,%rcx,8),%rsi
add %rdx,%rsi
popcnt 0x10(%r14,%rcx,8),%rdx
add %rsi,%rdx
popcnt 0x18(%r14,%rcx,8),%rbx
add %rdx,%rbx
add $0x4,%rcx
cmp $0x20000,%rcx
jb 0x400dd0
来自g++/u32和u64/const bufsize的20GB/s版本
0x400af8:
lea 0x1(%rdx),%eax
popcnt (%rbx,%rax,8),%r9
lea 0x2(%rdx),%edi
popcnt (%rbx,%rcx,8),%rax
lea 0x3(%rdx),%esi
add %r9,%rax
popcnt (%rbx,%rdi,8),%rcx
add $0x4,%edx
add %rcx,%rax
popcnt (%rbx,%rsi,8),%rcx
add %rcx,%rax
mov %edx,%ecx
add %rax,%r14
cmp %rbp,%rcx
jb 0x400af8
0x400c00:
popcnt 0x8(%rbx,%rdx,8),%rcx
popcnt (%rbx,%rdx,8),%rax
add %rcx,%rax
popcnt 0x10(%rbx,%rdx,8),%rcx
add %rcx,%rax
popcnt 0x18(%rbx,%rdx,8),%rcx
add $0x4,%rdx
add %rcx,%rax
add %rax,%r12
cmp %rbp,%rdx
jb 0x400c00
0x400e50:
popcnt (%r15,%rcx,8),%rdx
add %rbx,%rdx
popcnt 0x8(%r15,%rcx,8),%rsi
add %rdx,%rsi
popcnt 0x10(%r15,%rcx,8),%rdx
add %rsi,%rdx
popcnt 0x18(%r15,%rcx,8),%rbx
add %rdx,%rbx
add $0x4,%rcx
cmp %rbp,%rcx
jb 0x400e50
0x400a68:
popcnt (%rbx,%rdx,1),%rax
popcnt 0x8(%rbx,%rdx,1),%rcx
add %rax,%rcx
popcnt 0x10(%rbx,%rdx,1),%rax
add %rax,%rcx
popcnt 0x18(%rbx,%rdx,1),%rsi
add $0x20,%rdx
add %rsi,%rcx
add %rcx,%rbp
cmp $0x100000,%rdx
jne 0x400a68
0x400dd0:
popcnt (%r14,%rcx,8),%rdx
add %rbx,%rdx
popcnt 0x8(%r14,%rcx,8),%rsi
add %rdx,%rsi
popcnt 0x10(%r14,%rcx,8),%rdx
add %rsi,%rdx
popcnt 0x18(%r14,%rcx,8),%rbx
add %rdx,%rbx
add $0x4,%rcx
cmp $0x20000,%rcx
jb 0x400dd0
来自clang++/u32和u64/const bufsize的15GB/s版本
0x400af8:
lea 0x1(%rdx),%eax
popcnt (%rbx,%rax,8),%r9
lea 0x2(%rdx),%edi
popcnt (%rbx,%rcx,8),%rax
lea 0x3(%rdx),%esi
add %r9,%rax
popcnt (%rbx,%rdi,8),%rcx
add $0x4,%edx
add %rcx,%rax
popcnt (%rbx,%rsi,8),%rcx
add %rcx,%rax
mov %edx,%ecx
add %rax,%r14
cmp %rbp,%rcx
jb 0x400af8
0x400c00:
popcnt 0x8(%rbx,%rdx,8),%rcx
popcnt (%rbx,%rdx,8),%rax
add %rcx,%rax
popcnt 0x10(%rbx,%rdx,8),%rcx
add %rcx,%rax
popcnt 0x18(%rbx,%rdx,8),%rcx
add $0x4,%rdx
add %rcx,%rax
add %rax,%r12
cmp %rbp,%rdx
jb 0x400c00
0x400e50:
popcnt (%r15,%rcx,8),%rdx
add %rbx,%rdx
popcnt 0x8(%r15,%rcx,8),%rsi
add %rdx,%rsi
popcnt 0x10(%r15,%rcx,8),%rdx
add %rsi,%rdx
popcnt 0x18(%r15,%rcx,8),%rbx
add %rdx,%rbx
add $0x4,%rcx
cmp %rbp,%rcx
jb 0x400e50
0x400a68:
popcnt (%rbx,%rdx,1),%rax
popcnt 0x8(%rbx,%rdx,1),%rcx
add %rax,%rcx
popcnt 0x10(%rbx,%rdx,1),%rax
add %rax,%rcx
popcnt 0x18(%rbx,%rdx,1),%rsi
add $0x20,%rdx
add %rsi,%rcx
add %rcx,%rbp
cmp $0x100000,%rdx
jne 0x400a68
0x400dd0:
popcnt (%r14,%rcx,8),%rdx
add %rbx,%rdx
popcnt 0x8(%r14,%rcx,8),%rsi
add %rdx,%rsi
popcnt 0x10(%r14,%rcx,8),%rdx
add %rsi,%rdx
popcnt 0x18(%r14,%rcx,8),%rbx
add %rdx,%rbx
add $0x4,%rcx
cmp $0x20000,%rcx
jb 0x400dd0
有趣的是,最快(26GB/s)的版本也是最长的!这似乎是使用lea
的唯一解决方案。一些版本使用jb
跳转,其他版本使用jne
。但除此之外,所有版本似乎都具有可比性。我不知道100%的性能差距是从何而来的,但我不太擅长破译程序集。最慢(13GB/s)的版本看起来甚至很短也很好。有人能解释一下吗
经验教训
不管这个问题的答案是什么;我已经了解到,在真正的热循环中,每一个细节都很重要,即使是看起来与热代码没有任何关联的细节。我从来没有想过循环变量应该使用什么类型,但正如您所看到的,这样一个小小的改变可以带来100%的不同!甚至缓冲区的存储类型也会产生巨大的差异,正如我们在size变量前面插入static
关键字所看到的!将来,在编写对系统性能至关重要的非常紧密的热循环时,我将始终在各种编译器上测试各种替代方案
有趣的是,尽管我已经展开了四次循环,但性能差异仍然很大。因此,即使您展开,您仍然可能受到重大性能偏差的影响。很有趣。这不是一个答案,但如果我把结果放在评论中,很难阅读
我用一个(6核3.33GHz)的处理器得到了这些结果。我用clang-O3-msse4-lstdc++a.cpp-oa
(-O2得到相同的结果)编译了它
与uint64\u t size=atol(argv[1])碰撞您是否尝试将还原步骤移出循环?现在,您有一个真正不需要的数据依赖项
尝试:
uint64子集计数[4]={};
for(无符号k=0;k<10000;k++){
//无符号紧展开循环
无符号i=0;
而(i
你也有一些奇怪的别名,我不确定是否符合严格的别名规则。我不能给出权威性的答案,但提供一个可能原因的概述。这很清楚地表明了这一点
gcc (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4
3.19.0-58-generic
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 70
model name : Intel(R) Core(TM) i7-4870HQ CPU @ 2.50 GHz
stepping : 1
microcode : 0xf
cpu MHz : 2494.226
cache size : 6144 KB
physical id : 0
siblings : 1
core id : 0
cpu cores : 1
apicid : 0
initial apicid : 0
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx rdtscp lm constant_tsc nopl xtopology nonstop_tsc eagerfpu pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm arat pln pts dtherm fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 invpcid xsaveopt
bugs :
bogomips : 4988.45
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
power management:
uint64_t builtin_popcnt1a(const uint64_t* buf, size_t len)
{
uint64_t cnt0, cnt1, cnt2, cnt3;
cnt0 = cnt1 = cnt2 = cnt3 = 0;
uint64_t val = buf[0];
#if 0
__asm__ __volatile__ (
"1:\n\t"
"popcnt %2, %1\n\t"
"popcnt %2, %1\n\t"
"popcnt %2, %1\n\t"
"popcnt %2, %1\n\t"
"subq $4, %0\n\t"
"jnz 1b\n\t"
: "+q" (len), "=q" (cnt0)
: "q" (val)
:
);
#else
__asm__ __volatile__ (
"1:\n\t"
"popcnt %5, %1\n\t"
"popcnt %5, %2\n\t"
"popcnt %5, %3\n\t"
"popcnt %5, %4\n\t"
"subq $4, %0\n\t"
"jnz 1b\n\t"
: "+q" (len), "=q" (cnt0), "=q" (cnt1), "=q" (cnt2), "=q" (cnt3)
: "q" (val)
:
);
#endif
return cnt0;
}
unsigned 209695540000 1.8322 sec 28.6152 GB/s
uint64_t 209695540000 3.08764 sec 16.9802 GB/s
unsigned 209695540000 1.70845 sec 30.688 GB/s
uint64_t 209695540000 1.57956 sec 33.1921 GB/s
int __cdecl main(int argc, const char **argv, const char **envp)
{
int v6; // er13
_BYTE *v8; // rsi
unsigned int v9; // edi
unsigned __int64 i; // rbx
unsigned __int64 v11; // rdi
int v12; // ebp
__int64 v13; // r14
__int64 v14; // rbx
unsigned int v15; // eax
unsigned __int64 v16; // rcx
unsigned int v17; // eax
unsigned __int64 v18; // rcx
__int64 v19; // rdx
unsigned int v20; // eax
int result; // eax
std::ostream *v23; // rbx
char v24; // dl
std::ostream *v33; // rbx
std::ostream *v41; // rbx
__int64 v42; // rdx
unsigned int v43; // eax
int v44; // ebp
__int64 v45; // r14
__int64 v46; // rbx
unsigned __int64 v47; // rax
unsigned __int64 v48; // rax
std::ostream *v50; // rdi
char v51; // dl
std::ostream *v58; // rdi
std::ostream *v60; // rdi
__int64 v61; // rdx
unsigned int v62; // eax
__asm
{
vmovdqa [rsp+98h+var_58], xmm8
vmovapd [rsp+98h+var_68], xmm7
vmovapd [rsp+98h+var_78], xmm6
}
if ( argc == 2 )
{
v6 = atol(argv[1]) << 20;
_R15 = v6;
v8 = operator new[](v6);
if ( v6 )
{
v9 = 1;
for ( i = 0i64; i < v6; i = v9++ )
v8[i] = rand();
}
v11 = (unsigned __int64)v6 >> 3;
v12 = 0;
v13 = Xtime_get_ticks_0();
v14 = 0i64;
do
{
if ( v6 )
{
v15 = 4;
v16 = 0i64;
do
{
v14 += __popcnt(*(_QWORD *)&v8[8 * v16])
+ __popcnt(*(_QWORD *)&v8[8 * v15 - 24])
+ __popcnt(*(_QWORD *)&v8[8 * v15 - 16])
+ __popcnt(*(_QWORD *)&v8[8 * v15 - 8]);
v16 = v15;
v15 += 4;
}
while ( v11 > v16 );
v17 = 4;
v18 = 0i64;
do
{
v14 += __popcnt(*(_QWORD *)&v8[8 * v18])
+ __popcnt(*(_QWORD *)&v8[8 * v17 - 24])
+ __popcnt(*(_QWORD *)&v8[8 * v17 - 16])
+ __popcnt(*(_QWORD *)&v8[8 * v17 - 8]);
v18 = v17;
v17 += 4;
}
while ( v11 > v18 );
}
v12 += 2;
}
while ( v12 != 10000 );
_RBP = 100 * (Xtime_get_ticks_0() - v13);
std::operator___std::char_traits_char___(std::cout, "unsigned\t");
v23 = (std::ostream *)std::ostream::operator<<(std::cout, v14);
std::operator___std::char_traits_char____0(v23, v24);
__asm
{
vmovq xmm0, rbp
vmovdqa xmm8, cs:__xmm@00000000000000004530000043300000
vpunpckldq xmm0, xmm0, xmm8
vmovapd xmm7, cs:__xmm@45300000000000004330000000000000
vsubpd xmm0, xmm0, xmm7
vpermilpd xmm1, xmm0, 1
vaddsd xmm6, xmm1, xmm0
vdivsd xmm1, xmm6, cs:__real@41cdcd6500000000
}
v33 = (std::ostream *)std::ostream::operator<<(v23);
std::operator___std::char_traits_char___(v33, " sec \t");
__asm
{
vmovq xmm0, r15
vpunpckldq xmm0, xmm0, xmm8
vsubpd xmm0, xmm0, xmm7
vpermilpd xmm1, xmm0, 1
vaddsd xmm0, xmm1, xmm0
vmulsd xmm7, xmm0, cs:__real@40c3880000000000
vdivsd xmm1, xmm7, xmm6
}
v41 = (std::ostream *)std::ostream::operator<<(v33);
std::operator___std::char_traits_char___(v41, " GB/s");
LOBYTE(v42) = 10;
v43 = std::ios::widen((char *)v41 + *(int *)(*(_QWORD *)v41 + 4i64), v42);
std::ostream::put(v41, v43);
std::ostream::flush(v41);
v44 = 0;
v45 = Xtime_get_ticks_0();
v46 = 0i64;
do
{
if ( v6 )
{
v47 = 0i64;
do
{
v46 += __popcnt(*(_QWORD *)&v8[8 * v47])
+ __popcnt(*(_QWORD *)&v8[8 * v47 + 8])
+ __popcnt(*(_QWORD *)&v8[8 * v47 + 16])
+ __popcnt(*(_QWORD *)&v8[8 * v47 + 24]);
v47 += 4i64;
}
while ( v47 < v11 );
v48 = 0i64;
do
{
v46 += __popcnt(*(_QWORD *)&v8[8 * v48])
+ __popcnt(*(_QWORD *)&v8[8 * v48 + 8])
+ __popcnt(*(_QWORD *)&v8[8 * v48 + 16])
+ __popcnt(*(_QWORD *)&v8[8 * v48 + 24]);
v48 += 4i64;
}
while ( v48 < v11 );
}
v44 += 2;
}
while ( v44 != 10000 );
_RBP = 100 * (Xtime_get_ticks_0() - v45);
std::operator___std::char_traits_char___(std::cout, "uint64_t\t");
v50 = (std::ostream *)std::ostream::operator<<(std::cout, v46);
std::operator___std::char_traits_char____0(v50, v51);
__asm
{
vmovq xmm0, rbp
vpunpckldq xmm0, xmm0, cs:__xmm@00000000000000004530000043300000
vsubpd xmm0, xmm0, cs:__xmm@45300000000000004330000000000000
vpermilpd xmm1, xmm0, 1
vaddsd xmm6, xmm1, xmm0
vdivsd xmm1, xmm6, cs:__real@41cdcd6500000000
}
v58 = (std::ostream *)std::ostream::operator<<(v50);
std::operator___std::char_traits_char___(v58, " sec \t");
__asm { vdivsd xmm1, xmm7, xmm6 }
v60 = (std::ostream *)std::ostream::operator<<(v58);
std::operator___std::char_traits_char___(v60, " GB/s");
LOBYTE(v61) = 10;
v62 = std::ios::widen((char *)v60 + *(int *)(*(_QWORD *)v60 + 4i64), v61);
std::ostream::put(v60, v62);
std::ostream::flush(v60);
free(v8);
result = 0;
}
else
{
std::operator___std::char_traits_char___(std::cerr, "usage: array_size in MB");
LOBYTE(v19) = 10;
v20 = std::ios::widen((char *)&std::cerr + *((int *)std::cerr + 1), v19);
std::ostream::put(std::cerr, v20);
std::ostream::flush(std::cerr);
result = -1;
}
__asm
{
vmovaps xmm6, [rsp+98h+var_78]
vmovaps xmm7, [rsp+98h+var_68]
vmovaps xmm8, [rsp+98h+var_58]
}
return result;
}
.text:0140001000 .686p
.text:0140001000 .mmx
.text:0140001000 .model flat
.text:0140001000
.text:0140001000 ; ===========================================================================
.text:0140001000
.text:0140001000 ; Segment type: Pure code
.text:0140001000 ; Segment permissions: Read/Execute
.text:0140001000 _text segment para public 'CODE' use64
.text:0140001000 assume cs:_text
.text:0140001000 ;org 140001000h
.text:0140001000 assume es:nothing, ss:nothing, ds:_data, fs:nothing, gs:nothing
.text:0140001000
.text:0140001000 ; =============== S U B R O U T I N E =======================================
.text:0140001000
.text:0140001000
.text:0140001000 ; int __cdecl main(int argc, const char **argv, const char **envp)
.text:0140001000 main proc near ; CODE XREF: __scrt_common_main_seh+107↓p
.text:0140001000 ; DATA XREF: .pdata:ExceptionDir↓o
.text:0140001000
.text:0140001000 var_78 = xmmword ptr -78h
.text:0140001000 var_68 = xmmword ptr -68h
.text:0140001000 var_58 = xmmword ptr -58h
.text:0140001000
.text:0140001000 push r15
.text:0140001002 push r14
.text:0140001004 push r13
.text:0140001006 push r12
.text:0140001008 push rsi
.text:0140001009 push rdi
.text:014000100A push rbp
.text:014000100B push rbx
.text:014000100C sub rsp, 58h
.text:0140001010 vmovdqa [rsp+98h+var_58], xmm8
.text:0140001016 vmovapd [rsp+98h+var_68], xmm7
.text:014000101C vmovapd [rsp+98h+var_78], xmm6
.text:0140001022 cmp ecx, 2
.text:0140001025 jnz loc_14000113E
.text:014000102B mov rcx, [rdx+8] ; String
.text:014000102F call cs:__imp_atol
.text:0140001035 mov r13d, eax
.text:0140001038 shl r13d, 14h
.text:014000103C movsxd r15, r13d
.text:014000103F mov rcx, r15 ; size
.text:0140001042 call ??_U@YAPEAX_K@Z ; operator new[](unsigned __int64)
.text:0140001047 mov rsi, rax
.text:014000104A test r15d, r15d
.text:014000104D jz short loc_14000106E
.text:014000104F mov edi, 1
.text:0140001054 xor ebx, ebx
.text:0140001056 mov rbp, cs:__imp_rand
.text:014000105D nop dword ptr [rax]
.text:0140001060
.text:0140001060 loc_140001060: ; CODE XREF: main+6C↓j
.text:0140001060 call rbp ; __imp_rand
.text:0140001062 mov [rsi+rbx], al
.text:0140001065 mov ebx, edi
.text:0140001067 inc edi
.text:0140001069 cmp rbx, r15
.text:014000106C jb short loc_140001060
.text:014000106E
.text:014000106E loc_14000106E: ; CODE XREF: main+4D↑j
.text:014000106E mov rdi, r15
.text:0140001071 shr rdi, 3
.text:0140001075 xor ebp, ebp
.text:0140001077 call _Xtime_get_ticks_0
.text:014000107C mov r14, rax
.text:014000107F xor ebx, ebx
.text:0140001081 jmp short loc_14000109F
.text:0140001081 ; ---------------------------------------------------------------------------
.text:0140001083 align 10h
.text:0140001090
.text:0140001090 loc_140001090: ; CODE XREF: main+A2↓j
.text:0140001090 ; main+EC↓j ...
.text:0140001090 add ebp, 2
.text:0140001093 cmp ebp, 2710h
.text:0140001099 jz loc_140001184
.text:014000109F
.text:014000109F loc_14000109F: ; CODE XREF: main+81↑j
.text:014000109F test r13d, r13d
.text:01400010A2 jz short loc_140001090
.text:01400010A4 mov eax, 4
.text:01400010A9 xor ecx, ecx
.text:01400010AB nop dword ptr [rax+rax+00h]
.text:01400010B0
.text:01400010B0 loc_1400010B0: ; CODE XREF: main+E7↓j
.text:01400010B0 popcnt rcx, qword ptr [rsi+rcx*8]
.text:01400010B6 add rcx, rbx
.text:01400010B9 lea edx, [rax-3]
.text:01400010BC popcnt rdx, qword ptr [rsi+rdx*8]
.text:01400010C2 add rdx, rcx
.text:01400010C5 lea ecx, [rax-2]
.text:01400010C8 popcnt rcx, qword ptr [rsi+rcx*8]
.text:01400010CE add rcx, rdx
.text:01400010D1 lea edx, [rax-1]
.text:01400010D4 xor ebx, ebx
.text:01400010D6 popcnt rbx, qword ptr [rsi+rdx*8]
.text:01400010DC add rbx, rcx
.text:01400010DF mov ecx, eax
.text:01400010E1 add eax, 4
.text:01400010E4 cmp rdi, rcx
.text:01400010E7 ja short loc_1400010B0
.text:01400010E9 test r13d, r13d
.text:01400010EC jz short loc_140001090
.text:01400010EE mov eax, 4
.text:01400010F3 xor ecx, ecx
.text:01400010F5 db 2Eh
.text:01400010F5 nop word ptr [rax+rax+00000000h]
.text:01400010FF nop
.text:0140001100
.text:0140001100 loc_140001100: ; CODE XREF: main+137↓j
.text:0140001100 popcnt rcx, qword ptr [rsi+rcx*8]
.text:0140001106 add rcx, rbx
.text:0140001109 lea edx, [rax-3]
.text:014000110C popcnt rdx, qword ptr [rsi+rdx*8]
.text:0140001112 add rdx, rcx
.text:0140001115 lea ecx, [rax-2]
.text:0140001118 popcnt rcx, qword ptr [rsi+rcx*8]
.text:014000111E add rcx, rdx
.text:0140001121 lea edx, [rax-1]
.text:0140001124 xor ebx, ebx
.text:0140001126 popcnt rbx, qword ptr [rsi+rdx*8]
.text:014000112C add rbx, rcx
.text:014000112F mov ecx, eax
.text:0140001131 add eax, 4
.text:0140001134 cmp rdi, rcx
.text:0140001137 ja short loc_140001100
.text:0140001139 jmp loc_140001090
.text:014000113E ; ---------------------------------------------------------------------------
.text:014000113E
.text:014000113E loc_14000113E: ; CODE XREF: main+25↑j
.text:014000113E mov rsi, cs:__imp_?cerr@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A ; std::ostream std::cerr
.text:0140001145 lea rdx, aUsageArraySize ; "usage: array_size in MB"
.text:014000114C mov rcx, rsi ; std::ostream *
.text:014000114F call std__operator___std__char_traits_char___
.text:0140001154 mov rax, [rsi]
.text:0140001157 movsxd rcx, dword ptr [rax+4]
.text:014000115B add rcx, rsi
.text:014000115E mov dl, 0Ah
.text:0140001160 call cs:__imp_?widen@?$basic_ios@DU?$char_traits@D@std@@@std@@QEBADD@Z ; std::ios::widen(char)
.text:0140001166 mov rcx, rsi
.text:0140001169 mov edx, eax
.text:014000116B call cs:__imp_?put@?$basic_ostream@DU?$char_traits@D@std@@@std@@QEAAAEAV12@D@Z ; std::ostream::put(char)
.text:0140001171 mov rcx, rsi
.text:0140001174 call cs:__imp_?flush@?$basic_ostream@DU?$char_traits@D@std@@@std@@QEAAAEAV12@XZ ; std::ostream::flush(void)
.text:014000117A mov eax, 0FFFFFFFFh
.text:014000117F jmp loc_1400013E2
.text:0140001184 ; ---------------------------------------------------------------------------
.text:0140001184
.text:0140001184 loc_140001184: ; CODE XREF: main+99↑j
.text:0140001184 call _Xtime_get_ticks_0
.text:0140001189 sub rax, r14
.text:014000118C imul rbp, rax, 64h ; 'd'
.text:0140001190 mov r14, cs:__imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A ; std::ostream std::cout
.text:0140001197 lea rdx, aUnsigned ; "unsigned\t"
.text:014000119E mov rcx, r14 ; std::ostream *
.text:01400011A1 call std__operator___std__char_traits_char___
.text:01400011A6 mov rcx, r14
.text:01400011A9 mov rdx, rbx
.text:01400011AC call cs:__imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QEAAAEAV01@_K@Z ; std::ostream::operator<<(unsigned __int64)
.text:01400011B2 mov rbx, rax
.text:01400011B5 mov rcx, rax ; std::ostream *
.text:01400011B8 call std__operator___std__char_traits_char____0
.text:01400011BD vmovq xmm0, rbp
.text:01400011C2 vmovdqa xmm8, cs:__xmm@00000000000000004530000043300000
.text:01400011CA vpunpckldq xmm0, xmm0, xmm8
.text:01400011CF vmovapd xmm7, cs:__xmm@45300000000000004330000000000000
.text:01400011D7 vsubpd xmm0, xmm0, xmm7
.text:01400011DB vpermilpd xmm1, xmm0, 1
.text:01400011E1 vaddsd xmm6, xmm1, xmm0
.text:01400011E5 vdivsd xmm1, xmm6, cs:__real@41cdcd6500000000
.text:01400011ED mov r12, cs:__imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QEAAAEAV01@N@Z ; std::ostream::operator<<(double)
.text:01400011F4 mov rcx, rbx
.text:01400011F7 call r12 ; std::ostream::operator<<(double) ; std::ostream::operator<<(double)
.text:01400011FA mov rbx, rax
.text:01400011FD lea rdx, aSec ; " sec \t"
.text:0140001204 mov rcx, rax ; std::ostream *
.text:0140001207 call std__operator___std__char_traits_char___
.text:014000120C vmovq xmm0, r15
.text:0140001211 vpunpckldq xmm0, xmm0, xmm8
.text:0140001216 vsubpd xmm0, xmm0, xmm7
.text:014000121A vpermilpd xmm1, xmm0, 1
.text:0140001220 vaddsd xmm0, xmm1, xmm0
.text:0140001224 vmulsd xmm7, xmm0, cs:__real@40c3880000000000
.text:014000122C vdivsd xmm1, xmm7, xmm6
.text:0140001230 mov rcx, rbx
.text:0140001233 call r12 ; std::ostream::operator<<(double) ; std::ostream::operator<<(double)
.text:0140001236 mov rbx, rax
.text:0140001239 lea rdx, aGbS ; " GB/s"
.text:0140001240 mov rcx, rax ; std::ostream *
.text:0140001243 call std__operator___std__char_traits_char___
.text:0140001248 mov rax, [rbx]
.text:014000124B movsxd rcx, dword ptr [rax+4]
.text:014000124F add rcx, rbx
.text:0140001252 mov dl, 0Ah
.text:0140001254 call cs:__imp_?widen@?$basic_ios@DU?$char_traits@D@std@@@std@@QEBADD@Z ; std::ios::widen(char)
.text:014000125A mov rcx, rbx
.text:014000125D mov edx, eax
.text:014000125F call cs:__imp_?put@?$basic_ostream@DU?$char_traits@D@std@@@std@@QEAAAEAV12@D@Z ; std::ostream::put(char)
.text:0140001265 mov rcx, rbx
.text:0140001268 call cs:__imp_?flush@?$basic_ostream@DU?$char_traits@D@std@@@std@@QEAAAEAV12@XZ ; std::ostream::flush(void)
.text:014000126E xor ebp, ebp
.text:0140001270 call _Xtime_get_ticks_0
.text:0140001275 mov r14, rax
.text:0140001278 xor ebx, ebx
.text:014000127A jmp short loc_14000128F
.text:014000127A ; ---------------------------------------------------------------------------
.text:014000127C align 20h
.text:0140001280
.text:0140001280 loc_140001280: ; CODE XREF: main+292↓j
.text:0140001280 ; main+2DB↓j ...
.text:0140001280 add ebp, 2
.text:0140001283 cmp ebp, 2710h
.text:0140001289 jz loc_14000131D
.text:014000128F
.text:014000128F loc_14000128F: ; CODE XREF: main+27A↑j
.text:014000128F test r13d, r13d
.text:0140001292 jz short loc_140001280
.text:0140001294 xor eax, eax
.text:0140001296 db 2Eh
.text:0140001296 nop word ptr [rax+rax+00000000h]
.text:01400012A0
.text:01400012A0 loc_1400012A0: ; CODE XREF: main+2D6↓j
.text:01400012A0 xor ecx, ecx
.text:01400012A2 popcnt rcx, qword ptr [rsi+rax*8]
.text:01400012A8 add rcx, rbx
.text:01400012AB xor edx, edx
.text:01400012AD popcnt rdx, qword ptr [rsi+rax*8+8]
.text:01400012B4 add rdx, rcx
.text:01400012B7 xor ecx, ecx
.text:01400012B9 popcnt rcx, qword ptr [rsi+rax*8+10h]
.text:01400012C0 add rcx, rdx
.text:01400012C3 xor ebx, ebx
.text:01400012C5 popcnt rbx, qword ptr [rsi+rax*8+18h]
.text:01400012CC add rbx, rcx
.text:01400012CF add rax, 4
.text:01400012D3 cmp rax, rdi
.text:01400012D6 jb short loc_1400012A0
.text:01400012D8 test r13d, r13d
.text:01400012DB jz short loc_140001280
.text:01400012DD xor eax, eax
.text:01400012DF nop
.text:01400012E0
.text:01400012E0 loc_1400012E0: ; CODE XREF: main+316↓j
.text:01400012E0 xor ecx, ecx
.text:01400012E2 popcnt rcx, qword ptr [rsi+rax*8]
.text:01400012E8 add rcx, rbx
.text:01400012EB xor edx, edx
.text:01400012ED popcnt rdx, qword ptr [rsi+rax*8+8]
.text:01400012F4 add rdx, rcx
.text:01400012F7 xor ecx, ecx
.text:01400012F9 popcnt rcx, qword ptr [rsi+rax*8+10h]
.text:0140001300 add rcx, rdx
.text:0140001303 xor ebx, ebx
.text:0140001305 popcnt rbx, qword ptr [rsi+rax*8+18h]
.text:014000130C add rbx, rcx
.text:014000130F add rax, 4
.text:0140001313 cmp rax, rdi
.text:0140001316 jb short loc_1400012E0
.text:0140001318 jmp loc_140001280
.text:014000131D ; ---------------------------------------------------------------------------
.text:014000131D
.text:014000131D loc_14000131D: ; CODE XREF: main+289↑j
.text:014000131D call _Xtime_get_ticks_0
.text:0140001322 sub rax, r14
.text:0140001325 imul rbp, rax, 64h ; 'd'
.text:0140001329 mov rdi, cs:__imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A ; std::ostream std::cout
.text:0140001330 lea rdx, aUint64T ; "uint64_t\t"
.text:0140001337 mov rcx, rdi ; std::ostream *
.text:014000133A call std__operator___std__char_traits_char___
.text:014000133F mov rcx, rdi
.text:0140001342 mov rdx, rbx
.text:0140001345 call cs:__imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QEAAAEAV01@_K@Z ; std::ostream::operator<<(unsigned __int64)
.text:014000134B mov rdi, rax
.text:014000134E mov rcx, rax ; std::ostream *
.text:0140001351 call std__operator___std__char_traits_char____0
.text:0140001356 vmovq xmm0, rbp
.text:014000135B vpunpckldq xmm0, xmm0, cs:__xmm@00000000000000004530000043300000
.text:0140001363 vsubpd xmm0, xmm0, cs:__xmm@45300000000000004330000000000000
.text:014000136B vpermilpd xmm1, xmm0, 1
.text:0140001371 vaddsd xmm6, xmm1, xmm0
.text:0140001375 vdivsd xmm1, xmm6, cs:__real@41cdcd6500000000
.text:014000137D mov rcx, rdi
.text:0140001380 call r12 ; std::ostream::operator<<(double) ; std::ostream::operator<<(double)
.text:0140001383 mov rdi, rax
.text:0140001386 lea rdx, aSec ; " sec \t"
.text:014000138D mov rcx, rax ; std::ostream *
.text:0140001390 call std__operator___std__char_traits_char___
.text:0140001395 vdivsd xmm1, xmm7, xmm6
.text:0140001399 mov rcx, rdi
.text:014000139C call r12 ; std::ostream::operator<<(double) ; std::ostream::operator<<(double)
.text:014000139F mov rdi, rax
.text:01400013A2 lea rdx, aGbS ; " GB/s"
.text:01400013A9 mov rcx, rax ; std::ostream *
.text:01400013AC call std__operator___std__char_traits_char___
.text:01400013B1 mov rax, [rdi]
.text:01400013B4 movsxd rcx, dword ptr [rax+4]
.text:01400013B8 add rcx, rdi
.text:01400013BB mov dl, 0Ah
.text:01400013BD call cs:__imp_?widen@?$basic_ios@DU?$char_traits@D@std@@@std@@QEBADD@Z ; std::ios::widen(char)
.text:01400013C3 mov rcx, rdi
.text:01400013C6 mov edx, eax
.text:01400013C8 call cs:__imp_?put@?$basic_ostream@DU?$char_traits@D@std@@@std@@QEAAAEAV12@D@Z ; std::ostream::put(char)
.text:01400013CE mov rcx, rdi
.text:01400013D1 call cs:__imp_?flush@?$basic_ostream@DU?$char_traits@D@std@@@std@@QEAAAEAV12@XZ ; std::ostream::flush(void)
.text:01400013D7 mov rcx, rsi ; Block
.text:01400013DA call cs:__imp_free
.text:01400013E0 xor eax, eax
.text:01400013E2
.text:01400013E2 loc_1400013E2: ; CODE XREF: main+17F↑j
.text:01400013E2 vmovaps xmm6, [rsp+98h+var_78]
.text:01400013E8 vmovaps xmm7, [rsp+98h+var_68]
.text:01400013EE vmovaps xmm8, [rsp+98h+var_58]
.text:01400013F4 add rsp, 58h
.text:01400013F8 pop rbx
.text:01400013F9 pop rbp
.text:01400013FA pop rdi
.text:01400013FB pop rsi
.text:01400013FC pop r12
.text:01400013FE pop r13
.text:0140001400 pop r14
.text:0140001402 pop r15
.text:0140001404 retn
.text:0140001404 main endp