C 浮点到双精度转换:为什么有这么多指令?
我很好奇是否有人能帮我解释一下。我正在研究一些数值数据转换的东西,我有几个函数可以进行数据转换,我使用两个宏来定义这些函数:
/*
 * CONV_VIA_CAST(name, dtype, vtype)
 *
 * Expands to the definition of
 *     static inline void name(void *data, void *view, size_t len)
 *
 * The generated function treats `data` as an array of dtype and `view` as an
 * array of vtype.  It converts len / sizeof(vtype) elements, in increasing
 * index order, by casting each input element to vtype and storing it at the
 * same index of the output.  Any remainder of `len` smaller than one vtype is
 * ignored.
 */
#define CONV_VIA_CAST(name, dtype, vtype) \
static inline void name(void *data, void *view, size_t len) { \
    const size_t count = len / sizeof(vtype); \
    dtype *src = (dtype *)data; \
    vtype *dst = (vtype *)view; \
    size_t idx; \
    for (idx = 0; idx < count; idx++) { \
        dst[idx] = (vtype)src[idx]; \
    } \
}
/*
 * CONV_VIA_FUNC(name, dtype, vtype, via)
 *
 * Expands to the definition of
 *     static inline void name(void *data, void *view, size_t len)
 *
 * The generated function treats `data` as an array of dtype and `view` as an
 * array of vtype.  For each of the len / sizeof(vtype) elements, in increasing
 * index order, it applies `via` to the input element, casts the result to
 * vtype and stores it at the same index of the output.  Any remainder of
 * `len` smaller than one vtype is ignored.
 *
 * `via` may be a function or a function-like macro.  Each input element is
 * copied into a local before `via` is applied, so a macro that evaluates its
 * argument more than once (or not at all) cannot disturb the traversal.
 */
#define CONV_VIA_FUNC(name, dtype, vtype, via) \
static inline void name(void *data, void *view, size_t len) { \
    vtype *vptr = (vtype *)view; \
    dtype *dptr = (dtype *)data; \
    const size_t count = len / sizeof(vtype); \
    for (size_t ii = 0; ii < count; ii++) { \
        /* no side effects inside the via(...) argument */ \
        dtype elem = dptr[ii]; \
        vptr[ii] = (vtype)via(elem); \
    } \
}
对于 float 到整数的转换(f_to_i),使用 -O3 编译后我得到了一段非常简洁的汇编:
0x0000000000401fb0 <+0>: shr %rdx
0x0000000000401fb3 <+3>: je 0x401fd3 <f_to_i+35>
0x0000000000401fb5 <+5>: xor %eax,%eax
0x0000000000401fb7 <+7>: nopw 0x0(%rax,%rax,1)
0x0000000000401fc0 <+16>: cvtss2si (%rdi,%rax,4),%rcx
0x0000000000401fc6 <+22>: mov %cx,(%rsi,%rax,2)
0x0000000000401fca <+26>: add $0x1,%rax
0x0000000000401fce <+30>: cmp %rdx,%rax
0x0000000000401fd1 <+33>: jne 0x401fc0 <f_to_i+16>
0x0000000000401fd3 <+35>: repz retq
而对于 float 到 double 的转换(f_to_d),我得到了这个怪物:
0x0000000000402040 <+0>: mov %rdx,%r8
0x0000000000402043 <+3>: shr $0x3,%r8
0x0000000000402047 <+7>: test %r8,%r8
0x000000000040204a <+10>: je 0x402106 <f_to_d+198>
0x0000000000402050 <+16>: shr $0x5,%rdx
0x0000000000402054 <+20>: lea 0x0(,%rdx,4),%r9
0x000000000040205c <+28>: test %r9,%r9
0x000000000040205f <+31>: je 0x402108 <f_to_d+200>
0x0000000000402065 <+37>: lea (%rdi,%r8,4),%rax
0x0000000000402069 <+41>: cmp $0xb,%r8
0x000000000040206d <+45>: lea (%rsi,%r8,8),%r10
0x0000000000402071 <+49>: seta %cl
0x0000000000402074 <+52>: cmp %rax,%rsi
0x0000000000402077 <+55>: seta %al
0x000000000040207a <+58>: cmp %r10,%rdi
0x000000000040207d <+61>: seta %r10b
0x0000000000402081 <+65>: or %r10d,%eax
0x0000000000402084 <+68>: test %al,%cl
0x0000000000402086 <+70>: je 0x402108 <f_to_d+200>
0x000000000040208c <+76>: xorps %xmm3,%xmm3
0x000000000040208f <+79>: xor %eax,%eax
0x0000000000402091 <+81>: xor %ecx,%ecx
0x0000000000402093 <+83>: nopl 0x0(%rax,%rax,1)
0x0000000000402098 <+88>: movaps %xmm3,%xmm0
0x000000000040209b <+91>: add $0x1,%rcx
0x000000000040209f <+95>: movlps (%rdi,%rax,1),%xmm0
0x00000000004020a3 <+99>: movhps 0x8(%rdi,%rax,1),%xmm0
0x00000000004020a8 <+104>: movhlps %xmm0,%xmm1
0x00000000004020ab <+107>: cvtps2pd %xmm0,%xmm2
0x00000000004020ae <+110>: cvtps2pd %xmm1,%xmm0
0x00000000004020b1 <+113>: movlpd %xmm2,(%rsi,%rax,2)
0x00000000004020b6 <+118>: movhpd %xmm2,0x8(%rsi,%rax,2)
0x00000000004020bc <+124>: movlpd %xmm0,0x10(%rsi,%rax,2)
0x00000000004020c2 <+130>: movhpd %xmm0,0x18(%rsi,%rax,2)
0x00000000004020c8 <+136>: add $0x10,%rax
0x00000000004020cc <+140>: cmp %rcx,%rdx
0x00000000004020cf <+143>: ja 0x402098 <f_to_d+88>
0x00000000004020d1 <+145>: cmp %r9,%r8
0x00000000004020d4 <+148>: lea (%rsi,%r9,8),%rsi
0x00000000004020d8 <+152>: lea (%rdi,%r9,4),%rdi
0x00000000004020dc <+156>: je 0x40210d <f_to_d+205>
0x00000000004020de <+158>: mov %r9,%rdx
0x00000000004020e1 <+161>: mov %r9,%rax
0x00000000004020e4 <+164>: neg %rdx
0x00000000004020e7 <+167>: lea (%rsi,%rdx,8),%rcx
0x00000000004020eb <+171>: lea (%rdi,%rdx,4),%rdx
0x00000000004020ef <+175>: nop
0x00000000004020f0 <+176>: movss (%rdx,%rax,4),%xmm0
0x00000000004020f5 <+181>: cvtps2pd %xmm0,%xmm0
0x00000000004020f8 <+184>: movsd %xmm0,(%rcx,%rax,8)
0x00000000004020fd <+189>: add $0x1,%rax
0x0000000000402101 <+193>: cmp %rax,%r8
0x0000000000402104 <+196>: ja 0x4020f0 <f_to_d+176>
0x0000000000402106 <+198>: repz retq
0x0000000000402108 <+200>: xor %r9d,%r9d
0x000000000040210b <+203>: jmp 0x4020de <f_to_d+158>
0x000000000040210d <+205>: nopl (%rax)
0x0000000000402110 <+208>: retq
0x0000000000402040: mov %rdx,%r8
0x0000000000402043: shr $0x3,%r8
0x0000000000402047: test %r8,%r8
0x000000000040204a: je 0x402106
0x0000000000402050: shr $0x5,%rdx
0x0000000000402054: lea 0x0(,%rdx,4),%r9
0x000000000040205c: test %r9,%r9
0x000000000040205f: je 0x402108
0x0000000000402065: lea (%rdi,%r8,4),%rax
0x0000000000402069: cmp $0xb,%r8
0x000000000040206d: lea (%rsi,%r8,8),%r10
0x0000000000402071: seta %cl
0x0000000000402074: cmp %rax,%rsi
0x0000000000402077: seta %al
0x000000000040207a: cmp %r10,%rdi
0x000000000040207d: seta %r10b
0x0000000000402081: or %r10d,%eax
0x0000000000402084: test %al,%cl
0x0000000000402086: je 0x402108
0x000000000040208c: xorps %xmm3,%xmm3
0x000000000040208f: xor %eax,%eax
0x0000000000402091: xor %ecx,%ecx
0x0000000000402093: nopl 0x0(%rax,%rax,1)
0x0000000000402098: movaps %xmm3,%xmm0
0x000000000040209b: add $0x1,%rcx
0x000000000040209f: movlps (%rdi,%rax,1),%xmm0
0x00000000004020a3: movhps 0x8(%rdi,%rax,1),%xmm0
0x00000000004020a8: movhlps %xmm0,%xmm1
0x00000000004020ab: cvtps2pd %xmm0,%xmm2
0x00000000004020ae: cvtps2pd %xmm1,%xmm0
0x00000000004020b1: movlpd %xmm2,(%rsi,%rax,2)
0x00000000004020b6: movhpd %xmm2,0x8(%rsi,%rax,2)
0x00000000004020bc: movlpd %xmm0,0x10(%rsi,%rax,2)
0x00000000004020c2: movhpd %xmm0,0x18(%rsi,%rax,2)
0x00000000004020c8: add $0x10,%rax
0x00000000004020cc: cmp %rcx,%rdx
0x00000000004020cf: ja 0x402098
0x00000000004020d1: cmp %r9,%r8
0x00000000004020d4: lea (%rsi,%r9,8),%rsi
0x00000000004020d8: lea (%rdi,%r9,4),%rdi
0x00000000004020dc: je 0x40210d
0x00000000004020de: mov %r9,%rdx
0x00000000004020e1: mov %r9,%rax
0x00000000004020e4: neg %rdx
0x00000000004020e7: lea (%rsi,%rdx,8),%rcx
0x00000000004020eb: lea (%rdi,%rdx,4),%rdx
0x00000000004020ef: nop
0x00000000004020f0: movss (%rdx,%rax,4),%xmm0
0x00000000004020f5: cvtps2pd %xmm0,%xmm0
0x00000000004020f8: movsd %xmm0,(%rcx,%rax,8)
0x00000000004020fd: add $0x1,%rax
0x0000000000402101: cmp %rax,%r8
0x0000000000402104: ja 0x4020f0
0x0000000000402106: repz retq
0x0000000000402108: xor %r9d,%r9d
0x000000000040210b: jmp 0x4020de
0x000000000040210d: nopl (%rax)
0x0000000000402110: retq
有人能解释一下 float -> double 转换在底层到底发生了什么吗?以及应该怎样编写代码才能得到更高效的汇编?如果有关系的话,我使用的是 GCC 4.6.3。这里有几点我一眼就能看出来(代码有点长,时间也有点晚了,而且我不喜欢 AT&T 语法)。首先,第二个循环被向量化了(但做得很糟糕,见下文)。这必然会导致一些代码膨胀——它现在必须处理比一个向量还短的“尾部”之类的情况。其次,float 到 double 是一种加宽转换。对于标量来说这无关紧要,但对于向量来说,这意味着你不能只是读入一些数据、转换、然后原样写回——在这个过程中的某个地方,你会得到两倍于原来的字节数,而它们必须被处理。(因此才有
movhlps %xmm0,%xmm1
)
实际的循环只是从 402098h 到 4020cfh;其下方是“尾部处理”,其上方则是一大段怪物般的代码,用来测试是否应完全跳过主循环,还做了一些我没有完全弄清楚的事情——如果是为了对齐,那倒说得通,但我没有看到任何 test rdi, 15
,也没有看到任何明显用来处理未对齐开头的代码。
第三,GCC 在这里表现得很笨拙。这并不罕见。它似乎认为 xmm3 在某种程度上是相关的,但事实并非如此;而且它似乎忘记了向量可以一次性整块地从内存加载、再整块地写回内存——这也可能是因为开头那段怪物代码确实没有测试对齐,而这是它针对未对齐指针的防御手段。无论如何,GCC 在这里做得很糟糕。你所说的“怪物”实际上是自动向量化(auto-vectorization)。这项技术经过了大约 20 年的研究,才开始运作良好并在通用编译器中变得实用。
它可能并不漂亮,但 GCC 的实现者认为,对于长数组,这样会更快。如果你的数组实际上并不长,或者你无法忍受编译出的代码长成这样,请禁用这一特定优化。使用
-O2
编译应该就可以(未经测试)。是否使用 64 位编译器编译?如果不是,它可能看不到 64 位扩展。@Nirk 抱歉,是的,我在 64 位 Ubuntu 上,gcc 的默认目标显示为 x86_64-linux-gnu。顺便说一句,有两个宏是多余的;你只需把恒等函数(写成宏,即 #define IDENT(x) (x)
)传给 CONV_VIA_FUNC,
就能得到 CONV_VIA_CAST 的效果。编译器可能已经在可能的情况下为对齐和向量化做了一些难看的特殊情况处理。你启用了什么优化级别?@R 好提示,谢谢。我最初只有一个函数来转换所有内容,后来决定使用 lrint 函数,所以添加了第二个宏。我正在使用 -O3 进行优化(也尝试了 -ffast-math,但没有任何区别)。
/* Instantiate f_to_d: float -> double conversion by plain cast.
 * No trailing semicolon: the macro already expands to a complete function
 * definition, and a stray ';' at file scope is not valid ISO C. */
CONV_VIA_CAST(f_to_d, float, double)
0x0000000000402040 <+0>: mov %rdx,%r8
0x0000000000402043 <+3>: shr $0x3,%r8
0x0000000000402047 <+7>: test %r8,%r8
0x000000000040204a <+10>: je 0x402106 <f_to_d+198>
0x0000000000402050 <+16>: shr $0x5,%rdx
0x0000000000402054 <+20>: lea 0x0(,%rdx,4),%r9
0x000000000040205c <+28>: test %r9,%r9
0x000000000040205f <+31>: je 0x402108 <f_to_d+200>
0x0000000000402065 <+37>: lea (%rdi,%r8,4),%rax
0x0000000000402069 <+41>: cmp $0xb,%r8
0x000000000040206d <+45>: lea (%rsi,%r8,8),%r10
0x0000000000402071 <+49>: seta %cl
0x0000000000402074 <+52>: cmp %rax,%rsi
0x0000000000402077 <+55>: seta %al
0x000000000040207a <+58>: cmp %r10,%rdi
0x000000000040207d <+61>: seta %r10b
0x0000000000402081 <+65>: or %r10d,%eax
0x0000000000402084 <+68>: test %al,%cl
0x0000000000402086 <+70>: je 0x402108 <f_to_d+200>
0x000000000040208c <+76>: xorps %xmm3,%xmm3
0x000000000040208f <+79>: xor %eax,%eax
0x0000000000402091 <+81>: xor %ecx,%ecx
0x0000000000402093 <+83>: nopl 0x0(%rax,%rax,1)
0x0000000000402098 <+88>: movaps %xmm3,%xmm0
0x000000000040209b <+91>: add $0x1,%rcx
0x000000000040209f <+95>: movlps (%rdi,%rax,1),%xmm0
0x00000000004020a3 <+99>: movhps 0x8(%rdi,%rax,1),%xmm0
0x00000000004020a8 <+104>: movhlps %xmm0,%xmm1
0x00000000004020ab <+107>: cvtps2pd %xmm0,%xmm2
0x00000000004020ae <+110>: cvtps2pd %xmm1,%xmm0
0x00000000004020b1 <+113>: movlpd %xmm2,(%rsi,%rax,2)
0x00000000004020b6 <+118>: movhpd %xmm2,0x8(%rsi,%rax,2)
0x00000000004020bc <+124>: movlpd %xmm0,0x10(%rsi,%rax,2)
0x00000000004020c2 <+130>: movhpd %xmm0,0x18(%rsi,%rax,2)
0x00000000004020c8 <+136>: add $0x10,%rax
0x00000000004020cc <+140>: cmp %rcx,%rdx
0x00000000004020cf <+143>: ja 0x402098 <f_to_d+88>
0x00000000004020d1 <+145>: cmp %r9,%r8
0x00000000004020d4 <+148>: lea (%rsi,%r9,8),%rsi
0x00000000004020d8 <+152>: lea (%rdi,%r9,4),%rdi
0x00000000004020dc <+156>: je 0x40210d <f_to_d+205>
0x00000000004020de <+158>: mov %r9,%rdx
0x00000000004020e1 <+161>: mov %r9,%rax
0x00000000004020e4 <+164>: neg %rdx
0x00000000004020e7 <+167>: lea (%rsi,%rdx,8),%rcx
0x00000000004020eb <+171>: lea (%rdi,%rdx,4),%rdx
0x00000000004020ef <+175>: nop
0x00000000004020f0 <+176>: movss (%rdx,%rax,4),%xmm0
0x00000000004020f5 <+181>: cvtps2pd %xmm0,%xmm0
0x00000000004020f8 <+184>: movsd %xmm0,(%rcx,%rax,8)
0x00000000004020fd <+189>: add $0x1,%rax
0x0000000000402101 <+193>: cmp %rax,%r8
0x0000000000402104 <+196>: ja 0x4020f0 <f_to_d+176>
0x0000000000402106 <+198>: repz retq
0x0000000000402108 <+200>: xor %r9d,%r9d
0x000000000040210b <+203>: jmp 0x4020de <f_to_d+158>
0x000000000040210d <+205>: nopl (%rax)
0x0000000000402110 <+208>: retq