C++ SSE加载/存储内存事务
标签:c++, performance, sse, simd。在使用 SSE intrinsics 时,内存与寄存器之间的交互有两种方式。第一种是使用中间指针:
// Adds 0.1f to every complete group of four floats in `input` and writes the
// results to `output`, accessing both buffers through intermediate __m128
// pointers instead of explicit load/store intrinsics.
//
// input  - source floats; dereferenced as __m128, so it must be 16-byte aligned.
// output - destination floats; same alignment requirement as `input`.
// n      - number of floats. Only the first n/4 full vectors are processed;
//          any trailing n%4 elements are left untouched.
void f_sse(float *input, float *output, unsigned int n)
{
    __m128 *input_sse = reinterpret_cast<__m128*>(input);   // Input intermediate pointer
    __m128 *output_sse = reinterpret_cast<__m128*>(output); // Output intermediate pointer
    const __m128 s = _mm_set1_ps(0.1f);                     // 0.1f broadcast to all four lanes
    const unsigned int loop_size = n/4;                     // number of complete vectors
    // The index is unsigned to match loop_size and avoid a signed/unsigned comparison.
    for(unsigned int i=0; i<loop_size; ++i)
        output_sse[i] = _mm_add_ps(input_sse[i], s);
}
void f_sse(浮点*输入,浮点*输出,无符号整数n)
{
_m128*input_sse=reinterpret_cast(输入);//输入中间指针
_m128*output_sse=reinterpret_cast(output);//输出中间指针
_m128 s=_mm_set1_ps(0.1f);
自动循环大小=n/4;
for(auto i=0; i<……(原文此处在提取时丢失了一段内容)
(在优化级别 -O3 下用 g++ 编译)内部循环的汇编代码(使用 objdump -d 得到)是:
20: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
24: 0f 58 c1 addps %xmm1,%xmm0
27: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
2b: 48 83 c0 10 add $0x10,%rax
2f: 48 39 d0 cmp %rdx,%rax
32: 75 ec jne 20
及
10: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
14: 83 c1 04 add $0x4,%ecx
17: 0f 58 c1 addps %xmm1,%xmm0
1a: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
1e: 48 83 c0 10 add $0x10,%rax
22: 39 ca cmp %ecx,%edx
24: 77 ea ja 10
它们非常相似。在第一种方式中,g++ 只使用了一个计数器(只有一条 add 指令),因此我认为它更好。
我用 g++ -O2 编译了这两个示例,发现主要区别在于 edx(即 n)中的值的使用方式不同,这导致生成的代码略有不同。
第一个函数:
0000000000000000 <_Z6f_sse2PfS_j>:
0: c1 ea 02 shr $0x2,%edx # loop_size = n / 4.
3: 85 d2 test %edx,%edx
5: 74 2d je 34 <_Z6f_sse2PfS_j+0x34>
7: 83 ea 01 sub $0x1,%edx
a: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # 11 <_Z6f_sse2PfS_j+0x11>
11: 48 83 c2 01 add $0x1,%rdx
15: 31 c0 xor %eax,%eax
17: 48 c1 e2 04 shl $0x4,%rdx // Adjust for loop size vs. index.
1b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
20: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
24: 0f 58 c1 addps %xmm1,%xmm0
27: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
2b: 48 83 c0 10 add $0x10,%rax
2f: 48 39 d0 cmp %rdx,%rax
32: 75 ec jne 20 <_Z6f_sse2PfS_j+0x20>
34: f3 c3 repz retq
0000000000000000 <_Z5f_ssePfS_j>:
0: 85 d2 test %edx,%edx
2: 74 22 je 26 <_Z5f_ssePfS_j+0x26>
4: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # b <_Z5f_ssePfS_j+0xb>
b: 31 c0 xor %eax,%eax
d: 31 c9 xor %ecx,%ecx
f: 90 nop
10: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
14: 83 c1 04 add $0x4,%ecx
17: 0f 58 c1 addps %xmm1,%xmm0
1a: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
1e: 48 83 c0 10 add $0x10,%rax
22: 39 ca cmp %ecx,%edx
24: 77 ea ja 10 <_Z5f_ssePfS_j+0x10>
26: f3 c3 repz retq
0000000000000000 <_Z6f_sse2PfS_j>:
0: c1 ea 02 shr $0x2,%edx # 循环大小 = n / 4。
3: 85 d2 test %edx,%edx
5: 74 2d je 34 <_Z6f_sse2PfS_j+0x34>
7: 83 ea 01 sub $0x1,%edx
a: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # 11 <_Z6f_sse2PfS_j+0x11>
11: 48 83 c2 01 add $0x1,%rdx
15: 31 c0 xor %eax,%eax
17: 48 c1 e2 04 shl $0x4,%rdx // 调整循环大小与索引。
1b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
20: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
24: 0f 58 c1 addps %xmm1,%xmm0
27: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
2b: 48 83 c0 10 add $0x10,%rax
2f: 48 39 d0 cmp %rdx,%rax
32: 75 ec jne 20 <_Z6f_sse2PfS_j+0x20>
34: f3 c3 repz retq
第二个函数:
0000000000000000 <_Z6f_sse2PfS_j>:
0: c1 ea 02 shr $0x2,%edx # loop_size = n / 4.
3: 85 d2 test %edx,%edx
5: 74 2d je 34 <_Z6f_sse2PfS_j+0x34>
7: 83 ea 01 sub $0x1,%edx
a: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # 11 <_Z6f_sse2PfS_j+0x11>
11: 48 83 c2 01 add $0x1,%rdx
15: 31 c0 xor %eax,%eax
17: 48 c1 e2 04 shl $0x4,%rdx // Adjust for loop size vs. index.
1b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
20: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
24: 0f 58 c1 addps %xmm1,%xmm0
27: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
2b: 48 83 c0 10 add $0x10,%rax
2f: 48 39 d0 cmp %rdx,%rax
32: 75 ec jne 20 <_Z6f_sse2PfS_j+0x20>
34: f3 c3 repz retq
0000000000000000 <_Z5f_ssePfS_j>:
0: 85 d2 test %edx,%edx
2: 74 22 je 26 <_Z5f_ssePfS_j+0x26>
4: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # b <_Z5f_ssePfS_j+0xb>
b: 31 c0 xor %eax,%eax
d: 31 c9 xor %ecx,%ecx
f: 90 nop
10: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
14: 83 c1 04 add $0x4,%ecx
17: 0f 58 c1 addps %xmm1,%xmm0
1a: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
1e: 48 83 c0 10 add $0x10,%rax
22: 39 ca cmp %ecx,%edx
24: 77 ea ja 10 <_Z5f_ssePfS_j+0x10>
26: f3 c3 repz retq
0000000000000000 <_Z5f_ssePfS_j>:
0: 85 d2 test %edx,%edx
2: 74 22 je 26 <_Z5f_ssePfS_j+0x26>
4: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # b <_Z5f_ssePfS_j+0xb>
b: 31 c0 xor %eax,%eax
d: 31 c9 xor %ecx,%ecx
f: 90 nop
10: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
14: 83 c1 04 add $0x4,%ecx
17: 0f 58 c1 addps %xmm1,%xmm0
1a: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
1e: 48 83 c0 10 add $0x10,%rax
22: 39 ca cmp %ecx,%edx
24: 77 ea ja 10 <_Z5f_ssePfS_j+0x10>
26: f3 c3 repz retq
我还查看了生成的代码,并写出了下面这个版本:
// Pointer-walking variant: adds 0.1f to every complete group of four floats in
// `input` and writes the results to `output`, advancing two __m128 pointers
// until the end of the last full vector.
//
// input  - source floats; dereferenced as __m128, so it must be 16-byte aligned.
// output - destination floats; same alignment requirement as `input`.
// n      - number of floats. Only the first n/4 full vectors are processed;
//          any trailing n%4 elements are left untouched.
void f_sse2(float *input, float *output, unsigned int n)
{
    __m128 *input_sse = reinterpret_cast<__m128*>(input);   // Input intermediate pointer
    __m128 *output_sse = reinterpret_cast<__m128*>(output); // Output intermediate pointer
    // End after the last COMPLETE vector. Using &input[n] as the bound lets the
    // loop run one extra iteration when n is not a multiple of 4, reading and
    // writing up to three floats past the end of both buffers.
    const __m128 *const end = input_sse + n/4;
    const __m128 s = _mm_set1_ps(0.1f);                     // 0.1f broadcast to all four lanes
    while(input_sse < end)
        *output_sse++ = _mm_add_ps(*input_sse++, s);
}
void f_sse2(浮点*输入,浮点*输出,无符号整数n)
{
__m128*end=重新解释强制转换(&input[n]);
__m128*input_sse=reinterpret_cast(输入);//输入中间指针
__m128*output_sse=reinterpret_cast(output);//输出中间指针
__m128 s=_mm_set1_ps(0.1f);
while(输入值<结束)
*输出_sse++=_mm_add_ps(*输入_sse++,s);
}
这将生成以下代码:
0000000000000000 <_Z6f_sse2PfS_j>:
0: 89 d2 mov %edx,%edx
2: 48 8d 04 97 lea (%rdi,%rdx,4),%rax
6: 48 39 c7 cmp %rax,%rdi
9: 73 23 jae 2e <_Z6f_sse2PfS_j+0x2e>
b: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # 12 <_Z6f_sse2PfS_j+0x12>
12: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
18: 0f 28 07 movaps (%rdi),%xmm0
1b: 48 83 c7 10 add $0x10,%rdi
1f: 0f 58 c1 addps %xmm1,%xmm0
22: 0f 29 06 movaps %xmm0,(%rsi)
25: 48 83 c6 10 add $0x10,%rsi
29: 48 39 f8 cmp %rdi,%rax
2c: 77 ea ja 18 <_Z6f_sse2PfS_j+0x18>
2e: f3 c3 repz retq
0000000000000000 <_Z6f_sse2PfS_j>:
0: 89 d2 mov %edx,%edx
2: 48 8d 04 97 lea (%rdi,%rdx,4),%rax
6: 48 39 c7 cmp %rax,%rdi
9: 73 23 jae 2e <_Z6f_sse2PfS_j+0x2e>
b: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # 12 <_Z6f_sse2PfS_j+0x12>
12: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
18: 0f 28 07 movaps (%rdi),%xmm0
1b: 48 83 c7 10 add $0x10,%rdi
1f: 0f 58 c1 addps %xmm1,%xmm0
22: 0f 29 06 movaps %xmm0,(%rsi)
25: 48 83 c6 10 add $0x10,%rsi
29: 48 39 f8 cmp %rdi,%rax
2c: 77 ea ja 18 <_Z6f_sse2PfS_j+0x18>
2e: f3 c3 repz retq
我认为这可能稍微高效一点,但可能不值得为此修改代码。不过它让我有 15 分钟的事情可做。如果第一个示例的赋值操作使用的是未对齐指令,它会更慢。_mm_store_ps 是对齐存储,不是吗?第一个示例类似于逐元素拷贝。您能给出反汇编程序的输出吗?
0000000000000000 <_Z6f_sse2PfS_j>:
0: 89 d2 mov %edx,%edx
2: 48 8d 04 97 lea (%rdi,%rdx,4),%rax
6: 48 39 c7 cmp %rax,%rdi
9: 73 23 jae 2e <_Z6f_sse2PfS_j+0x2e>
b: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # 12 <_Z6f_sse2PfS_j+0x12>
12: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
18: 0f 28 07 movaps (%rdi),%xmm0
1b: 48 83 c7 10 add $0x10,%rdi
1f: 0f 58 c1 addps %xmm1,%xmm0
22: 0f 29 06 movaps %xmm0,(%rsi)
25: 48 83 c6 10 add $0x10,%rsi
29: 48 39 f8 cmp %rdi,%rax
2c: 77 ea ja 18 <_Z6f_sse2PfS_j+0x18>
2e: f3 c3 repz retq