C++ SSE加载/存储内存事务

C++ SSE加载/存储内存事务,c++,performance,sse,simd,C++,Performance,Sse,Simd,在使用SSE Intrinsic时,内存寄存器交互有两种方式: 中间指针: void f_sse(float *input, float *output, unsigned int n) { _m128 *input_sse = reinterpret_cast<__m128*>(input);//Input intermediate pointer _m128 *output_sse = reinterpret_cast<__m128*>(output);/

在使用SSE Intrinsic时,内存寄存器交互有两种方式:

中间指针:

// Adds 0.1f to the elements of `input` and stores the sums in `output`,
// four floats per step, by viewing both buffers as arrays of __m128.
//
// input  - source floats; dereferenced as __m128, so it must be 16-byte aligned.
// output - destination floats; same alignment requirement as `input`.
// n      - number of floats. Only the first n/4 complete groups of four are
//          processed; the trailing n%4 elements are neither read nor written.
void f_sse(float *input, float *output, unsigned int n)
{
   __m128 *input_sse = reinterpret_cast<__m128*>(input);//Input intermediate pointer
   __m128 *output_sse = reinterpret_cast<__m128*>(output);//Output intermediate pointer
   const __m128 s = _mm_set1_ps(0.1f); // the constant broadcast to all four lanes
   const unsigned int loop_size = n/4; // number of whole 4-float vectors
   // The index has the same unsigned type as the bound to avoid a
   // signed/unsigned comparison.
   for(unsigned int i=0; i<loop_size; ++i)
      output_sse[i] = _mm_add_ps(input_sse[i], s);
}
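void f_sse(float *input, float *output, unsigned int n)
{
   __m128 *input_sse = reinterpret_cast<__m128*>(input);//Input intermediate pointer
   __m128 *output_sse = reinterpret_cast<__m128*>(output);//Output intermediate pointer
   __m128 s = _mm_set1_ps(0.1f);
   auto loop_size = n/4;
   for(auto i=0; i<…

[原文在此处被截断:循环的其余部分以及第二种方式的代码(推测是显式使用 _mm_load_ps / _mm_store_ps 的版本)在提取时丢失。]

(在优化级别 O3 下用 g++ 编译)内部循环的汇编代码(使用 objdump -d 得到)是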

  20:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  24:   0f 58 c1                addps  %xmm1,%xmm0
  27:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  2b:   48 83 c0 10             add    $0x10,%rax
  2f:   48 39 d0                cmp    %rdx,%rax
  32:   75 ec                   jne    20

  10:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  14:   83 c1 04                add    $0x4,%ecx
  17:   0f 58 c1                addps  %xmm1,%xmm0
  1a:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  1e:   48 83 c0 10             add    $0x10,%rax
  22:   39 ca                   cmp    %ecx,%edx
  24:   77 ea                   ja     10

它们非常相似。在第一个版本中,g++ 只使用了一个计数器(循环内只有一条 add 指令)。因此我认为第一个版本更好。

我用 g++ -O2 编译了这两个示例,发现主要区别在于 edx(即 n)中的值的使用方式不同,这导致生成的代码略有不同。

第一个函数:

0000000000000000 <_Z6f_sse2PfS_j>:
   0:   c1 ea 02                shr    $0x2,%edx      # loop_size = n / 4. 
   3:   85 d2                   test   %edx,%edx
   5:   74 2d                   je     34 <_Z6f_sse2PfS_j+0x34>
   7:   83 ea 01                sub    $0x1,%edx
   a:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 11 <_Z6f_sse2PfS_j+0x11>
  11:   48 83 c2 01             add    $0x1,%rdx
  15:   31 c0                   xor    %eax,%eax
  17:   48 c1 e2 04             shl    $0x4,%rdx             // Adjust for loop size vs. index. 
  1b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  20:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  24:   0f 58 c1                addps  %xmm1,%xmm0
  27:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  2b:   48 83 c0 10             add    $0x10,%rax
  2f:   48 39 d0                cmp    %rdx,%rax
  32:   75 ec                   jne    20 <_Z6f_sse2PfS_j+0x20>
  34:   f3 c3                   repz retq 
0000000000000000 <_Z5f_ssePfS_j>:
   0:   85 d2                   test   %edx,%edx
   2:   74 22                   je     26 <_Z5f_ssePfS_j+0x26>
   4:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # b <_Z5f_ssePfS_j+0xb>
   b:   31 c0                   xor    %eax,%eax
   d:   31 c9                   xor    %ecx,%ecx
   f:   90                      nop
  10:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  14:   83 c1 04                add    $0x4,%ecx
  17:   0f 58 c1                addps  %xmm1,%xmm0
  1a:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  1e:   48 83 c0 10             add    $0x10,%rax
  22:   39 ca                   cmp    %ecx,%edx
  24:   77 ea                   ja     10 <_Z5f_ssePfS_j+0x10>
  26:   f3 c3                   repz retq 
0000000000000000 <_Z6f_sse2PfS_j>:
   0:   c1 ea 02                shr    $0x2,%edx      # loop_size = n / 4. 
   3:   85 d2                   test   %edx,%edx
   5:   74 2d                   je     34 <_Z6f_sse2PfS_j+0x34>
   7:   83 ea 01                sub    $0x1,%edx
   a:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 11 <_Z6f_sse2PfS_j+0x11>
  11:   48 83 c2 01             add    $0x1,%rdx
  15:   31 c0                   xor    %eax,%eax
  17:   48 c1 e2 04             shl    $0x4,%rdx             // Adjust for loop size vs. index. 
  1b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  20:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  24:   0f 58 c1                addps  %xmm1,%xmm0
  27:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  2b:   48 83 c0 10             add    $0x10,%rax
  2f:   48 39 d0                cmp    %rdx,%rax
  32:   75 ec                   jne    20 <_Z6f_sse2PfS_j+0x20>
  34:   f3 c3                   repz retq 
第二个函数:

0000000000000000 <_Z6f_sse2PfS_j>:
   0:   c1 ea 02                shr    $0x2,%edx      # loop_size = n / 4. 
   3:   85 d2                   test   %edx,%edx
   5:   74 2d                   je     34 <_Z6f_sse2PfS_j+0x34>
   7:   83 ea 01                sub    $0x1,%edx
   a:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 11 <_Z6f_sse2PfS_j+0x11>
  11:   48 83 c2 01             add    $0x1,%rdx
  15:   31 c0                   xor    %eax,%eax
  17:   48 c1 e2 04             shl    $0x4,%rdx             // Adjust for loop size vs. index. 
  1b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  20:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  24:   0f 58 c1                addps  %xmm1,%xmm0
  27:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  2b:   48 83 c0 10             add    $0x10,%rax
  2f:   48 39 d0                cmp    %rdx,%rax
  32:   75 ec                   jne    20 <_Z6f_sse2PfS_j+0x20>
  34:   f3 c3                   repz retq 
0000000000000000 <_Z5f_ssePfS_j>:
   0:   85 d2                   test   %edx,%edx
   2:   74 22                   je     26 <_Z5f_ssePfS_j+0x26>
   4:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # b <_Z5f_ssePfS_j+0xb>
   b:   31 c0                   xor    %eax,%eax
   d:   31 c9                   xor    %ecx,%ecx
   f:   90                      nop
  10:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  14:   83 c1 04                add    $0x4,%ecx
  17:   0f 58 c1                addps  %xmm1,%xmm0
  1a:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  1e:   48 83 c0 10             add    $0x10,%rax
  22:   39 ca                   cmp    %ecx,%edx
  24:   77 ea                   ja     10 <_Z5f_ssePfS_j+0x10>
  26:   f3 c3                   repz retq 
0000000000000000 <_Z5f_ssePfS_j>:
   0:   85 d2                   test   %edx,%edx
   2:   74 22                   je     26 <_Z5f_ssePfS_j+0x26>
   4:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # b <_Z5f_ssePfS_j+0xb>
   b:   31 c0                   xor    %eax,%eax
   d:   31 c9                   xor    %ecx,%ecx
   f:   90                      nop
  10:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  14:   83 c1 04                add    $0x4,%ecx
  17:   0f 58 c1                addps  %xmm1,%xmm0
  1a:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  1e:   48 83 c0 10             add    $0x10,%rax
  22:   39 ca                   cmp    %ecx,%edx
  24:   77 ea                   ja     10 <_Z5f_ssePfS_j+0x10>
  26:   f3 c3                   repz retq 
我还查看了生成的代码,并得出以下结论:

// Pointer-walking variant: adds 0.1f to the elements of `input` and stores
// the sums in `output`, one __m128 (four floats) per iteration.
//
// input  - source floats; dereferenced as __m128, so it must be 16-byte aligned.
// output - destination floats; same alignment requirement as `input`.
// n      - number of floats. Only the first n/4 complete groups of four are
//          processed; the trailing n%4 elements are neither read nor written.
void f_sse2(float *input, float *output, unsigned int n)
{
   __m128 *input_sse = reinterpret_cast<__m128*>(input);//Input intermediate pointer
   __m128 *output_sse = reinterpret_cast<__m128*>(output);//Output intermediate pointer
   // One past the last *complete* vector. Deriving the bound from &input[n]
   // would let the loop process a partial final vector when n is not a
   // multiple of 4 and touch memory beyond the n elements.
   __m128 *end = input_sse + n/4;
   const __m128 s = _mm_set1_ps(0.1f); // the constant broadcast to all four lanes
   while(input_sse < end)
      *output_sse++ = _mm_add_ps(*input_sse++, s);
}
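void f_sse2(float *input, float *output, unsigned int n)
{
    __m128 *end = reinterpret_cast<__m128*>(&input[n]);
   __m128 *input_sse = reinterpret_cast<__m128*>(input);//Input intermediate pointer
   __m128 *output_sse = reinterpret_cast<__m128*>(output);//Output intermediate pointer
   __m128 s = _mm_set1_ps(0.1f);
   while(input_sse < end)
      *output_sse++ = _mm_add_ps(*input_sse++, s);
}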
这将生成以下代码:

0000000000000000 <_Z6f_sse2PfS_j>:
   0:   89 d2                   mov    %edx,%edx
   2:   48 8d 04 97             lea    (%rdi,%rdx,4),%rax
   6:   48 39 c7                cmp    %rax,%rdi
   9:   73 23                   jae    2e <_Z6f_sse2PfS_j+0x2e>
   b:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 12 <_Z6f_sse2PfS_j+0x12>
  12:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  18:   0f 28 07                movaps (%rdi),%xmm0
  1b:   48 83 c7 10             add    $0x10,%rdi
  1f:   0f 58 c1                addps  %xmm1,%xmm0
  22:   0f 29 06                movaps %xmm0,(%rsi)
  25:   48 83 c6 10             add    $0x10,%rsi
  29:   48 39 f8                cmp    %rdi,%rax
  2c:   77 ea                   ja     18 <_Z6f_sse2PfS_j+0x18>
  2e:   f3 c3                   repz retq 
0000000000000000 <_Z6f_sse2PfS_j>:
   0:   89 d2                   mov    %edx,%edx
   2:   48 8d 04 97             lea    (%rdi,%rdx,4),%rax
   6:   48 39 c7                cmp    %rax,%rdi
   9:   73 23                   jae    2e <_Z6f_sse2PfS_j+0x2e>
   b:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 12 <_Z6f_sse2PfS_j+0x12>
  12:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  18:   0f 28 07                movaps (%rdi),%xmm0
  1b:   48 83 c7 10             add    $0x10,%rdi
  1f:   0f 58 c1                addps  %xmm1,%xmm0
  22:   0f 29 06                movaps %xmm0,(%rsi)
  25:   48 83 c6 10             add    $0x10,%rsi
  29:   48 39 f8                cmp    %rdi,%rax
  2c:   77 ea                   ja     18 <_Z6f_sse2PfS_j+0x18>
  2e:   f3 c3                   repz retq 

我认为这可能会更高效一点,但可能不值得为此修改代码。不过它让我有了 15 分钟的事情可做。

如果第一个示例的赋值操作使用了未对齐的指令,它会更慢。

_mm_store_ps 是对齐存储,不是吗?第一个示例类似于逐元素复制。

您能给出反汇编的输出吗?
0000000000000000 <_Z6f_sse2PfS_j>:
   0:   89 d2                   mov    %edx,%edx
   2:   48 8d 04 97             lea    (%rdi,%rdx,4),%rax
   6:   48 39 c7                cmp    %rax,%rdi
   9:   73 23                   jae    2e <_Z6f_sse2PfS_j+0x2e>
   b:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 12 <_Z6f_sse2PfS_j+0x12>
  12:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  18:   0f 28 07                movaps (%rdi),%xmm0
  1b:   48 83 c7 10             add    $0x10,%rdi
  1f:   0f 58 c1                addps  %xmm1,%xmm0
  22:   0f 29 06                movaps %xmm0,(%rsi)
  25:   48 83 c6 10             add    $0x10,%rsi
  29:   48 39 f8                cmp    %rdi,%rax
  2c:   77 ea                   ja     18 <_Z6f_sse2PfS_j+0x18>
  2e:   f3 c3                   repz retq