C++ SSE加载/存储内存事务_C++_Performance_Sse_Simd

C++ SSE加载/存储内存事务

c++ performance

C++ SSE加载/存储内存事务,c++,performance,sse,simd,C++,Performance,Sse,Simd,在使用SSE Intrinsic时，内存寄存器交互有两种方式：中间指针： void f_sse(float *input, float *output, unsigned int n) { _m128 *input_sse = reinterpret_cast<__m128*>(input);//Input intermediate pointer _m128 *output_sse = reinterpret_cast<__m128*>(output);/

在使用SSE Intrinsic时，内存寄存器交互有两种方式：

中间指针：

/**
 * Adds 0.1f to the elements of `input` and stores the sums in `output`,
 * processing four floats per iteration through intermediate __m128 pointers.
 *
 * @param input  Source buffer. It is dereferenced as __m128, so it must be
 *               16-byte aligned.
 * @param output Destination buffer; same alignment requirement as `input`.
 * @param n      Number of floats. Only the n/4 complete groups of four are
 *               processed; a tail of n % 4 elements is neither read nor written.
 */
void f_sse(float *input, float *output, unsigned int n)
{
   __m128 *input_sse = reinterpret_cast<__m128*>(input);   // Input intermediate pointer
   __m128 *output_sse = reinterpret_cast<__m128*>(output); // Output intermediate pointer
   const __m128 s = _mm_set1_ps(0.1f);                     // 0.1f broadcast to all four lanes
   const unsigned int loop_size = n/4;
   // The index uses the same unsigned type as loop_size, so the loop condition
   // does not mix signed and unsigned operands.
   for(unsigned int i=0; i<loop_size; ++i)
      output_sse[i] = _mm_add_ps(input_sse[i], s);
}

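void f_sse(float *input, float *output, unsigned int n)
{
   __m128 *input_sse = reinterpret_cast<__m128*>(input);//输入中间指针
   __m128 *output_sse = reinterpret_cast<__m128*>(output);//输出中间指针
   __m128 s = _mm_set1_ps(0.1f);
   auto loop_size = n/4;
   for(auto i=0; i<loop_size; ++i)
      output_sse[i] = _mm_add_ps(input_sse[i], s);
}

（在优化级别 O3 下用 g++ 编译后）两种方式内部循环的汇编代码（使用 objdump -d 查看）分别是：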
  20:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  24:   0f 58 c1                addps  %xmm1,%xmm0
  27:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  2b:   48 83 c0 10             add    $0x10,%rax
  2f:   48 39 d0                cmp    %rdx,%rax
  32:   75 ec                   jne    20

和
  10:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  14:   83 c1 04                add    $0x4,%ecx
  17:   0f 58 c1                addps  %xmm1,%xmm0
  1a:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  1e:   48 83 c0 10             add    $0x10,%rax
  22:   39 ca                   cmp    %ecx,%edx
  24:   77 ea                   ja     10

它们非常相似。在第一段代码中，g++ 只使用了一个计数器（只有一条 add 指令），因此我猜这种写法更好。
我用 g++ -O2 编译了这两个示例，发现主要区别在于 edx（n）中的值的使用方式不同，这导致生成的代码略有不同。
第一个函数：
0000000000000000 <_Z6f_sse2PfS_j>:
   0:   c1 ea 02                shr    $0x2,%edx      # loop_size = n / 4. 
   3:   85 d2                   test   %edx,%edx
   5:   74 2d                   je     34 <_Z6f_sse2PfS_j+0x34>
   7:   83 ea 01                sub    $0x1,%edx
   a:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 11 <_Z6f_sse2PfS_j+0x11>
  11:   48 83 c2 01             add    $0x1,%rdx
  15:   31 c0                   xor    %eax,%eax
  17:   48 c1 e2 04             shl    $0x4,%rdx             // Adjust for loop size vs. index. 
  1b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  20:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  24:   0f 58 c1                addps  %xmm1,%xmm0
  27:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  2b:   48 83 c0 10             add    $0x10,%rax
  2f:   48 39 d0                cmp    %rdx,%rax
  32:   75 ec                   jne    20 <_Z6f_sse2PfS_j+0x20>
  34:   f3 c3                   repz retq 

0000000000000000 <_Z5f_ssePfS_j>:
   0:   85 d2                   test   %edx,%edx
   2:   74 22                   je     26 <_Z5f_ssePfS_j+0x26>
   4:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # b <_Z5f_ssePfS_j+0xb>
   b:   31 c0                   xor    %eax,%eax
   d:   31 c9                   xor    %ecx,%ecx
   f:   90                      nop
  10:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  14:   83 c1 04                add    $0x4,%ecx
  17:   0f 58 c1                addps  %xmm1,%xmm0
  1a:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  1e:   48 83 c0 10             add    $0x10,%rax
  22:   39 ca                   cmp    %ecx,%edx
  24:   77 ea                   ja     10 <_Z5f_ssePfS_j+0x10>
  26:   f3 c3                   repz retq 

0000000000000000 <_Z6f_sse2PfS_j>:
   0:   c1 ea 02                shr    $0x2,%edx      # loop_size = n / 4.
   3:   85 d2                   test   %edx,%edx
   5:   74 2d                   je     34 <_Z6f_sse2PfS_j+0x34>
   7:   83 ea 01                sub    $0x1,%edx
   a:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 11 <_Z6f_sse2PfS_j+0x11>
  11:   48 83 c2 01             add    $0x1,%rdx
  15:   31 c0                   xor    %eax,%eax
  17:   48 c1 e2 04             shl    $0x4,%rdx             // 调整循环大小与索引。
  1b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  20:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  24:   0f 58 c1                addps  %xmm1,%xmm0
  27:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  2b:   48 83 c0 10             add    $0x10,%rax
  2f:   48 39 d0                cmp    %rdx,%rax
  32:   75 ec                   jne    20 <_Z6f_sse2PfS_j+0x20>
  34:   f3 c3                   repz retq

第二个函数：
0000000000000000 <_Z6f_sse2PfS_j>:
   0:   c1 ea 02                shr    $0x2,%edx      # loop_size = n / 4. 
   3:   85 d2                   test   %edx,%edx
   5:   74 2d                   je     34 <_Z6f_sse2PfS_j+0x34>
   7:   83 ea 01                sub    $0x1,%edx
   a:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 11 <_Z6f_sse2PfS_j+0x11>
  11:   48 83 c2 01             add    $0x1,%rdx
  15:   31 c0                   xor    %eax,%eax
  17:   48 c1 e2 04             shl    $0x4,%rdx             // Adjust for loop size vs. index. 
  1b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  20:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  24:   0f 58 c1                addps  %xmm1,%xmm0
  27:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  2b:   48 83 c0 10             add    $0x10,%rax
  2f:   48 39 d0                cmp    %rdx,%rax
  32:   75 ec                   jne    20 <_Z6f_sse2PfS_j+0x20>
  34:   f3 c3                   repz retq 

0000000000000000 <_Z5f_ssePfS_j>:
   0:   85 d2                   test   %edx,%edx
   2:   74 22                   je     26 <_Z5f_ssePfS_j+0x26>
   4:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # b <_Z5f_ssePfS_j+0xb>
   b:   31 c0                   xor    %eax,%eax
   d:   31 c9                   xor    %ecx,%ecx
   f:   90                      nop
  10:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  14:   83 c1 04                add    $0x4,%ecx
  17:   0f 58 c1                addps  %xmm1,%xmm0
  1a:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  1e:   48 83 c0 10             add    $0x10,%rax
  22:   39 ca                   cmp    %ecx,%edx
  24:   77 ea                   ja     10 <_Z5f_ssePfS_j+0x10>
  26:   f3 c3                   repz retq 

0000000000000000 <_Z5f_ssePfS_j>:
   0:   85 d2                   test   %edx,%edx
   2:   74 22                   je     26 <_Z5f_ssePfS_j+0x26>
   4:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # b <_Z5f_ssePfS_j+0xb>
   b:   31 c0                   xor    %eax,%eax
   d:   31 c9                   xor    %ecx,%ecx
   f:   90                      nop
  10:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  14:   83 c1 04                add    $0x4,%ecx
  17:   0f 58 c1                addps  %xmm1,%xmm0
  1a:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  1e:   48 83 c0 10             add    $0x10,%rax
  22:   39 ca                   cmp    %ecx,%edx
  24:   77 ea                   ja     10 <_Z5f_ssePfS_j+0x10>
  26:   f3 c3                   repz retq

我还查看了生成的代码，并据此写出了下面这个版本：
/**
 * Adds 0.1f to the elements of `input` and stores the sums in `output`,
 * walking both buffers with __m128 pointers until an end pointer is reached.
 *
 * @param input  Source buffer. It is dereferenced as __m128, so it must be
 *               16-byte aligned.
 * @param output Destination buffer; same alignment requirement as `input`.
 * @param n      Number of floats. Only complete groups of four are processed;
 *               a tail of n % 4 elements is neither read nor written.
 */
void f_sse2(float *input, float *output, unsigned int n)
{
   // Round n down to a multiple of four before forming the end pointer.
   // Using &input[n] directly would let the last 4-wide iteration run past
   // element n whenever n is not a multiple of four.
   __m128 *end = reinterpret_cast<__m128*>(input + (n - n % 4));
   __m128 *input_sse = reinterpret_cast<__m128*>(input);   // Input intermediate pointer
   __m128 *output_sse = reinterpret_cast<__m128*>(output); // Output intermediate pointer
   const __m128 s = _mm_set1_ps(0.1f);                     // 0.1f broadcast to all four lanes
   while(input_sse < end)
      *output_sse++ = _mm_add_ps(*input_sse++, s);
}

void f_sse2(float *input, float *output, unsigned int n)
{
    __m128 *end = reinterpret_cast<__m128*>(&input[n]);
   __m128 *input_sse = reinterpret_cast<__m128*>(input);//输入中间指针
   __m128 *output_sse = reinterpret_cast<__m128*>(output);//输出中间指针
   __m128 s = _mm_set1_ps(0.1f);
   while(input_sse < end)
      *output_sse++ = _mm_add_ps(*input_sse++, s);
}

这将生成以下代码：
0000000000000000 <_Z6f_sse2PfS_j>:
   0:   89 d2                   mov    %edx,%edx
   2:   48 8d 04 97             lea    (%rdi,%rdx,4),%rax
   6:   48 39 c7                cmp    %rax,%rdi
   9:   73 23                   jae    2e <_Z6f_sse2PfS_j+0x2e>
   b:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 12 <_Z6f_sse2PfS_j+0x12>
  12:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  18:   0f 28 07                movaps (%rdi),%xmm0
  1b:   48 83 c7 10             add    $0x10,%rdi
  1f:   0f 58 c1                addps  %xmm1,%xmm0
  22:   0f 29 06                movaps %xmm0,(%rsi)
  25:   48 83 c6 10             add    $0x10,%rsi
  29:   48 39 f8                cmp    %rdi,%rax
  2c:   77 ea                   ja     18 <_Z6f_sse2PfS_j+0x18>
  2e:   f3 c3                   repz retq 

0000000000000000 <_Z6f_sse2PfS_j>:
   0:   89 d2                   mov    %edx,%edx
   2:   48 8d 04 97             lea    (%rdi,%rdx,4),%rax
   6:   48 39 c7                cmp    %rax,%rdi
   9:   73 23                   jae    2e <_Z6f_sse2PfS_j+0x2e>
   b:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 12 <_Z6f_sse2PfS_j+0x12>
  12:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  18:   0f 28 07                movaps (%rdi),%xmm0
  1b:   48 83 c7 10             add    $0x10,%rdi
  1f:   0f 58 c1                addps  %xmm1,%xmm0
  22:   0f 29 06                movaps %xmm0,(%rsi)
  25:   48 83 c6 10             add    $0x10,%rsi
  29:   48 39 f8                cmp    %rdi,%rax
  2c:   77 ea                   ja     18 <_Z6f_sse2PfS_j+0x18>
  2e:   f3 c3                   repz retq

我认为这可能会稍微高效一点，但可能不值得为此修改代码。不过它让我有了 15 分钟的事情可做。

如果第一个示例中的赋值操作使用的是未对齐指令，它会更慢。

_mm_store_ps 是对齐存储，不是吗？第一个示例看起来像是逐元素复制。您能给出反汇编输出吗？
0000000000000000 <_Z6f_sse2PfS_j>:
   0:   89 d2                   mov    %edx,%edx
   2:   48 8d 04 97             lea    (%rdi,%rdx,4),%rax
   6:   48 39 c7                cmp    %rax,%rdi
   9:   73 23                   jae    2e <_Z6f_sse2PfS_j+0x2e>
   b:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 12 <_Z6f_sse2PfS_j+0x12>
  12:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  18:   0f 28 07                movaps (%rdi),%xmm0
  1b:   48 83 c7 10             add    $0x10,%rdi
  1f:   0f 58 c1                addps  %xmm1,%xmm0
  22:   0f 29 06                movaps %xmm0,(%rsi)
  25:   48 83 c6 10             add    $0x10,%rsi
  29:   48 39 f8                cmp    %rdi,%rax
  2c:   77 ea                   ja     18 <_Z6f_sse2PfS_j+0x18>
  2e:   f3 c3                   repz retq