Warning: file_get_contents(/data/phpspider/zhask/data//catemap/6/cplusplus/148.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181

Warning: file_get_contents(/data/phpspider/zhask/data//catemap/0/assembly/5.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181

Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/maven/5.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
C++ SSE矢量包装器类型性能与bare _m128相比_C++_Assembly_Optimization_X86_Sse - Fatal编程技术网

C++ SSE矢量包装器类型性能与bare _m128相比

C++ SSE矢量包装器类型性能与bare _m128相比,c++,assembly,optimization,x86,sse,C++,Assembly,Optimization,X86,Sse,我发现了一个有关SIMD陷阱的有趣信息,它指出使用包装器类型不可能达到“纯”类型的性能。我对此表示怀疑,所以我下载了项目文件并制作了一个类似的测试用例 事实证明(让我惊讶的是)包装器版本要慢得多。由于我不想只谈空谈,测试用例如下: 在第一种情况下Vec4是\u m128类型的简单别名,带有一些运算符: #include <xmmintrin.h> #include <emmintrin.h> using Vec4 = __m128; inline __m128 VLo

我发现了一个有关SIMD陷阱的有趣信息,它指出使用包装器类型不可能达到“纯”类型的性能。我对此表示怀疑,所以我下载了项目文件并制作了一个类似的测试用例

事实证明(让我惊讶的是)包装器版本要慢得多。由于我不想只谈空谈,测试用例如下:

在第一种情况下
Vec4
\u m128
类型的简单别名,带有一些运算符:

#include <xmmintrin.h>
#include <emmintrin.h>

using Vec4 = __m128;

inline __m128 VLoad(float f)
{
    return _mm_set_ps(f, f, f, f);
};

inline Vec4& operator+=(Vec4 &va, Vec4 vb)
{
    return (va = _mm_add_ps(va, vb));
};

inline Vec4& operator*=(Vec4 &va, Vec4 vb)
{
    return (va = _mm_mul_ps(va, vb));
};

inline Vec4 operator+(Vec4 va, Vec4 vb)
{
    return _mm_add_ps(va, vb);
};

inline Vec4 operator-(Vec4 va, Vec4 vb)
{
    return _mm_sub_ps(va, vb);
};

inline Vec4 operator*(Vec4 va, Vec4 vb)
{
    return _mm_mul_ps(va, vb);
};
这里是测试内核,它使用不同版本的
Vec4
产生不同的性能:

#include <xmmintrin.h>
#include <emmintrin.h>

struct EQSTATE
{
    // Filter #1 (Low band)

    Vec4  lf;       // Frequency
    Vec4  f1p0;     // Poles ...
    Vec4  f1p1;     
    Vec4  f1p2;
    Vec4  f1p3;

    // Filter #2 (High band)

    Vec4  hf;       // Frequency
    Vec4  f2p0;     // Poles ...
    Vec4  f2p1;
    Vec4  f2p2;
    Vec4  f2p3;

    // Sample history buffer

    Vec4  sdm1;     // Sample data minus 1
    Vec4  sdm2;     //                   2
    Vec4  sdm3;     //                   3

    // Gain Controls

    Vec4  lg;       // low  gain
    Vec4  mg;       // mid  gain
    Vec4  hg;       // high gain

};  

static float vsaf = (1.0f / 4294967295.0f);   // Very small amount (Denormal Fix)
static Vec4 vsa = VLoad(vsaf);

Vec4 TestEQ(EQSTATE* es, Vec4& sample)
{
    // Locals

    Vec4  l,m,h;      // Low / Mid / High - Sample Values

    // Filter #1 (lowpass)

    es->f1p0  += (es->lf * (sample   - es->f1p0)) + vsa;
    //es->f1p0 = VAdd(es->f1p0, VAdd(VMul(es->lf, VSub(sample, es->f1p0)), vsa));

    es->f1p1  += (es->lf * (es->f1p0 - es->f1p1));
    //es->f1p1 = VAdd(es->f1p1, VMul(es->lf, VSub(es->f1p0, es->f1p1)));

    es->f1p2  += (es->lf * (es->f1p1 - es->f1p2));
    //es->f1p2 = VAdd(es->f1p2, VMul(es->lf, VSub(es->f1p1, es->f1p2)));

    es->f1p3  += (es->lf * (es->f1p2 - es->f1p3));
    //es->f1p3 = VAdd(es->f1p3, VMul(es->lf, VSub(es->f1p2, es->f1p3)));

    l          = es->f1p3;

    // Filter #2 (highpass)

    es->f2p0  += (es->hf * (sample   - es->f2p0)) + vsa;
    //es->f2p0 = VAdd(es->f2p0, VAdd(VMul(es->hf, VSub(sample, es->f2p0)), vsa));

    es->f2p1  += (es->hf * (es->f2p0 - es->f2p1));
    //es->f2p1 = VAdd(es->f2p1, VMul(es->hf, VSub(es->f2p0, es->f2p1)));

    es->f2p2  += (es->hf * (es->f2p1 - es->f2p2));
    //es->f2p2 = VAdd(es->f2p2, VMul(es->hf, VSub(es->f2p1, es->f2p2)));

    es->f2p3  += (es->hf * (es->f2p2 - es->f2p3));
    //es->f2p3 = VAdd(es->f2p3, VMul(es->hf, VSub(es->f2p2, es->f2p3)));

    h          = es->sdm3 - es->f2p3;
    //h = VSub(es->sdm3, es->f2p3);

    // Calculate midrange (signal - (low + high))

    m          = es->sdm3 - (h + l);
    //m = VSub(es->sdm3, VAdd(h, l));

    // Scale, Combine and store

    l         *= es->lg;
    m         *= es->mg;
    h         *= es->hg;

    //l = VMul(l, es->lg);
    //m = VMul(m, es->mg);
    //h = VMul(h, es->hg);

    // Shuffle history buffer 

    es->sdm3   = es->sdm2;
    es->sdm2   = es->sdm1;
    es->sdm1   = sample;                

    // Return result

    return(l + m + h);
    //return(VAdd(l, VAdd(m, h)));
}

//make these as globals to enforce the function call;
static Vec4 sample[1024], result[1024];
static EQSTATE es;

#include <chrono>
#include <iostream>

int main()
{
    auto t0 = std::chrono::high_resolution_clock::now();

    for (int ii=0; ii<1024; ii++)
    {
        result[ii] = TestEQ(&es, sample[ii]);
    }

    auto t1 = std::chrono::high_resolution_clock::now();
    auto t = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
    std::cout << "timing: " << t << '\n';

    std::cin.get();

    return 0;
}
MSVC 2015为第二版生成的程序集:

;   COMDAT ?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z
_TEXT   SEGMENT
?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z PROC      ; TestEQ, COMDAT
; _es$dead$ = ecx
; _sample$ = edx
    vmovaps xmm0, XMMWORD PTR [edx]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80
    vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4
    vmovaps xmm0, XMMWORD PTR [edx]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vsubps  xmm2, xmm1, xmm0
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0
    vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0
    vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0
    vmovaps xmm0, XMMWORD PTR [edx]
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0
    vaddps  xmm0, xmm4, xmm2
    vsubps  xmm0, xmm1, xmm0
    vmulps  xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224
    vmulps  xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240
    vaddps  xmm1, xmm1, xmm0
    vmulps  xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208
    vaddps  xmm0, xmm1, xmm0
    ret 0
?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z ENDP      ; TestEQ
?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z PROC ; TestEQ, COMDAT
; ___$ReturnUdt$ = ecx
; _es$dead$ = edx
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -8                 ; fffffff8H
    add esp, 4
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov eax, DWORD PTR _sample$[ebx]
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A
    vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192
    mov DWORD PTR [esp+4], ebp
    vmovaps xmm0, XMMWORD PTR [eax]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4
    vmovaps xmm0, XMMWORD PTR [eax]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vsubps  xmm2, xmm1, xmm0
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0
    vaddps  xmm0, xmm2, xmm4
    vsubps  xmm0, xmm1, xmm0
    vmulps  xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224
    vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176
    vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0
    vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160
    vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0
    vmovdqu xmm0, XMMWORD PTR [eax]
    vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0
    vmulps  xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208
    vaddps  xmm1, xmm0, xmm1
    vmulps  xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240
    vaddps  xmm0, xmm1, xmm0
    vmovaps XMMWORD PTR [ecx], xmm0
    mov eax, ecx
    pop ebp
    mov esp, ebx
    pop ebx
    ret 0
?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z ENDP ; TestEQ
"?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z": # @"\01?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z"
Lfunc_begin0:
Ltmp0:
# BB#0:                                 # %entry
    movl    8(%esp), %eax
    movl    4(%esp), %ecx
    vmovaps _vsa, %xmm0
    vmovaps (%ecx), %xmm1
    vmovaps 16(%ecx), %xmm2
    vmovaps (%eax), %xmm3
    vsubps  %xmm2, %xmm3, %xmm3
    vmulps  %xmm3, %xmm1, %xmm3
    vaddps  %xmm3, %xmm0, %xmm3
    vaddps  %xmm3, %xmm2, %xmm2
    vmovaps %xmm2, 16(%ecx)
    vmovaps 32(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 32(%ecx)
    vmovaps 48(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 48(%ecx)
    vmovaps 64(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm1
    vaddps  %xmm1, %xmm3, %xmm1
    vmovaps %xmm1, 64(%ecx)
    vmovaps 80(%ecx), %xmm2
    vmovaps 96(%ecx), %xmm3
    vmovaps (%eax), %xmm4
    vsubps  %xmm3, %xmm4, %xmm4
    vmulps  %xmm4, %xmm2, %xmm4
    vaddps  %xmm4, %xmm0, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 96(%ecx)
    vmovaps 112(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 112(%ecx)
    vmovaps 128(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 128(%ecx)
    vmovaps 144(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 144(%ecx)
    vmovaps 192(%ecx), %xmm2
    vsubps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm1, %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  208(%ecx), %xmm1, %xmm1
    vmulps  224(%ecx), %xmm2, %xmm2
    vmulps  240(%ecx), %xmm0, %xmm0
    vmovaps 176(%ecx), %xmm3
    vmovaps %xmm3, 192(%ecx)
    vmovaps 160(%ecx), %xmm3
    vmovaps %xmm3, 176(%ecx)
    vmovaps (%eax), %xmm3
    vmovaps %xmm3, 160(%ecx)
    vaddps  %xmm2, %xmm0, %xmm0
    vaddps  %xmm0, %xmm1, %xmm0
    retl
Lfunc_end0:
"?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z": # @"\01?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z"
Lfunc_begin0:
Ltmp0:
# BB#0:                                 # %entry
    movl    12(%esp), %ecx
    movl    8(%esp), %edx
    vmovaps (%edx), %xmm0
    vmovaps 16(%edx), %xmm1
    vmovaps (%ecx), %xmm2
    vsubps  %xmm1, %xmm2, %xmm2
    vmulps  %xmm0, %xmm2, %xmm2
    vaddps  _vsa, %xmm2, %xmm2
    vaddps  %xmm2, %xmm1, %xmm1
    vmovaps %xmm1, 16(%edx)
    vmovaps 32(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm1
    vaddps  %xmm1, %xmm2, %xmm1
    vmovaps %xmm1, 32(%edx)
    vmovaps 48(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm1
    vaddps  %xmm1, %xmm2, %xmm1
    vmovaps %xmm1, 48(%edx)
    vmovaps 64(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm0
    vaddps  %xmm0, %xmm2, %xmm0
    vmovaps %xmm0, 64(%edx)
    vmovaps 80(%edx), %xmm1
    vmovaps 96(%edx), %xmm2
    vmovaps (%ecx), %xmm3
    vsubps  %xmm2, %xmm3, %xmm3
    vmulps  %xmm1, %xmm3, %xmm3
    vaddps  _vsa, %xmm3, %xmm3
    vaddps  %xmm3, %xmm2, %xmm2
    vmovaps %xmm2, 96(%edx)
    vmovaps 112(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 112(%edx)
    vmovaps 128(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 128(%edx)
    vmovaps 144(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm1
    vaddps  %xmm1, %xmm3, %xmm1
    vmovaps %xmm1, 144(%edx)
    vmovaps 192(%edx), %xmm2
    vsubps  %xmm1, %xmm2, %xmm1
    vaddps  %xmm1, %xmm0, %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  208(%edx), %xmm0, %xmm0
    vmulps  224(%edx), %xmm2, %xmm2
    movl    4(%esp), %eax
    vmulps  240(%edx), %xmm1, %xmm1
    vmovaps 176(%edx), %xmm3
    vmovaps %xmm3, 192(%edx)
    vmovaps 160(%edx), %xmm3
    vmovaps %xmm3, 176(%edx)
    vmovaps (%ecx), %xmm3
    vmovaps %xmm3, 160(%edx)
    vaddps  %xmm2, %xmm0, %xmm0
    vaddps  %xmm0, %xmm1, %xmm0
    vmovaps %xmm0, (%eax)
    retl
Lfunc_end0:
生产的第二个版本的装配明显更长、更慢。它与VisualStudio没有严格的关系,因为Clang3.8会产生类似的性能结果


Clang 3.8为第1版生成的程序集:

;   COMDAT ?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z
_TEXT   SEGMENT
?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z PROC      ; TestEQ, COMDAT
; _es$dead$ = ecx
; _sample$ = edx
    vmovaps xmm0, XMMWORD PTR [edx]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80
    vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4
    vmovaps xmm0, XMMWORD PTR [edx]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vsubps  xmm2, xmm1, xmm0
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0
    vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0
    vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0
    vmovaps xmm0, XMMWORD PTR [edx]
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0
    vaddps  xmm0, xmm4, xmm2
    vsubps  xmm0, xmm1, xmm0
    vmulps  xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224
    vmulps  xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240
    vaddps  xmm1, xmm1, xmm0
    vmulps  xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208
    vaddps  xmm0, xmm1, xmm0
    ret 0
?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z ENDP      ; TestEQ
?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z PROC ; TestEQ, COMDAT
; ___$ReturnUdt$ = ecx
; _es$dead$ = edx
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -8                 ; fffffff8H
    add esp, 4
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov eax, DWORD PTR _sample$[ebx]
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A
    vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192
    mov DWORD PTR [esp+4], ebp
    vmovaps xmm0, XMMWORD PTR [eax]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4
    vmovaps xmm0, XMMWORD PTR [eax]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vsubps  xmm2, xmm1, xmm0
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0
    vaddps  xmm0, xmm2, xmm4
    vsubps  xmm0, xmm1, xmm0
    vmulps  xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224
    vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176
    vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0
    vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160
    vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0
    vmovdqu xmm0, XMMWORD PTR [eax]
    vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0
    vmulps  xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208
    vaddps  xmm1, xmm0, xmm1
    vmulps  xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240
    vaddps  xmm0, xmm1, xmm0
    vmovaps XMMWORD PTR [ecx], xmm0
    mov eax, ecx
    pop ebp
    mov esp, ebx
    pop ebx
    ret 0
?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z ENDP ; TestEQ
"?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z": # @"\01?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z"
Lfunc_begin0:
Ltmp0:
# BB#0:                                 # %entry
    movl    8(%esp), %eax
    movl    4(%esp), %ecx
    vmovaps _vsa, %xmm0
    vmovaps (%ecx), %xmm1
    vmovaps 16(%ecx), %xmm2
    vmovaps (%eax), %xmm3
    vsubps  %xmm2, %xmm3, %xmm3
    vmulps  %xmm3, %xmm1, %xmm3
    vaddps  %xmm3, %xmm0, %xmm3
    vaddps  %xmm3, %xmm2, %xmm2
    vmovaps %xmm2, 16(%ecx)
    vmovaps 32(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 32(%ecx)
    vmovaps 48(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 48(%ecx)
    vmovaps 64(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm1
    vaddps  %xmm1, %xmm3, %xmm1
    vmovaps %xmm1, 64(%ecx)
    vmovaps 80(%ecx), %xmm2
    vmovaps 96(%ecx), %xmm3
    vmovaps (%eax), %xmm4
    vsubps  %xmm3, %xmm4, %xmm4
    vmulps  %xmm4, %xmm2, %xmm4
    vaddps  %xmm4, %xmm0, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 96(%ecx)
    vmovaps 112(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 112(%ecx)
    vmovaps 128(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 128(%ecx)
    vmovaps 144(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 144(%ecx)
    vmovaps 192(%ecx), %xmm2
    vsubps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm1, %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  208(%ecx), %xmm1, %xmm1
    vmulps  224(%ecx), %xmm2, %xmm2
    vmulps  240(%ecx), %xmm0, %xmm0
    vmovaps 176(%ecx), %xmm3
    vmovaps %xmm3, 192(%ecx)
    vmovaps 160(%ecx), %xmm3
    vmovaps %xmm3, 176(%ecx)
    vmovaps (%eax), %xmm3
    vmovaps %xmm3, 160(%ecx)
    vaddps  %xmm2, %xmm0, %xmm0
    vaddps  %xmm0, %xmm1, %xmm0
    retl
Lfunc_end0:
"?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z": # @"\01?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z"
Lfunc_begin0:
Ltmp0:
# BB#0:                                 # %entry
    movl    12(%esp), %ecx
    movl    8(%esp), %edx
    vmovaps (%edx), %xmm0
    vmovaps 16(%edx), %xmm1
    vmovaps (%ecx), %xmm2
    vsubps  %xmm1, %xmm2, %xmm2
    vmulps  %xmm0, %xmm2, %xmm2
    vaddps  _vsa, %xmm2, %xmm2
    vaddps  %xmm2, %xmm1, %xmm1
    vmovaps %xmm1, 16(%edx)
    vmovaps 32(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm1
    vaddps  %xmm1, %xmm2, %xmm1
    vmovaps %xmm1, 32(%edx)
    vmovaps 48(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm1
    vaddps  %xmm1, %xmm2, %xmm1
    vmovaps %xmm1, 48(%edx)
    vmovaps 64(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm0
    vaddps  %xmm0, %xmm2, %xmm0
    vmovaps %xmm0, 64(%edx)
    vmovaps 80(%edx), %xmm1
    vmovaps 96(%edx), %xmm2
    vmovaps (%ecx), %xmm3
    vsubps  %xmm2, %xmm3, %xmm3
    vmulps  %xmm1, %xmm3, %xmm3
    vaddps  _vsa, %xmm3, %xmm3
    vaddps  %xmm3, %xmm2, %xmm2
    vmovaps %xmm2, 96(%edx)
    vmovaps 112(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 112(%edx)
    vmovaps 128(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 128(%edx)
    vmovaps 144(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm1
    vaddps  %xmm1, %xmm3, %xmm1
    vmovaps %xmm1, 144(%edx)
    vmovaps 192(%edx), %xmm2
    vsubps  %xmm1, %xmm2, %xmm1
    vaddps  %xmm1, %xmm0, %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  208(%edx), %xmm0, %xmm0
    vmulps  224(%edx), %xmm2, %xmm2
    movl    4(%esp), %eax
    vmulps  240(%edx), %xmm1, %xmm1
    vmovaps 176(%edx), %xmm3
    vmovaps %xmm3, 192(%edx)
    vmovaps 160(%edx), %xmm3
    vmovaps %xmm3, 176(%edx)
    vmovaps (%ecx), %xmm3
    vmovaps %xmm3, 160(%edx)
    vaddps  %xmm2, %xmm0, %xmm0
    vaddps  %xmm0, %xmm1, %xmm0
    vmovaps %xmm0, (%eax)
    retl
Lfunc_end0:
Clang 3.8为第二版生成的程序集:

;   COMDAT ?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z
_TEXT   SEGMENT
?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z PROC      ; TestEQ, COMDAT
; _es$dead$ = ecx
; _sample$ = edx
    vmovaps xmm0, XMMWORD PTR [edx]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80
    vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4
    vmovaps xmm0, XMMWORD PTR [edx]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vsubps  xmm2, xmm1, xmm0
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0
    vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0
    vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0
    vmovaps xmm0, XMMWORD PTR [edx]
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0
    vaddps  xmm0, xmm4, xmm2
    vsubps  xmm0, xmm1, xmm0
    vmulps  xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224
    vmulps  xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240
    vaddps  xmm1, xmm1, xmm0
    vmulps  xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208
    vaddps  xmm0, xmm1, xmm0
    ret 0
?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z ENDP      ; TestEQ
?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z PROC ; TestEQ, COMDAT
; ___$ReturnUdt$ = ecx
; _es$dead$ = edx
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -8                 ; fffffff8H
    add esp, 4
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov eax, DWORD PTR _sample$[ebx]
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A
    vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192
    mov DWORD PTR [esp+4], ebp
    vmovaps xmm0, XMMWORD PTR [eax]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
    vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4
    vmovaps xmm0, XMMWORD PTR [eax]
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0
    vsubps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
    vsubps  xmm2, xmm1, xmm0
    vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0
    vaddps  xmm0, xmm2, xmm4
    vsubps  xmm0, xmm1, xmm0
    vmulps  xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224
    vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176
    vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0
    vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160
    vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0
    vmovdqu xmm0, XMMWORD PTR [eax]
    vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0
    vmulps  xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208
    vaddps  xmm1, xmm0, xmm1
    vmulps  xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240
    vaddps  xmm0, xmm1, xmm0
    vmovaps XMMWORD PTR [ecx], xmm0
    mov eax, ecx
    pop ebp
    mov esp, ebx
    pop ebx
    ret 0
?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z ENDP ; TestEQ
"?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z": # @"\01?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z"
Lfunc_begin0:
Ltmp0:
# BB#0:                                 # %entry
    movl    8(%esp), %eax
    movl    4(%esp), %ecx
    vmovaps _vsa, %xmm0
    vmovaps (%ecx), %xmm1
    vmovaps 16(%ecx), %xmm2
    vmovaps (%eax), %xmm3
    vsubps  %xmm2, %xmm3, %xmm3
    vmulps  %xmm3, %xmm1, %xmm3
    vaddps  %xmm3, %xmm0, %xmm3
    vaddps  %xmm3, %xmm2, %xmm2
    vmovaps %xmm2, 16(%ecx)
    vmovaps 32(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 32(%ecx)
    vmovaps 48(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 48(%ecx)
    vmovaps 64(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm1
    vaddps  %xmm1, %xmm3, %xmm1
    vmovaps %xmm1, 64(%ecx)
    vmovaps 80(%ecx), %xmm2
    vmovaps 96(%ecx), %xmm3
    vmovaps (%eax), %xmm4
    vsubps  %xmm3, %xmm4, %xmm4
    vmulps  %xmm4, %xmm2, %xmm4
    vaddps  %xmm4, %xmm0, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 96(%ecx)
    vmovaps 112(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 112(%ecx)
    vmovaps 128(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 128(%ecx)
    vmovaps 144(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 144(%ecx)
    vmovaps 192(%ecx), %xmm2
    vsubps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm1, %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  208(%ecx), %xmm1, %xmm1
    vmulps  224(%ecx), %xmm2, %xmm2
    vmulps  240(%ecx), %xmm0, %xmm0
    vmovaps 176(%ecx), %xmm3
    vmovaps %xmm3, 192(%ecx)
    vmovaps 160(%ecx), %xmm3
    vmovaps %xmm3, 176(%ecx)
    vmovaps (%eax), %xmm3
    vmovaps %xmm3, 160(%ecx)
    vaddps  %xmm2, %xmm0, %xmm0
    vaddps  %xmm0, %xmm1, %xmm0
    retl
Lfunc_end0:
"?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z": # @"\01?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z"
Lfunc_begin0:
Ltmp0:
# BB#0:                                 # %entry
    movl    12(%esp), %ecx
    movl    8(%esp), %edx
    vmovaps (%edx), %xmm0
    vmovaps 16(%edx), %xmm1
    vmovaps (%ecx), %xmm2
    vsubps  %xmm1, %xmm2, %xmm2
    vmulps  %xmm0, %xmm2, %xmm2
    vaddps  _vsa, %xmm2, %xmm2
    vaddps  %xmm2, %xmm1, %xmm1
    vmovaps %xmm1, 16(%edx)
    vmovaps 32(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm1
    vaddps  %xmm1, %xmm2, %xmm1
    vmovaps %xmm1, 32(%edx)
    vmovaps 48(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm1
    vaddps  %xmm1, %xmm2, %xmm1
    vmovaps %xmm1, 48(%edx)
    vmovaps 64(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm0
    vaddps  %xmm0, %xmm2, %xmm0
    vmovaps %xmm0, 64(%edx)
    vmovaps 80(%edx), %xmm1
    vmovaps 96(%edx), %xmm2
    vmovaps (%ecx), %xmm3
    vsubps  %xmm2, %xmm3, %xmm3
    vmulps  %xmm1, %xmm3, %xmm3
    vaddps  _vsa, %xmm3, %xmm3
    vaddps  %xmm3, %xmm2, %xmm2
    vmovaps %xmm2, 96(%edx)
    vmovaps 112(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 112(%edx)
    vmovaps 128(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 128(%edx)
    vmovaps 144(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm1
    vaddps  %xmm1, %xmm3, %xmm1
    vmovaps %xmm1, 144(%edx)
    vmovaps 192(%edx), %xmm2
    vsubps  %xmm1, %xmm2, %xmm1
    vaddps  %xmm1, %xmm0, %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  208(%edx), %xmm0, %xmm0
    vmulps  224(%edx), %xmm2, %xmm2
    movl    4(%esp), %eax
    vmulps  240(%edx), %xmm1, %xmm1
    vmovaps 176(%edx), %xmm3
    vmovaps %xmm3, 192(%edx)
    vmovaps 160(%edx), %xmm3
    vmovaps %xmm3, 176(%edx)
    vmovaps (%ecx), %xmm3
    vmovaps %xmm3, 160(%edx)
    vaddps  %xmm2, %xmm0, %xmm0
    vaddps  %xmm0, %xmm1, %xmm0
    vmovaps %xmm0, (%eax)
    retl
Lfunc_end0:
尽管指令的数量相同,但第1版的速度仍然快了约50%


我试图找出问题的原因,但没有成功。在第二个MSVC程序集中有一些可疑的东西,比如丑陋的
vmovdqu
指令。构造、复制赋值运算符和pass by reference也可能不必要地将数据从SSE寄存器移回内存,但是我所有解决或准确识别问题的尝试都没有成功

我真的不认为这样一个简单的包装器不能达到与裸的
\uuum128
相同的性能,不管是什么原因造成的开销都可以消除


这是怎么回事?

事实证明,问题不在于用户定义的
结构Vec4
。 它与x86调用约定密切相关

VisualC++中默认的x86调用约定是<>代码> 按相反顺序(从右到左)在堆栈上推送参数

现在这是一个问题,因为
Vec4
应该保存并在XMM寄存器中传递。但让我们看看到底发生了什么


第一例 在第一种情况下,
Vec4
是一个简单的类型别名
\uuu m128

using Vec4 = __m128;
/* ... */
Vec4 TestEQ(EQSTATE* es, Vec4 &sample) { ... }
汇编中生成的TestEQ的
TestEQ
函数头为

?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z PROC      ; TestEQ, COMDAT
; _es$ = ecx
; _sample$ = edx
...
很好


第2例 在第二种情况下,
Vec4
不是
\uuu m128
的别名,它现在是用户定义的类型

这里我研究x86和x64平台的编译

x86(32位编译)

由于
\uuu cdecl
(这是x86中的默认调用约定)不允许将对齐的值传递给函数(这将发出
错误C2719:“sample”:请求对齐的16个形式参数将不会对齐
),因此我们通过
常量
引用传递它

struct Vec4{ __m128 simd; /* ... */ };
/* ... */
Vec4 TestEQ(EQSTATE* es, const Vec4 &sample) { ... }
它为
TestEQ
生成函数头,如下所示

?TestEQ@@YA?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC        ; TestEQ, COMDAT
; ___$ReturnUdt$ = ecx
; _es$ = edx
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -8                 ; fffffff8H
    add esp, 4
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov eax, DWORD PTR _sample$[ebx]
    ...
?TestEQ@@YQ?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC        ; TestEQ, COMDAT
; _es$ = ecx
; _sample$ = edx
...
这不像第一种情况那么简单。参数被移动到堆栈中。在前几个SSE指令之间还有一些附加的
mov
指令,此处未列出。总的来说,这些指令足以在一定程度上影响性能

x64(64位编译)

x64中的Windows使用不同的调用约定作为x64应用程序二进制接口(ABI)的一部分

如果可能的话,这个约定尝试将数据保存在寄存器中,就像浮点数据保存在XMM寄存器中一样

发件人:

x64应用程序二进制接口(ABI)是一个4寄存器快速调用 调用约定,为这些寄存器提供堆栈支持。有一个 函数中参数之间严格的一对一对应关系,以及 这些参数的寄存器。任何不符合8的论点 字节,或不是1、2、4或8字节,必须通过引用传递。 (...) 所有浮点操作都使用16个XMM寄存器完成。 参数在寄存器RCX、RDX、R8和R9中传递。如果争论是 float/double,它们在XMM0L、XMM1L、XMM2L和XMM3L中传递。16 字节参数通过引用传递

struct Vec4{ __m128 simd; /* ... */ };
/* ... */
Vec4 TestEQ(EQSTATE* es, const Vec4 &sample) { ... }

Windows上遵循Microsoft x64调用约定 和预引导UEFI(用于x86-64上的长模式)。它使用寄存器RCX, 前四个整数或指针参数的RDX、R8、R9 顺序),XMM0、XMM1、XMM2、XMM3用于浮点 论据。其他参数被推送到堆栈上(右键) 左)。整数返回值(类似于x86)以RAX形式返回,如果 64位或更少。浮点返回值在XMM0中返回

因此,x64模式下的第二种情况为
TestEQ
as生成函数头

?TestEQ@@YA?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC        ; TestEQ, COMDAT
; ___$ReturnUdt$ = ecx
; _es$ = edx
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -8                 ; fffffff8H
    add esp, 4
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov eax, DWORD PTR _sample$[ebx]
    ...
?TestEQ@@YQ?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC        ; TestEQ, COMDAT
; _es$ = ecx
; _sample$ = edx
...
这与第一个案例完全相同


解决方案 对于x86模式,所呈现的行为应该是明确固定的

最简单的解决方案是
内联
函数。 虽然这只是一个提示,编译器可以完全忽略,但您可以告诉编译器始终内联函数。然而,有时由于函数大小或任何其他原因,这是不需要的

幸运的是,Microsoft在Visual Studio 2013及更高版本中引入了
\uuu vectorcall
约定(可在x86和x64模式下使用)。这与默认的Windows x64调用约定非常相似,但具有更多可利用的寄存器

让我们用
\uu vectorcall
重写第二个案例:

Vec4 __vectorcall TestEQ(EQSTATE* es, const Vec4 &sample) { ... }
现在,
TestEQ
生成的汇编函数头是

?TestEQ@@YQ?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC        ; TestEQ, COMDAT
; _es$ = ecx
; _sample$ = edx
...
最终与x64中的第一种情况和第二种情况相同

正如Peter Cordes指出的,为了充分利用
\uu vectorcall
Vec4
参数