C++ 循环展开和SSE——铿锵vs gcc
免责声明:可以找到完整的代码 16字节对齐 提供一个相当简单的类型来支持正确的SSE对齐C++ 循环展开和SSE——铿锵vs gcc,c++,gcc,assembly,clang,simd,C++,Gcc,Assembly,Clang,Simd,免责声明:可以找到完整的代码 16字节对齐 提供一个相当简单的类型来支持正确的SSE对齐 struct alignas(16) simd_pack { std::int32_t data[4]; }; 以及一个将两个数组相加的函数 void add_packed(simd_pack* lhs_and_result, simd_pack* rhs, std::size_t size) { for (std::size_t i = 0; i < size; i++)
struct alignas(16) simd_pack
{
std::int32_t data[4];
};
以及一个将两个数组相加的函数
void add_packed(simd_pack* lhs_and_result, simd_pack* rhs, std::size_t size)
{
for (std::size_t i = 0; i < size; i++)
for (std::size_t j = 0; j < 4; j++)
lhs_and_result[i].data[j] += rhs[i].data[j];
}
我不是很精通汇编,但对我来说,它看起来就像是简单地展开内部for循环。如果我们看看gcc,我们会得到:
add_packed(simd_pack*, simd_pack*, unsigned long):
test rdx, rdx
je .L1
sal rdx, 4
xor eax, eax
.L3:
movdqa xmm0, XMMWORD PTR [rdi+rax]
paddd xmm0, XMMWORD PTR [rsi+rax]
movaps XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rdx
jne .L3
.L1:
ret
这正是我所期望的
64字节对齐
如果我们使用64字节对齐方式(如果我没有弄错的话,通常是缓存线),差异会更大(显然)
gcc使用SSE并展开:
add_cache_line(cache_line*, cache_line*, unsigned long):
mov rcx, rdx
test rdx, rdx
je .L9
sal rcx, 6
mov rax, rdi
mov rdx, rsi
add rcx, rdi
.L11:
movdqa xmm2, XMMWORD PTR [rdx+16]
movdqa xmm3, XMMWORD PTR [rax]
add rax, 64
add rdx, 64
movdqa xmm1, XMMWORD PTR [rdx-32]
movdqa xmm0, XMMWORD PTR [rdx-16]
paddd xmm3, XMMWORD PTR [rdx-64]
paddd xmm2, XMMWORD PTR [rax-48]
paddd xmm1, XMMWORD PTR [rax-32]
paddd xmm0, XMMWORD PTR [rax-16]
movaps XMMWORD PTR [rax-64], xmm3
movaps XMMWORD PTR [rax-48], xmm2
movaps XMMWORD PTR [rax-32], xmm1
movaps XMMWORD PTR [rax-16], xmm0
cmp rax, rcx
jne .L11
.L9:
ret
不对齐
如果我们使用完全没有对齐的普通32位整数数组,这会变得很有趣。我们使用完全相同的编译器标志
void add_unaligned(std::int32_t* lhs_and_result, std::int32_t* rhs, std::size_t size)
{
for (std::size_t i = 0; i < size; i++)
lhs_and_result[i] += rhs[i];
}
在.LBB2_4
和.LBB2_6
发生了什么?看起来它再次展开了一个循环,但我不确定那里发生了什么(主要是因为使用了寄存器)
在.LBB2_12
中,它甚至展开SSE部件。我认为它只是展开了两倍,因为它需要两个SIMD寄存器来加载每个操作数,因为它们现在没有对齐.LBB2_14
包含未展开的SSE零件
这里的控制流程如何?我想应该是:
xmm0..3
)xmm0
(本例中为4个整数),则执行一次add_unaligned(int*, int*, unsigned long):
test rdx, rdx
je .L16
lea rcx, [rsi+4]
mov rax, rdi
sub rax, rcx
cmp rax, 8
jbe .L22
lea rax, [rdx-1]
cmp rax, 2
jbe .L22
mov rcx, rdx
xor eax, eax
shr rcx, 2
sal rcx, 4
.L19:
movdqu xmm0, XMMWORD PTR [rdi+rax]
movdqu xmm1, XMMWORD PTR [rsi+rax]
paddd xmm0, xmm1
movups XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rcx
jne .L19
mov rax, rdx
and rax, -4
test dl, 3
je .L16
mov ecx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], ecx
lea rcx, [rax+1]
cmp rdx, rcx
jbe .L16
add rax, 2
mov r8d, DWORD PTR [rsi+rcx*4]
add DWORD PTR [rdi+rcx*4], r8d
cmp rdx, rax
jbe .L16
mov edx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], edx
ret
.L22:
xor eax, eax
.L18:
mov ecx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], ecx
add rax, 1
cmp rdx, rax
jne .L18
.L16:
ret
我假设控制流类似于叮当声
xmm0
和xmm1
.L19
中发生的,但是.L18
在做什么呢
总结
是完整的代码,包括程序集。我的问题是:
.LBB2_4
和.LBB2_6
在做什么.L18
在做什么4) 这是长度小于8的情况下的代码,因此SSE路径不值得。无论是否自动矢量化,CLANG始终展开小循环。GCC不会,除非它知道它们来自PGO(
-fprofile use
)。GCC在围绕微小的矢量化内部循环完全展开标量介绍/清理上花费了大量的代码占用空间。:/对于struct案例,似乎存在重叠的可能性;尝试使用foo*\uu restrict p
2)那些似乎是展开的案例,以显示区域overlap@Jester为什么会有一个门槛?为什么是8?与简单指令相比,SSE指令的成本更高吗?@Timo:是的,这是一个遗漏的优化;这种重叠是不可能的。simd_pack
中遗漏的其他优化包括仍然只使用128位向量和-march=skylake
(不是256位AVX2),因此我想clang很难将这些结构平坦化。
add_cache_line(cache_line*, cache_line*, unsigned long):
mov rcx, rdx
test rdx, rdx
je .L9
sal rcx, 6
mov rax, rdi
mov rdx, rsi
add rcx, rdi
.L11:
movdqa xmm2, XMMWORD PTR [rdx+16]
movdqa xmm3, XMMWORD PTR [rax]
add rax, 64
add rdx, 64
movdqa xmm1, XMMWORD PTR [rdx-32]
movdqa xmm0, XMMWORD PTR [rdx-16]
paddd xmm3, XMMWORD PTR [rdx-64]
paddd xmm2, XMMWORD PTR [rax-48]
paddd xmm1, XMMWORD PTR [rax-32]
paddd xmm0, XMMWORD PTR [rax-16]
movaps XMMWORD PTR [rax-64], xmm3
movaps XMMWORD PTR [rax-48], xmm2
movaps XMMWORD PTR [rax-32], xmm1
movaps XMMWORD PTR [rax-16], xmm0
cmp rax, rcx
jne .L11
.L9:
ret
void add_unaligned(std::int32_t* lhs_and_result, std::int32_t* rhs, std::size_t size)
{
for (std::size_t i = 0; i < size; i++)
lhs_and_result[i] += rhs[i];
}
add_unaligned(int*, int*, unsigned long): # @add_unaligned(int*, int*, unsigned long)
test rdx, rdx
je .LBB2_16
cmp rdx, 7
jbe .LBB2_2
lea rax, [rsi + 4*rdx]
cmp rax, rdi
jbe .LBB2_9
lea rax, [rdi + 4*rdx]
cmp rax, rsi
jbe .LBB2_9
.LBB2_2:
xor r10d, r10d
.LBB2_3:
mov r8, r10
not r8
add r8, rdx
mov rcx, rdx
and rcx, 3
je .LBB2_5
.LBB2_4: # =>This Inner Loop Header: Depth=1
mov eax, dword ptr [rsi + 4*r10]
add dword ptr [rdi + 4*r10], eax
add r10, 1
add rcx, -1
jne .LBB2_4
.LBB2_5:
cmp r8, 3
jb .LBB2_16
.LBB2_6: # =>This Inner Loop Header: Depth=1
mov eax, dword ptr [rsi + 4*r10]
add dword ptr [rdi + 4*r10], eax
mov eax, dword ptr [rsi + 4*r10 + 4]
add dword ptr [rdi + 4*r10 + 4], eax
mov eax, dword ptr [rsi + 4*r10 + 8]
add dword ptr [rdi + 4*r10 + 8], eax
mov eax, dword ptr [rsi + 4*r10 + 12]
add dword ptr [rdi + 4*r10 + 12], eax
add r10, 4
cmp rdx, r10
jne .LBB2_6
jmp .LBB2_16
.LBB2_9:
mov r10, rdx
and r10, -8
lea rax, [r10 - 8]
mov r9, rax
shr r9, 3
add r9, 1
mov r8d, r9d
and r8d, 1
test rax, rax
je .LBB2_10
sub r9, r8
xor ecx, ecx
.LBB2_12: # =>This Inner Loop Header: Depth=1
movdqu xmm0, xmmword ptr [rsi + 4*rcx]
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 16]
movdqu xmm2, xmmword ptr [rdi + 4*rcx]
paddd xmm2, xmm0
movdqu xmm0, xmmword ptr [rdi + 4*rcx + 16]
paddd xmm0, xmm1
movdqu xmm1, xmmword ptr [rdi + 4*rcx + 32]
movdqu xmm3, xmmword ptr [rdi + 4*rcx + 48]
movdqu xmmword ptr [rdi + 4*rcx], xmm2
movdqu xmmword ptr [rdi + 4*rcx + 16], xmm0
movdqu xmm0, xmmword ptr [rsi + 4*rcx + 32]
paddd xmm0, xmm1
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 48]
paddd xmm1, xmm3
movdqu xmmword ptr [rdi + 4*rcx + 32], xmm0
movdqu xmmword ptr [rdi + 4*rcx + 48], xmm1
add rcx, 16
add r9, -2
jne .LBB2_12
test r8, r8
je .LBB2_15
.LBB2_14:
movdqu xmm0, xmmword ptr [rsi + 4*rcx]
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 16]
movdqu xmm2, xmmword ptr [rdi + 4*rcx]
paddd xmm2, xmm0
movdqu xmm0, xmmword ptr [rdi + 4*rcx + 16]
paddd xmm0, xmm1
movdqu xmmword ptr [rdi + 4*rcx], xmm2
movdqu xmmword ptr [rdi + 4*rcx + 16], xmm0
.LBB2_15:
cmp r10, rdx
jne .LBB2_3
.LBB2_16:
ret
.LBB2_10:
xor ecx, ecx
test r8, r8
jne .LBB2_14
jmp .LBB2_15
add_unaligned(int*, int*, unsigned long):
test rdx, rdx
je .L16
lea rcx, [rsi+4]
mov rax, rdi
sub rax, rcx
cmp rax, 8
jbe .L22
lea rax, [rdx-1]
cmp rax, 2
jbe .L22
mov rcx, rdx
xor eax, eax
shr rcx, 2
sal rcx, 4
.L19:
movdqu xmm0, XMMWORD PTR [rdi+rax]
movdqu xmm1, XMMWORD PTR [rsi+rax]
paddd xmm0, xmm1
movups XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rcx
jne .L19
mov rax, rdx
and rax, -4
test dl, 3
je .L16
mov ecx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], ecx
lea rcx, [rax+1]
cmp rdx, rcx
jbe .L16
add rax, 2
mov r8d, DWORD PTR [rsi+rcx*4]
add DWORD PTR [rdi+rcx*4], r8d
cmp rdx, rax
jbe .L16
mov edx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], edx
ret
.L22:
xor eax, eax
.L18:
mov ecx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], ecx
add rax, 1
cmp rdx, rax
jne .L18
.L16:
ret