C++ 将char8的大型c数组转换为short16的最快方法是什么?
我的原始数据是一组长度 > 1000000 的（无符号）字符（8 位）C 数组。我想按照下面代码中的规则将它们相加（向量相加）。结果：（无符号）短整型（16 位）的 C 数组。我已经阅读了所有 SSE 和 AVX/AVX2 文档，但只找到一个类似的调用：将 2 个 256 位寄存器相乘——前 4 个 32 位将相乘，每对 32 位的结果是一个 64 位，放入 256 位寄存器（`_mm256_mul_epi32`、`_mm256_mul_epu32`）。标签：c++, c, intel, intrinsics。示例代码：
// Element-wise accumulate: canvas[i] += addon[i] for `count` elements,
// zero-extending each 8-bit addend to 16 bits before the add.
// Plain scalar form — a modern compiler can auto-vectorize this loop.
static inline void adder(uint16_t *canvas, uint8_t *addon, uint64_t count)
{
    uint64_t idx = 0;
    while (idx < count)
    {
        canvas[idx] += static_cast<uint16_t>(addon[idx]);
        ++idx;
    }
}
（上面 adder 代码的机器翻译重复，还原为：）
static inline void adder(uint16_t *canvas, uint8_t *addon, uint64_t count)
{
    for (uint64_t i = 0; i < count; i++)
        canvas[i] += static_cast<uint16_t>(addon[i]);
}
评论确实是正确的：编译器可以为您进行向量化。
我对你的代码做了一些修改,以改进自动矢量化。
对于gcc-O3-march=haswell-std=c++14(gcc版本8.2),以下代码:
#include <cstdint>
#include <immintrin.h>
/* Adds each 8-bit addon element into the corresponding 16-bit canvas element.
 * canvas/addon are __restrict__-qualified: the caller promises they do not
 * alias, which lets the compiler auto-vectorize the main loop.
 *
 * The main loop runs over `count` rounded DOWN to a multiple of 32 so the
 * vectorized loop needs no remainder handling (cleaner codegen); a scalar
 * tail loop then processes the remaining 0..31 elements.  The original
 * version only masked `count` and silently dropped those tail elements. */
void cvt_uint8_int16(uint16_t * __restrict__ canvas, uint8_t * __restrict__ addon, int64_t count) {
    int64_t i;
    /* Largest multiple of 32 that is <= count. */
    const int64_t count32 = count & ~int64_t(31);
    for (i = 0; i < count32; i++){
        canvas[i] += static_cast<uint16_t>(addon[i]);
    }
    /* Scalar tail: elements the rounded-down loop skipped. */
    for (; i < count; i++){
        canvas[i] += static_cast<uint16_t>(addon[i]);
    }
}
编译器Clang产生的结果有点不同:它加载128位(char)向量并使用vpmovzxbw
转换它们。
编译器gcc加载256位(char)向量并转换上下128位
另一方面,这可能会稍微降低效率。
然而,您的问题可能是带宽受限(因为长度>1000000)
还可以使用内部函数(未测试)对代码进行矢量化:
void cvt_uint8_int16_with_intrinsics(uint16_t * __restrict__ canvas, uint8_t * __restrict__ addon, int64_t count) {
    int64_t i;
    /* Assume count is a multiple of 16. */
    for (i = 0; i < count; i += 16) { /* …（原文代码在此处被网页抓取截断）… */ }
}
这导致类似于自动矢量化代码。添加到@wim answer上(这是一个很好的答案)考虑到@Bathsheba comment,信任编译器和检查编译器的输出都是值得的,这样既可以学习如何做,也可以检查它是否做了您想要的事情。通过运行稍微修改过的代码版本(对于msvc、gcc和clang),可以给出一些不完美的答案
如果您将自己限制在SSE2及以下(以及我测试的内容),这一点尤其正确。
所有编译器都对代码进行了矢量化和展开，并使用 punpcklbw 将 uint8_t 解包为 uint16_t，然后运行 SIMD 加法并保存。这很好。然而，MSVC 往往会在内部循环中不必要地溢出（spill）寄存器；而 clang 只使用 punpcklbw 而不使用 punpckhbw，这意味着源数据被加载了两次。GCC 正确地生成了 SIMD 部分，但循环控制的开销较高。
因此,从理论上讲,如果您想改进这些版本,您可以使用intrinsic来实现自己的版本,它看起来像:
// SSE2 version of `adder`: canvas[i] += addon[i], zero-extending 8 -> 16 bits.
// Processes 32 elements per iteration (two 16-byte loads, four widened
// 8x16-bit vectors), then a scalar loop mops up the remaining 0..31 elements.
static inline void adder2(uint16_t *canvas, uint8_t *addon, uint64_t count)
{
    uint64_t count32 = (count / 32) * 32;  // largest multiple of 32 <= count
    __m128i zero = _mm_setzero_si128();    // idiomatic form of _mm_set_epi32(0,0,0,0)
    uint64_t i = 0;
    for (; i < count32; i+= 32)
    {
        uint8_t* addonAddress = (addon + i);
        // Load 32 source bytes and zero-extend them to `uint16_t`s
        // in 4 temporary xmm registers (interleave with zero).
        __m128i input = _mm_loadu_si128((__m128i*)(addonAddress + 0));
        __m128i temp1 = _mm_unpacklo_epi8(input, zero);
        __m128i temp2 = _mm_unpackhi_epi8(input, zero);
        __m128i input2 = _mm_loadu_si128((__m128i*)(addonAddress + 16));
        __m128i temp3 = _mm_unpacklo_epi8(input2, zero);
        __m128i temp4 = _mm_unpackhi_epi8(input2, zero);
        // Load the 32 canvas elements we need to update.
        uint16_t* canvasAddress = (canvas + i);
        __m128i canvas1 = _mm_loadu_si128((__m128i*)(canvasAddress + 0));
        __m128i canvas2 = _mm_loadu_si128((__m128i*)(canvasAddress + 8));
        __m128i canvas3 = _mm_loadu_si128((__m128i*)(canvasAddress + 16));
        __m128i canvas4 = _mm_loadu_si128((__m128i*)(canvasAddress + 24));
        // Add the widened addends; 16-bit lanes wrap on overflow,
        // matching the scalar `uint16_t` arithmetic.
        __m128i output1 = _mm_add_epi16(canvas1, temp1);
        __m128i output2 = _mm_add_epi16(canvas2, temp2);
        __m128i output3 = _mm_add_epi16(canvas3, temp3);
        __m128i output4 = _mm_add_epi16(canvas4, temp4);
        // Store the results back.
        _mm_storeu_si128((__m128i*)(canvasAddress + 0), output1);
        _mm_storeu_si128((__m128i*)(canvasAddress + 8), output2);
        _mm_storeu_si128((__m128i*)(canvasAddress + 16), output3);
        _mm_storeu_si128((__m128i*)(canvasAddress + 24), output4);
    }
    // Scalar mop-up for the remaining 0..31 elements.
    for (; i<count; i++)
        canvas[i] += static_cast<uint16_t>(addon[i]);
}
（上面 adder2 代码的机器翻译重复，签名为
static inline void adder2(uint16_t *canvas, uint8_t *addon, uint64_t count)，
其余部分在网页抓取时被截断；完整代码见下文。）
与 wim 和 Mike 的精彩答案中的手动优化方法不同，让我们快速看一下完全“香草”的 C++ 实现能给我们什么：
// Vanilla C++ one-liner: canvas[i] = addon[i] + canvas[i] over [addon, addon+count).
std::transform(addon, addon + count, canvas, canvas, std::plus<void>());
但是，这个通用的 C++ 解决方案可以在不同类型容器和元素类型的任意组合上工作，只要元素类型可以相加。正如其他答案中指出的，手动优化当然可以得到更高效的实现，但只要正确地编写普通的 C++ 代码，就已经可以走很长的路。在手工编写 SSE 内联函数之前，请考虑：通用的 C++ 解决方案更灵活、更容易维护，尤其是更可移植——只需切换目标体系结构开关，它就可以为 SSE、AVX、甚至带 NEON 的 ARM 以及任何其他你想要运行的指令集生成质量相近的代码。如果您需要代码针对某个特定 CPU 上的某个特定用例精确到最后一条指令，那么是的，内联函数甚至内联汇编可能是合适的方法。但一般来说，我也建议把重点放在编写清晰、可移植的 C++ 代码上。
// NOTE(review): duplicate listing of adder2 from earlier on this page.
// Adds addon[i] (zero-extended 8 -> 16 bits) into canvas[i] using SSE2,
// 32 elements per iteration, with a scalar mop-up loop for the tail.
static inline void adder2(uint16_t *canvas, uint8_t *addon, uint64_t count)
{
uint64_t count32 = (count / 32) * 32;
__m128i zero = _mm_set_epi32(0, 0, 0, 0);
uint64_t i = 0;
for (; i < count32; i+= 32)
{
uint8_t* addonAddress = (addon + i);
// Load 32 source bytes at a time and zero-extend the input
// to `uint16_t`s in 4 temporary xmm registers.
__m128i input = _mm_loadu_si128((__m128i*)(addonAddress + 0));
__m128i temp1 = _mm_unpacklo_epi8(input, zero);
__m128i temp2 = _mm_unpackhi_epi8(input, zero);
__m128i input2 = _mm_loadu_si128((__m128i*)(addonAddress + 16));
__m128i temp3 = _mm_unpacklo_epi8(input2, zero);
__m128i temp4 = _mm_unpackhi_epi8(input2, zero);
// Load the 32 canvas elements we need to update.
uint16_t* canvasAddress = (canvas + i);
__m128i canvas1 = _mm_loadu_si128((__m128i*)(canvasAddress + 0));
__m128i canvas2 = _mm_loadu_si128((__m128i*)(canvasAddress + 8));
__m128i canvas3 = _mm_loadu_si128((__m128i*)(canvasAddress + 16));
__m128i canvas4 = _mm_loadu_si128((__m128i*)(canvasAddress + 24));
// Add the widened addends (16-bit lanes wrap, matching scalar uint16_t math).
__m128i output1 = _mm_add_epi16(canvas1, temp1);
__m128i output2 = _mm_add_epi16(canvas2, temp2);
__m128i output3 = _mm_add_epi16(canvas3, temp3);
__m128i output4 = _mm_add_epi16(canvas4, temp4);
// Store the results back.
_mm_storeu_si128((__m128i*)(canvasAddress + 0), output1);
_mm_storeu_si128((__m128i*)(canvasAddress + 8), output2);
_mm_storeu_si128((__m128i*)(canvasAddress + 16), output3);
_mm_storeu_si128((__m128i*)(canvasAddress + 24), output4);
}
// Scalar mop-up for the remaining 0..31 elements.
for (; i<count; i++)
canvas[i] += static_cast<uint16_t>(addon[i]);
}
// NOTE(review): duplicate of the std::transform one-liner shown earlier on this page.
std::transform(addon, addon + count, canvas, canvas, std::plus<void>());
// Plain scalar reference version: canvas[i] += addon[i] for every element.
// Written without manual vectorization; the compiler may vectorize it.
void f(uint16_t* canvas, const uint8_t* addon, size_t count)
{
    size_t remaining = count;
    while (remaining != 0)
    {
        const size_t idx = count - remaining;
        canvas[idx] = static_cast<uint16_t>(canvas[idx] + addon[idx]);
        --remaining;
    }
}
// Restrict-qualified variant: the __restrict__ qualifiers promise the compiler
// that canvas and addon never alias, enabling clean auto-vectorization of the
// std::transform call (canvas is both second input and output).
// NOTE(review): `count & -32` rounds count DOWN to a multiple of 32, so when
// the assert is compiled out (NDEBUG) the final count % 32 elements are
// silently skipped; the assert documents the intended precondition.
void f(std::uint16_t* __restrict__ canvas, const std::uint8_t* __restrict__ addon, std::size_t count)
{
assert(count % 32 == 0);
count = count & -32;
std::transform(addon, addon + count, canvas, canvas, std::plus<void>());
}
f(unsigned short*, unsigned char const*, unsigned long):
and rdx, -32
je .LBB0_3
xor eax, eax
.LBB0_2: # =>This Inner Loop Header: Depth=1
vpmovzxbw xmm0, qword ptr [rsi + rax] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpmovzxbw xmm1, qword ptr [rsi + rax + 8] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpmovzxbw xmm2, qword ptr [rsi + rax + 16] # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpmovzxbw xmm3, qword ptr [rsi + rax + 24] # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpaddw xmm0, xmm0, xmmword ptr [rdi + 2*rax]
vpaddw xmm1, xmm1, xmmword ptr [rdi + 2*rax + 16]
vpaddw xmm2, xmm2, xmmword ptr [rdi + 2*rax + 32]
vpaddw xmm3, xmm3, xmmword ptr [rdi + 2*rax + 48]
vmovdqu xmmword ptr [rdi + 2*rax], xmm0
vmovdqu xmmword ptr [rdi + 2*rax + 16], xmm1
vmovdqu xmmword ptr [rdi + 2*rax + 32], xmm2
vmovdqu xmmword ptr [rdi + 2*rax + 48], xmm3
add rax, 32
cmp rdx, rax
jne .LBB0_2
.LBB0_3:
ret