C++: What is the fastest way to convert a large C array of char8 to short16?

My raw data is a set of (unsigned) char (8-bit) C arrays, each of length > 1000000. I want to add them together (vector addition), following the rule in the code below. The result: a C array of (unsigned) short (16-bit).

I have read through all of SSE and AVX/AVX2, but the only similar call I found multiplies two 256-bit registers: four of the 32-bit elements are multiplied, and the result of each 32-bit pair is a 64-bit value that fits into the 256-bit register (_mm256_mul_epi32, _mm256_mul_epu32).

Example code:

static inline void adder(uint16_t *canvas, uint8_t *addon, uint64_t count)
{
    for (uint64_t i=0; i<count; i++)
        canvas[i] += static_cast<uint16_t>(addon[i]);
}
The comments are indeed correct: the compiler can do the vectorization for you. I modified your code a bit to improve the auto-vectorization. With gcc -O3 -march=haswell -std=c++14 (gcc version 8.2), the following code:

#include <cstdint>
#include <immintrin.h>

void cvt_uint8_int16(uint16_t * __restrict__ canvas, uint8_t * __restrict__ addon, int64_t count) {
    int64_t i;
    /* If you know that count is always a multiple of 32, masking it  */
    /* explicitly, as below, tells the compiler so and leads to       */
    /* cleaner code (no scalar tail loop). Assume a multiple of 32:   */
    count = count & 0xFFFFFFFFFFFFFFE0u;
    for (i = 0; i < count; i++){
        canvas[i] += static_cast<uint16_t>(addon[i]);
    }
}
Clang compiles this somewhat differently: it loads 128-bit (char) vectors and converts them with vpmovzxbw. gcc loads 256-bit (char) vectors and converts the lower and upper 128 bits separately, which may be slightly less efficient. In practice, however, your problem is probably bandwidth bound anyway (since the length is > 1000000).
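(An illustration, not from the original answer: in intrinsics terms, gcc's two-step widening of a 256-bit load looks roughly like the sketch below; src is a made-up pointer to the uint8_t data.)

__m256i bytes = _mm256_loadu_si256((__m256i const*)src);                  // load 32 uint8_t
__m256i lo16  = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(bytes));      // low  16 bytes -> 16 x uint16_t
__m256i hi16  = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(bytes, 1)); // high 16 bytes -> 16 x uint16_t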

The code can also be vectorized with intrinsics (not tested):

void cvt_uint8_int16_with_intrinsic(uint16_t * __restrict__ canvas, uint8_t * __restrict__ addon, int64_t count) {
    int64_t i;
    /* Assume that count is a multiple of 16 */
    for (i = 0; i < count; i += 16){
        __m128i x     = _mm_loadu_si128((__m128i*)&addon[i]);     /* load 16 x uint8_t            */
        __m256i y     = _mm256_loadu_si256((__m256i*)&canvas[i]); /* load 16 x uint16_t           */
        __m256i x_16  = _mm256_cvtepu8_epi16(x);                  /* zero-extend to 16 x uint16_t */
        __m256i y_new = _mm256_add_epi16(y, x_16);
        _mm256_storeu_si256((__m256i*)&canvas[i], y_new);
    }
}
This leads to code similar to the auto-vectorized code.
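(Not from the original answers: a minimal harness to sanity-check the intrinsics version against a scalar loop, assuming the functions above live in the same file; the buffer size and fill values are arbitrary.)

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    const int64_t n = 1 << 20;                    // a multiple of 16, as the function assumes
    std::vector<uint8_t>  addon(n);
    std::vector<uint16_t> a(n), b(n);
    for (int64_t i = 0; i < n; i++) { addon[i] = (uint8_t)i; a[i] = b[i] = (uint16_t)(i * 7); }
    for (int64_t i = 0; i < n; i++) a[i] += (uint16_t)addon[i];   // scalar reference
    cvt_uint8_int16_with_intrinsic(b.data(), addon.data(), n);    // SIMD version under test
    std::printf("%s\n", std::memcmp(a.data(), b.data(), n * sizeof(uint16_t)) == 0 ? "match" : "MISMATCH");
    return 0;
}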

Adding to @wim's answer (which is a good answer), and taking @Bathsheba's comment into account: it is worth both trusting the compiler and examining the compiler's output, so that you learn how this is done and can check that it is doing what you want. Running a slightly modified version of the code through msvc, gcc and clang gives some imperfect results.

This is especially true if you limit yourself to SSE2 and below (which is what this answer assumes, and what I tested).

All three compilers vectorize and unroll the code, using punpcklbw to unpack the uint8_t values into uint16_t, followed by a SIMD add and store. This is good. However, MSVC tends to spill unnecessarily in the inner loop, and clang only uses punpcklbw, not punpckhbw, which means it loads the source data twice. gcc gets the SIMD part right but has higher overhead for the loop constraints.
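(An illustrative sketch, not from the original answer: the unpack-with-zero idiom that these instructions implement; input and src are made-up names.)

__m128i input = _mm_loadu_si128((__m128i const*)src); // 16 uint8_t
__m128i zero  = _mm_setzero_si128();
__m128i lo    = _mm_unpacklo_epi8(input, zero);       // punpcklbw: bytes 0..7  -> 8 x uint16_t
__m128i hi    = _mm_unpackhi_epi8(input, zero);       // punpckhbw: bytes 8..15 -> 8 x uint16_t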

So, in theory, if you wanted to improve on these versions, you could roll your own using intrinsics; it would look something like this:

static inline void adder2(uint16_t *canvas, uint8_t *addon, uint64_t count)
{
    uint64_t count32 = (count / 32) * 32;
    __m128i zero = _mm_set_epi32(0, 0, 0, 0);
    uint64_t i = 0;
    for (; i < count32; i+= 32)
    {
        uint8_t* addonAddress = (addon + i);

        // Load data 32 bytes at a time and widen the input
        // to `uint16_t`s into 4 temp xmm registers.
        __m128i input = _mm_loadu_si128((__m128i*)(addonAddress + 0));
        __m128i temp1 = _mm_unpacklo_epi8(input, zero);
        __m128i temp2 = _mm_unpackhi_epi8(input, zero);
        __m128i input2 = _mm_loadu_si128((__m128i*)(addonAddress + 16));
        __m128i temp3 = _mm_unpacklo_epi8(input2, zero);
        __m128i temp4 = _mm_unpackhi_epi8(input2, zero);

        // Load data we need to update
        uint16_t* canvasAddress = (canvas + i);
        __m128i canvas1 = _mm_loadu_si128((__m128i*)(canvasAddress + 0));
        __m128i canvas2 = _mm_loadu_si128((__m128i*)(canvasAddress + 8));
        __m128i canvas3 = _mm_loadu_si128((__m128i*)(canvasAddress + 16));
        __m128i canvas4 = _mm_loadu_si128((__m128i*)(canvasAddress + 24));

        // Update the values
        __m128i output1 = _mm_add_epi16(canvas1, temp1);
        __m128i output2 = _mm_add_epi16(canvas2, temp2);
        __m128i output3 = _mm_add_epi16(canvas3, temp3);
        __m128i output4 = _mm_add_epi16(canvas4, temp4);

        // Store the values
        _mm_storeu_si128((__m128i*)(canvasAddress + 0), output1);
        _mm_storeu_si128((__m128i*)(canvasAddress + 8), output2);
        _mm_storeu_si128((__m128i*)(canvasAddress + 16), output3);
        _mm_storeu_si128((__m128i*)(canvasAddress + 24), output4);
    }

    // Mop up
    for (; i<count; i++)
        canvas[i] += static_cast<uint16_t>(addon[i]);
}
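(A side note, not from the original answer: if you can guarantee that both buffers are 16-byte aligned, the unaligned load/store intrinsics could be swapped for their aligned counterparts, as sketched below, though on recent CPUs this makes little difference when the data happens to be aligned anyway.)

__m128i canvas1 = _mm_load_si128((__m128i*)(canvasAddress + 0)); // aligned load (faults if misaligned)
_mm_store_si128((__m128i*)(canvasAddress + 0), output1);         // aligned store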
Departing from the manually optimized approaches in the great answers of wim and Mike, let's have a quick look at what a completely vanilla C++ implementation would give us:

std::transform(addon, addon + count, canvas, canvas, std::plus<void>());
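(For completeness, and not from the original answer: a self-contained version with the headers it needs; the function name add_into is made up.)

#include <algorithm>
#include <cstdint>
#include <functional>

void add_into(std::uint16_t* canvas, const std::uint8_t* addon, std::size_t count)
{
    std::transform(addon, addon + count, canvas, canvas, std::plus<>{});
}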
Note, however, that the generic C++ solution works for any combination of container and element types, as long as the element types can be added. So, as pointed out in the other answers, while you can certainly get a slightly more efficient implementation from manual optimization, you can go a long way by just writing plain C++ code (if done right). Before resorting to manually writing SSE intrinsics, consider that a generic C++ solution is more flexible, easier to maintain and, especially, more portable. By a simple flip of the target architecture switch, you can have it produce code of similar quality not only for SSE, but for AVX, or even ARM with NEON, and whatever other instruction sets you may happen to want to run on. If you need your code to be perfect down to the last instruction for one particular use case on one particular CPU, then yes, intrinsics or even inline assembly is probably the way to go. But in general, I would also suggest focusing on writing your code in a way that enables the compiler to generate the best code possible, rather than generating the best code yourself. For example, even the plain loop:
void f(uint16_t* canvas, const uint8_t* addon, size_t count)
{
    for (size_t i = 0; i < count; ++i)
        canvas[i] += addon[i];
}

is auto-vectorized by current compilers. And by combining the (non-standard, but widely supported) __restrict__ qualifier with wim's trick of telling the compiler that count is a multiple of 32:

void f(std::uint16_t* __restrict__ canvas, const std::uint8_t* __restrict__ addon, std::size_t count)
{
    assert(count % 32 == 0);
    count = count & -32;
    std::transform(addon, addon + count, canvas, canvas, std::plus<void>());
}

you get (here with clang) essentially the same inner loop as the hand-written intrinsics versions:
f(unsigned short*, unsigned char const*, unsigned long):    
        and     rdx, -32
        je      .LBB0_3
        xor     eax, eax
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        vpmovzxbw       xmm0, qword ptr [rsi + rax] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
        vpmovzxbw       xmm1, qword ptr [rsi + rax + 8] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
        vpmovzxbw       xmm2, qword ptr [rsi + rax + 16] # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
        vpmovzxbw       xmm3, qword ptr [rsi + rax + 24] # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
        vpaddw  xmm0, xmm0, xmmword ptr [rdi + 2*rax]
        vpaddw  xmm1, xmm1, xmmword ptr [rdi + 2*rax + 16]
        vpaddw  xmm2, xmm2, xmmword ptr [rdi + 2*rax + 32]
        vpaddw  xmm3, xmm3, xmmword ptr [rdi + 2*rax + 48]
        vmovdqu xmmword ptr [rdi + 2*rax], xmm0
        vmovdqu xmmword ptr [rdi + 2*rax + 16], xmm1
        vmovdqu xmmword ptr [rdi + 2*rax + 32], xmm2
        vmovdqu xmmword ptr [rdi + 2*rax + 48], xmm3
        add     rax, 32
        cmp     rdx, rax
        jne     .LBB0_2
.LBB0_3:
        ret
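(A reproduction hint, not from the original page: this listing is consistent with clang targeting x86-64 with AVX enabled; a command along these lines should reproduce it, e.g. on Compiler Explorer. The exact flags are an assumption.)

clang++ -std=c++17 -DNDEBUG -O3 -mavx -S f.cpp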