C++ 可以使用SIMD优化两个字符串之间的字节匹配计数吗？_C++_Optimization_X86 64_Sse_Simd

C++ 可以使用SIMD优化两个字符串之间的字节匹配计数吗？

c++ optimization

C++ 可以使用SIMD优化两个字符串之间的字节匹配计数吗？,c++,optimization,x86-64,sse,simd,C++,Optimization,X86 64,Sse,Simd,分析表明，此函数对于我的应用程序来说是一个真正的瓶颈： static inline int countEqualChars(const char* string1, const char* string2, int size) { int r = 0; for (int j = 0; j < size; ++j) { if (string1[j] == string2[j]) { ++r; } } r

分析表明，此函数对于我的应用程序来说是一个真正的瓶颈：

/// Counts the positions at which two character buffers hold the same byte.
///
/// @param string1  First buffer; at least `size` chars are read from it.
/// @param string2  Second buffer; at least `size` chars are read from it.
/// @param size     Number of positions to compare; nothing is read when it is <= 0.
/// @return         How many of the first `size` positions compare equal.
static inline int countEqualChars(const char* string1, const char* string2, int size) {
    int matches = 0;
    const char* lhs = string1;
    const char* rhs = string2;

    // Walk both buffers in lock-step, one position per remaining element.
    for (int remaining = size; remaining > 0; --remaining) {
        if (*lhs++ == *rhs++) {
            ++matches;
        }
    }

    return matches;
}

静态内联int countEqualChars（常量char*string1，常量char*string2，int size）{
int r=0；
对于（int j=0；j


即使使用-O3
和-march=native
，G++4.7.2也不会对该函数进行矢量化（我检查了汇编输出）。我不是SSE及相关指令集的专家，但我认为同时比较多个字符应该更快。关于如何加快速度有什么想法吗？目标体系结构是x86-64。
当然可以
pcmpeqb
比较两个16字节的向量，并生成一个向量，在它们不同的地方为零，在它们匹配的地方为-1。使用此选项一次比较16个字节，将结果添加到累加器向量中（确保累计最多255个向量比较的结果，以避免溢出）。完成后，累加器中有16个结果。求和并求反得到相等元素的数目
如果长度很短，则很难从这种方法中获得显著的加速。如果长度很长，那么就值得追求。
目前让gcc自动矢量化，关键在于帮助编译器识别出易于矢量化的代码。在您的情况下：如果去掉条件分支，并以更命令式的方式重写代码，编译器就能理解矢量化请求：
    /// Branch-free variant of the byte-match counter: the comparison result is
    /// added to the total directly instead of guarding an increment.
    ///
    /// @param string1  First buffer; at least `size` chars are read from it.
    /// @param string2  Second buffer; at least `size` chars are read from it.
    /// @param size     Number of positions to compare; nothing is read when it is <= 0.
    /// @return         How many of the first `size` positions compare equal.
    static inline int count(const char* string1, const char* string2, int size) {
            int matched = 0;
            int pos = 0;

            while (pos < size) {
                    // A true comparison contributes 1, a false one contributes 0.
                    matched += static_cast<int>(string1[pos] == string2[pos]);
                    ++pos;
            }

            return matched;
    }

（等）
用于矢量化的编译器标志：
-ftree-vectorize

-ftree-vectorize -march=<your_architecture>
（使用计算机上可用的所有指令集扩展，而不仅仅是像x86-64的SSE2这样的基线）。使用-march=native
为运行编译器的机器进行优化。）-march=
还设置-mtune=
，这也是一件好事
使用SSEx内部函数：

填充并将缓冲区对齐到16字节（根据实际使用的向量大小）
使用_mm_set1_epi8(0)创建累加器countU8

对于所有n/16输入（子）向量，执行以下操作：

使用_mm_load_si128或_mm_loadu_si128（对于未对齐的加载）从两个字符串各加载16个字符

使用_mm_cmpeq_epi8并行比较各字节。每个匹配产生0xFF
（-1），否则产生0x00

使用_mm_sub_epi8（减-1等于加1）从countU8
中减去上述结果向量
每经过255次循环迭代之后，必须把16个8位计数器提取到更大的整数类型中，以防止溢出。具体做法请参见这个很好的答案中的“解包和水平添加”：


代码：
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#ifdef__SSE2__
#如果！已定义（uintpttr_MAX）| |！已定义（UINT64_最大值）| |！已定义（UINT32_最大值）
#错误“未定义限制宏”
#恩迪夫
#如果uintpttr_MAX==UINT64_MAX
#定义PTR_64
#elif uintpttr_MAX==UINT32_MAX
#定义PTR_32
#否则
#错误“不支持当前UINTPTR_MAX”
#恩迪夫
模板
无效打印向量（标准：：ostream&out，常数m128i&vec）
{
静态断言（sizeof（vec）%sizeof（T）=0，“无效元素大小”）；
输入通常是什么样的？大小，它们是变量字符串还是文本字符串？还有，需要这个函数的原因是什么？它的“深层含义”是什么在您的系统中？您是否尝试使用-msse等标志？并在事件前后测量性能？请参阅，我尝试了-msse，但在运行时未测量任何差异。这两个字符串保证具有相同的长度。大小差异很大。@Petesh:OP使用了-march=native
，这意味着-mfoo
标志s他的CPU支持。@Fanael医生是这么说的，但我并不真正相信-march=native
会做正确的事情（这是从gcc旧版本的经验中得出的，现在可能不是这样）我看了一下拆卸的其余部分，好吧，让我们说还有改进的余地。使用大型数据集时，速度仅提高了约5%：（感谢thoughsimfoo的建议，根据Stephen Canon的建议手工编写一个矢量化代码，在分别累加256个值后再减为一个单独的最终和。这将从内部循环中剔除部分代码。GCC在这方面的努力真的很可怜。您可能能够得到它，以避免所有扩大的转换。）如果您使用无符号字符内部累加器，那么您最好编写一些内部函数。谢谢，至少现在我知道这是有可能的，而不是使用掩码对pcmpeqb的结果进行ANDing，然后将其添加到累加器中，您还可以从累加器中减去结果，从而在循环中保存一条指令。谢谢很好，这真的很有帮助：）你可以使用psadbw更有效地进行水平加法，使用0，然后从高到低移动64位并进行加法。我们编写这个循环好吗？我的实现：它使用255次迭代的内循环。我喜欢最后的内循环，它只有7条指令：
movdqa  16(%rsp), %xmm1
movl    $.LC2, %esi
pxor    %xmm2, %xmm2
movzbl  416(%rsp), %edx
movdqa  .LC1(%rip), %xmm3
pcmpeqb 224(%rsp), %xmm1
cmpb    %dl, 208(%rsp)
movzbl  417(%rsp), %eax
movl    $1, %edi
pand    %xmm3, %xmm1
movdqa  %xmm1, %xmm5
sete    %dl
movdqa  %xmm1, %xmm4
movzbl  %dl, %edx
punpcklbw   %xmm2, %xmm5
punpckhbw   %xmm2, %xmm4
pxor    %xmm1, %xmm1
movdqa  %xmm5, %xmm6
movdqa  %xmm5, %xmm0
movdqa  %xmm4, %xmm5
punpcklwd   %xmm1, %xmm6

#include <iostream>
#include <vector>

#include <cassert>
#include <cstdint>
#include <climits>
#include <cstring>

#include <emmintrin.h>

#ifdef __SSE2__

// Stop the build early when the integer limit macros tested below are missing
// (they are expected to come from the <cstdint> include above).
#if !defined(UINTPTR_MAX) ||  !defined(UINT64_MAX) ||  !defined(UINT32_MAX)
#  error "Limit macros are not defined"
#endif

// Record the pointer width as a feature macro: PTR_64 when UINTPTR_MAX equals
// UINT64_MAX, PTR_32 when it equals UINT32_MAX. Any other width is rejected.
#if UINTPTR_MAX == UINT64_MAX
    #define PTR_64
#elif UINTPTR_MAX == UINT32_MAX
    #define PTR_32
#else
#  error "Current UINTPTR_MAX is not supported"
#endif

/// Writes the lanes of a 128-bit vector to a stream as `{hi,...,lo}` followed
/// by a newline and a flush.
///
/// Lanes are printed from the highest-addressed element down to the lowest, so
/// the text reads in the same order as the arguments of `_mm_set_epi*`.
///
/// @tparam T   Lane type; sizeof(T) must divide the vector size evenly.
/// @param out  Stream that receives the text.
/// @param vec  Vector whose bytes are viewed as sizeof(vec)/sizeof(T) lanes of T.
template<typename T>
void print_vector(std::ostream& out,const __m128i& vec)
{
    static_assert(sizeof(vec) % sizeof(T) == 0,"Invalid element size");
    const size_t laneCount = sizeof(vec) / sizeof(T);

    // Copy the lanes out instead of aliasing the vector through a T pointer;
    // this also avoids forming a pointer one element before the object.
    T lanes[laneCount];
    std::memcpy(lanes, &vec, sizeof(vec));

    out << '{';
    for(size_t i = laneCount; i-- > 0; )
    {
        if(i + 1 != laneCount)
            out << ',';
        // Unary plus promotes char-sized lanes so they print as numbers.
        out << +lanes[i];
    }
    out << '}' << std::endl;
}

// Debug helper: prints the spelling of the _VEC expression, then " : ", then the
// vector's lanes interpreted as _TYPE via print_vector, all on std::cout.
#define PRINT_VECTOR(_TYPE,_VEC) do{  std::cout << #_VEC << " : "; print_vector<_TYPE>(std::cout,_VEC);    } while(0)

///@note SSE2 required (macro: __SSE2__)
///@warning Not tested!
///
/// Counts the byte positions at which two arrays of 16-byte vectors are equal.
///
/// Every iteration reduces the per-byte match flags to two 64-bit partial sums
/// right away, so no 8-bit counter can overflow and no chunking is needed.
/// In builds without NDEBUG the running sums are printed after each vector.
///
/// @param a_in   First array; must be non-null and 16-byte aligned.
/// @param b_in   Second array; must be non-null and 16-byte aligned.
/// @param count  Number of 16-byte vectors to compare in each array.
/// @return       Number of equal bytes among the count * 16 compared positions.
size_t counteq_epi8(const __m128i* a_in,const __m128i* b_in,size_t count)
{
    assert(a_in != nullptr && (uintptr_t(a_in) % 16) == 0);
    assert(b_in != nullptr && (uintptr_t(b_in) % 16) == 0);

    const __m128i zero = _mm_setzero_si128();

    // Two 64-bit running totals, one per 8-byte half of the vector.
    __m128i sum2xU64 = zero;
    for(size_t i = 0; i < count; ++i)
    {
        // Equal bytes compare to 0xFF (-1); 0 - (-1) turns each match into 1.
        const __m128i equalMask = _mm_cmpeq_epi8(a_in[i],b_in[i]);
        const __m128i ones      = _mm_sub_epi8(zero,equalMask);

        // psadbw against zero adds up the eight bytes of each half.
        sum2xU64 = _mm_add_epi64(sum2xU64,_mm_sad_epu8(ones,zero));
#ifndef NDEBUG
        PRINT_VECTOR(uint16_t,sum2xU64);
#endif
    }

    // Fold the upper 64-bit total onto the lower one.
    const __m128i upperHalf = _mm_srli_si128(sum2xU64,64/8);
    sum2xU64 = _mm_add_epi64(sum2xU64,upperHalf);
#ifndef NDEBUG
    std::cout << "----------------------------------------" << std::endl;
    PRINT_VECTOR(uint16_t,sum2xU64);
#endif

#if !defined(UINTPTR_MAX) ||  !defined(UINT64_MAX) ||  !defined(UINT32_MAX)
#  error "Limit macros are not defined"
#endif

    // Extract the low lane with the widest move the pointer width allows.
#if defined PTR_64
    return _mm_cvtsi128_si64(sum2xU64);
#elif defined PTR_32
    return _mm_cvtsi128_si32(sum2xU64);
#else
#  error "macro PTR_(32|64) is not set"
#endif

}

#endif

/// Demo driver: fills two equally sized vector buffers with identical data and
/// prints how many bytes counteq_epi8 reports as equal.
int main(int argc, char* argv[])
{
    // 64 vectors of 16 bytes each per buffer.
    std::vector<__m128i> a(64);
    std::vector<__m128i> b(a.size());
    const size_t nBytes = a.size() * sizeof(__m128i);

    // Byte-wise views used to fill the buffers.
    char* const a_out = reinterpret_cast<char*>(a.data());
    char* const b_out = reinterpret_cast<char*>(b.data());

    std::memset(a_out,0,nBytes);
    std::memset(b_out,0,nBytes);

    // Set the very last byte of both buffers to the same non-zero value.
    a_out[nBytes - 1] = 1;
    b_out[nBytes - 1] = 1;

    const size_t equalBytes = counteq_epi8(a.data(),b.data(),a.size());
    std::cout << "equalBytes = " << equalBytes << std::endl;

    return 0;
}

/// Counts the byte positions at which two arrays of 16-byte vectors are equal.
///
/// Matches are first accumulated in sixteen 8-bit lanes. A lane can absorb at
/// most 255 increments, so the input is processed in chunks of up to 255
/// vectors and each chunk is widened into two 64-bit totals before the 8-bit
/// lanes are reused.
///
/// @param a_in   First array; 16-byte aligned, and non-null when count > 0.
/// @param b_in   Second array; 16-byte aligned, and non-null when count > 0.
/// @param count  Number of 16-byte vectors to compare in each array.
/// @return       Number of equal bytes among the count * 16 compared positions.
size_t counteq_epi8(const __m128i* a_in,const __m128i* b_in,size_t count)
{
    assert((count > 0 ? a_in != nullptr : true) && (uintptr_t(a_in) % sizeof(__m128i)) == 0);
    assert((count > 0 ? b_in != nullptr : true) && (uintptr_t(b_in) % sizeof(__m128i)) == 0);

    const size_t  chunkLimit = 255;
    const __m128i allZero    = _mm_setzero_si128();

    __m128i totals = allZero;   // two 64-bit sums, one per 8-byte half

    size_t remaining = count;
    while(remaining != 0)
    {
        const size_t chunk = remaining < chunkLimit ? remaining : chunkLimit;

        // Equal bytes compare to 0xFF (-1); subtracting that adds 1 per match.
        __m128i laneCounts = allZero;
        for(const __m128i* const stop = a_in + chunk; a_in != stop; ++a_in, ++b_in)
        {
            laneCounts = _mm_sub_epi8(laneCounts,_mm_cmpeq_epi8(*a_in,*b_in));
        }

        // psadbw against zero sums the eight 8-bit counters of each half.
        totals = _mm_add_epi64(totals,_mm_sad_epu8(laneCounts,allZero));
        remaining -= chunk;
    }

    // Fold the upper 64-bit total onto the lower one.
    const __m128i folded = _mm_add_epi64(totals,_mm_srli_si128(totals,64/8));

#if UINTPTR_MAX == UINT64_MAX
    return _mm_cvtsi128_si64(folded);
#elif UINTPTR_MAX == UINT32_MAX
    return _mm_cvtsi128_si32(folded);
#else
#  error "macro PTR_(32|64) is not set"
#endif
}