C 快速计数_m128i寄存器中的设置位数

C 快速计数_m128i寄存器中的设置位数,c,sse,simd,sse2,hammingweight,C,Sse,Simd,Sse2,Hammingweight,我应该计算uum128i寄存器的设置位数。 特别是,我应该使用以下方法编写两个能够计算寄存器位数的函数 寄存器的设置位总数 寄存器每个字节的设置位数 是否存在可以全部或部分执行上述操作的内在函数?编辑:我想我不明白OP在寻找什么,但我会保留我的答案,以防它对遇到此问题的其他人有用 C提供了一些不错的按位操作 下面是计算整数中设置的位数的代码: countBitsSet(int toCount) { int numBitsSet = 0; while(toCount != 0)

我需要统计 __m128i 寄存器中被置位(值为 1)的位数。特别是,我需要编写两个函数,分别按以下两种方式统计寄存器中的置位数:

  • 寄存器的设置位总数
  • 寄存器每个字节的设置位数

  • 是否存在可以全部或部分执行上述操作的内在函数?

    编辑:我想我不明白OP在寻找什么,但我会保留我的答案,以防它对遇到此问题的其他人有用

    C提供了一些不错的按位操作

    下面是计算整数中设置的位数的代码:

    /*
     * Count the number of 1-bits in the binary representation of toCount.
     *
     * The value is reinterpreted as unsigned before the loop: right-shifting a
     * negative signed int is implementation-defined and, with the usual
     * arithmetic shift, never reaches zero, so the loop would not terminate.
     *
     * Returns a value in the range 0 .. (number of bits in an int).
     */
    int countBitsSet(int toCount)
    {
        unsigned int bits = (unsigned int)toCount;
        int numBitsSet = 0;
        while (bits != 0)
        {
            /* The lowest bit is 1 exactly when the remaining value is odd. */
            numBitsSet += (int)(bits & 1u);
            bits >>= 1;
        }
        return numBitsSet;
    }
    
    说明:

    toCount % 2
    
    返回整数中的最后一位。(除以2并检查余数)。我们将其添加到总计数中,然后将toCount值的位移位1。应继续此操作,直到toCount中没有设置更多位为止(当toCount等于0时)

    要计算特定字节中的位数,您需要使用掩码。以下是一个例子:

    /*
     * Count the 1-bits inside a single byte of toCount.
     *
     * byteNumber selects the byte, 0 being the least significant one.
     * An index outside the bytes of an int yields 0 instead of performing an
     * out-of-range shift (which would be undefined behaviour).
     *
     * A full byte needs the mask 0xFF; 0x0F would only cover the low nibble.
     */
    int countBitsInByte(int toCount, int byteNumber)
    {
        if (byteNumber < 0 || byteNumber >= (int)sizeof(toCount))
        {
            return 0;
        }

        /* Move the requested byte to the bottom and isolate it. */
        unsigned int byteValue = ((unsigned int)toCount >> (byteNumber * 8)) & 0xFFu;

        int numBitsSet = 0;
        while (byteValue != 0)
        {
            numBitsSet += (int)(byteValue & 1u);
            byteValue >>= 1;
        }
        return numBitsSet;
    }
    
    以下是我在旧项目中使用的一些代码。下面的函数
    popcnt8
    计算每个字节中设置的位数

    仅SSE2版本(基于中的算法3):

    SSSE3版本(到期):

    XOP版本(相当于SSSE3,但使用AMD推土机上更快的XOP指令)

    下面的函数
    popcnt64
    统计SSE寄存器的低位和高位64位部分的位数:

    /*
     * Total number of set bits in the 128-bit register n.
     *
     * popcnt64() supplies one count per 64-bit half; the upper half's count is
     * copied down next to the lower one, the two are added, and the low 32 bits
     * of the sum are returned.
     */
    static inline int popcnt128(__m128i n) {
        const __m128i halves = popcnt64(n);
        /* Duplicate the upper 64-bit lane into the lower lane. */
        const __m128i upper = _mm_unpackhi_epi64(halves, halves);
        return _mm_cvtsi128_si32(_mm_add_epi32(halves, upper));
    }
    
    SSE2版本:

    static inline __m128i popcnt64(__m128i n) {
        const __m128i cnt8 = popcnt8(n);
        return _mm_sad_epu8(cnt8, _mm_setzero_si128());
    }
    
    XOP版本:

    static inline __m128i popcnt64(__m128i n) {
        const __m128i cnt8 = popcnt8(n);
        return _mm_haddq_epi8(cnt8);
    }
    
    最后,下面的函数
    popcnt128
    计算整个128位寄存器中的位数:

    /*
     * Total number of set bits in the 128-bit register n.
     *
     * popcnt64() supplies one count per 64-bit half; the upper half's count is
     * copied down next to the lower one, the two are added, and the low 32 bits
     * of the sum are returned.
     */
    static inline int popcnt128(__m128i n) {
        const __m128i halves = popcnt64(n);
        /* Duplicate the upper 64-bit lane into the lower lane. */
        const __m128i upper = _mm_unpackhi_epi64(halves, halves);
        return _mm_cvtsi128_si32(_mm_add_epi32(halves, upper));
    }
    
    但是,实现
    popcnt128
    的更有效方法是使用硬件POPCNT指令(在支持它的处理器上):


    正如第一条评论中所说,gcc 3.4+提供了一个方便的访问(希望是最佳的)内置via的途径

    int __builtin_popcount (unsigned int x) /* Returns the number of 1-bits in x. */
    
    如下文所述:

    并没有准确地回答128位的问题,但很好地回答了我在这里登陆时遇到的问题:)

    这是一个基于的版本,命名类似于其他内在函数,以及一些额外的16、32和64位向量函数

    /* NOTE(review): newer compiler versions of this header appear to declare
       AVX-512 intrinsics named _mm_popcnt_epi8/16/32/64; the same-named
       definitions below may clash with them -- verify with your toolchain. */
    #include "immintrin.h"
    
    /* bit masks: 0x55 = 01010101, 0x33 = 00110011, 0x0f = 00001111 */
    /* m1..m3 repeat one byte pattern across all 16 bytes of the register.
       m4 keeps the low 5 bits (0x1f) of every 16-bit lane, m5 keeps the low
       6 bits (0x3f) of every 32-bit lane.
       NOTE(review): initialising __m128i from two 64-bit values in braces
       presumably relies on GCC/Clang vector types -- confirm before building
       with another compiler. */
    static const __m128i m1 = {0x5555555555555555ULL,0x5555555555555555ULL};
    static const __m128i m2 = {0x3333333333333333ULL,0x3333333333333333ULL};
    static const __m128i m3 = {0x0f0f0f0f0f0f0f0fULL,0x0f0f0f0f0f0f0f0fULL};
    static const __m128i m4 = {0x001f001f001f001fULL,0x001f001f001f001fULL};
    static const __m128i m5 = {0x0000003f0000003fULL,0x0000003f0000003fULL};
    
    /*
     * Per-byte population count: every byte of the result holds the number of
     * 1-bits (0..8) of the corresponding byte of x.
     *
     * Classic SWAR reduction in three steps (pairs -> nibbles -> bytes).
     * SSE2 has no per-byte shift, so the 64-bit shift is used; bits that cross
     * a byte boundary are removed by the masks.  No intermediate byte can
     * overflow (the largest value ever held is 0x88), so ordinary wrapping
     * add/sub is used rather than the saturating forms.
     */
    __m128i _mm_popcnt_epi8(__m128i x) {
        const __m128i mask_pairs   = _mm_set1_epi8(0x55); /* 01010101 */
        const __m128i mask_nibbles = _mm_set1_epi8(0x33); /* 00110011 */
        const __m128i mask_low4    = _mm_set1_epi8(0x0f); /* 00001111 */
        __m128i y;

        /* Step 1: each 2-bit field becomes the count of its two bits (0..2).
           For a pair "ab" (value 2a+b), subtracting a gives a+b. */
        y = _mm_and_si128(_mm_srli_epi64(x, 1), mask_pairs);
        x = _mm_sub_epi8(x, y);

        /* Step 2: add neighbouring 2-bit counts; each nibble now holds 0..4. */
        y = _mm_and_si128(_mm_srli_epi64(x, 2), mask_nibbles);
        x = _mm_and_si128(x, mask_nibbles);
        x = _mm_add_epi8(x, y);

        /* Step 3: add the high nibble onto the low nibble (0..8 per byte) and
           discard the high nibble, which holds garbage after the add. */
        y = _mm_srli_epi64(x, 4);
        x = _mm_add_epi8(x, y);
        return _mm_and_si128(x, mask_low4);
    }
    
    __m128i _mm_popcnt_epi16(__m128i x) {
        __m128i y;
        x = _mm_popcnt_epi8(x);    //get byte popcount
        y = _mm_srli_si128(x,1);   //copy even bytes for adding
        x = _mm_add_epi16(x,y);    //add even bytes into the odd bytes
        return _mm_and_si128(x,m4);//mask off the even byte and return
    }
    
    __m128i _mm_popcnt_epi32(__m128i x) {
        __m128i y;
        x = _mm_popcnt_epi16(x);   //get word popcount
        y = _mm_srli_si128(x,2);   //copy even words for adding
        x = _mm_add_epi32(x,y);    //add even words into odd words
        return _mm_and_si128(x,m5);//mask off the even words and return
    }
    
    __m128i _mm_popcnt_epi64(__m128i x){
        /* _mm_sad_epu8() is weird
           It takes the absolute difference of bytes between 2 __m128i
           then horizontal adds the lower and upper 8 differences
           and stores the sums in the lower and upper 64 bits
        */
        return _mm_sad_epu8(_mm_popcnt_epi8(x),(__m128i){0});
    }
    
    int _mm_popcnt_si128(__m128i x){
        x = _mm_popcnt_epi64(x);
        __m128i y = _mm_srli_si128(x,8);
        return _mm_add_epi64(x,y)[0];
        //alternative: __builtin_popcntll(x[0])+__builtin_popcntll(x[1]);
    }
    

    最近的CPU有一条
    POPCNT
    (population count,即置位计数)指令;GCC 通过内置函数将其公开。请参阅,了解更多信息。MS也有popcount函数…请参阅…注意,这些函数不一定比bithack快;如果在数组中计算位,则一些bithack函数会稍微快一些。有内置函数(映射到CPU指令的内部函数,如
    POPCNT
    ),问题是如何计算128位SSE(XMM)中的设置位注册,不是一个
    int
    。啊,我知道我没有完全理解这个问题。如果合适的话,我会编辑我的回答并保持它,以防有人无意中发现。C不提供“nice”按位运算。你甚至不能移植得到算术右移!实现可以是2的补码,但有符号类型上的
    >>
    可以是逻辑移位。但实际上,人们真正想要使用的所有编译器都会对有符号类型执行算术右移,因此当传入负数的
    toCount
    时,你的函数会陷入无限循环。有符号的
    %2
    比
    &1
    需要更多的工作,因为它必须为负奇数生成
    -1
    。但是(在普通编译器上)如果
    toCount
    为负数,则您的函数永远不会返回,因此问题被隐藏了……似乎您是上述研究论文的共同作者之一:-)剪贴工作人员的总结也很好。你的解决方案是最新的。哈克姆的把戏不再是最新的了。太好了,伙计!哦,太糟糕了。你在ACM上发表了你的论文,所以不幸的是,我不付15美元就看不懂了:-(@NilsPipenbrinck,论文可在会议网站上免费获得:conferences.computer.org/sc/2012/papers/1000a033.pdfappa事实上,你的SSE2版本通常比SSSE3版本快。SSSE3的指令更少并不重要。这里有一个基准:@Soonts它可能是,但单凭Microsoft编译器的结果并不矛盾vincing。为什么第一步之后的步骤需要饱和
    adds
    而不是常规的
    add
    (尽管根据Agner Fog的指令表,
    paddusb
    在所有方面都与
    paddb
    具有相同的性能,因此没有性能理由避免饱和add。这只是令人惊讶。)
    /*
     * Total number of set bits in the 128-bit register n, computed with two
     * scalar 64-bit population counts (one per half).
     *
     * On non-MSVC compilers the GCC/Clang builtin __builtin_popcountll is used:
     * it needs no extra header, compiles for any target, and is lowered to the
     * hardware POPCNT instruction when that is enabled (e.g. -mpopcnt).
     * _mm_cvtsi128_si64 is only available when building for x86-64.
     */
    static inline int popcnt128(__m128i n) {
        /* Copy the upper 64 bits into the lower lane so they can be extracted. */
        const __m128i n_hi = _mm_unpackhi_epi64(n, n);
        #ifdef _MSC_VER
            return (int)(__popcnt64(_mm_cvtsi128_si64(n)) + __popcnt64(_mm_cvtsi128_si64(n_hi)));
        #else
            return __builtin_popcountll((unsigned long long)_mm_cvtsi128_si64(n))
                 + __builtin_popcountll((unsigned long long)_mm_cvtsi128_si64(n_hi));
        #endif
    }
    
    int __builtin_popcount (unsigned int x) /* Returns the number of 1-bits in x. */
    
    /* NOTE(review): newer compiler versions of this header appear to declare
       AVX-512 intrinsics named _mm_popcnt_epi8/16/32/64; the same-named
       definitions below may clash with them -- verify with your toolchain. */
    #include "immintrin.h"
    
    /* bit masks: 0x55 = 01010101, 0x33 = 00110011, 0x0f = 00001111 */
    /* m1..m3 repeat one byte pattern across all 16 bytes of the register.
       m4 keeps the low 5 bits (0x1f) of every 16-bit lane, m5 keeps the low
       6 bits (0x3f) of every 32-bit lane.
       NOTE(review): initialising __m128i from two 64-bit values in braces
       presumably relies on GCC/Clang vector types -- confirm before building
       with another compiler. */
    static const __m128i m1 = {0x5555555555555555ULL,0x5555555555555555ULL};
    static const __m128i m2 = {0x3333333333333333ULL,0x3333333333333333ULL};
    static const __m128i m3 = {0x0f0f0f0f0f0f0f0fULL,0x0f0f0f0f0f0f0f0fULL};
    static const __m128i m4 = {0x001f001f001f001fULL,0x001f001f001f001fULL};
    static const __m128i m5 = {0x0000003f0000003fULL,0x0000003f0000003fULL};
    
    /*
     * Per-byte population count: every byte of the result holds the number of
     * 1-bits (0..8) of the corresponding byte of x.
     *
     * Classic SWAR reduction in three steps (pairs -> nibbles -> bytes).
     * SSE2 has no per-byte shift, so the 64-bit shift is used; bits that cross
     * a byte boundary are removed by the masks.  No intermediate byte can
     * overflow (the largest value ever held is 0x88), so ordinary wrapping
     * add/sub is used rather than the saturating forms.
     */
    __m128i _mm_popcnt_epi8(__m128i x) {
        const __m128i mask_pairs   = _mm_set1_epi8(0x55); /* 01010101 */
        const __m128i mask_nibbles = _mm_set1_epi8(0x33); /* 00110011 */
        const __m128i mask_low4    = _mm_set1_epi8(0x0f); /* 00001111 */
        __m128i y;

        /* Step 1: each 2-bit field becomes the count of its two bits (0..2).
           For a pair "ab" (value 2a+b), subtracting a gives a+b. */
        y = _mm_and_si128(_mm_srli_epi64(x, 1), mask_pairs);
        x = _mm_sub_epi8(x, y);

        /* Step 2: add neighbouring 2-bit counts; each nibble now holds 0..4. */
        y = _mm_and_si128(_mm_srli_epi64(x, 2), mask_nibbles);
        x = _mm_and_si128(x, mask_nibbles);
        x = _mm_add_epi8(x, y);

        /* Step 3: add the high nibble onto the low nibble (0..8 per byte) and
           discard the high nibble, which holds garbage after the add. */
        y = _mm_srli_epi64(x, 4);
        x = _mm_add_epi8(x, y);
        return _mm_and_si128(x, mask_low4);
    }
    
    __m128i _mm_popcnt_epi16(__m128i x) {
        __m128i y;
        x = _mm_popcnt_epi8(x);    //get byte popcount
        y = _mm_srli_si128(x,1);   //copy even bytes for adding
        x = _mm_add_epi16(x,y);    //add even bytes into the odd bytes
        return _mm_and_si128(x,m4);//mask off the even byte and return
    }
    
    __m128i _mm_popcnt_epi32(__m128i x) {
        __m128i y;
        x = _mm_popcnt_epi16(x);   //get word popcount
        y = _mm_srli_si128(x,2);   //copy even words for adding
        x = _mm_add_epi32(x,y);    //add even words into odd words
        return _mm_and_si128(x,m5);//mask off the even words and return
    }
    
    __m128i _mm_popcnt_epi64(__m128i x){
        /* _mm_sad_epu8() is weird
           It takes the absolute difference of bytes between 2 __m128i
           then horizontal adds the lower and upper 8 differences
           and stores the sums in the lower and upper 64 bits
        */
        return _mm_sad_epu8(_mm_popcnt_epi8(x),(__m128i){0});
    }
    
    int _mm_popcnt_si128(__m128i x){
        x = _mm_popcnt_epi64(x);
        __m128i y = _mm_srli_si128(x,8);
        return _mm_add_epi64(x,y)[0];
        //alternative: __builtin_popcntll(x[0])+__builtin_popcntll(x[1]);
    }