Algorithm 如何计算32位整数中的设置位数？_Algorithm_Binary_Bit Manipulation_Hammingweight_Iec10967

Algorithm 如何计算32位整数中的设置位数？

algorithm binary

Algorithm 如何计算32位整数中的设置位数？,algorithm,binary,bit-manipulation,hammingweight,iec10967,Algorithm,Binary,Bit Manipulation,Hammingweight,Iec10967,表示数字7的8位如下所示： 00000111 设置了三位确定32位整数中设置位数的算法有哪些？这称为“”、“popcount”或“侧向加法” 一些CPU有一条内置指令来执行此操作，而另一些CPU有作用于位向量的并行指令。像x86这样的指令（在支持它的CPU上）对于单个整数几乎肯定是最快的。其他一些架构可能有一个使用微代码循环实现的慢指令，该循环每周期测试一位（需要引用-硬件popcount通常很快，如果它存在的话） “最佳”算法实际上取决于您使用的CPU以及您的使用模式你的编译器可能知道如

表示数字7的8位如下所示：

00000111

设置了三位

确定32位整数中设置位数的算法有哪些？

这称为“”、“popcount”或“侧向加法”

一些CPU有一条内置指令来执行此操作，而另一些CPU有作用于位向量的并行指令。像x86这样的指令（在支持它的CPU上）对于单个整数几乎肯定是最快的。其他一些架构可能有一个使用微代码循环实现的慢指令，该循环每周期测试一位（需要引用-硬件popcount通常很快，如果它存在的话）

“最佳”算法实际上取决于您使用的CPU以及您的使用模式

<>你的编译器可能知道如何做一些对你正在编译的特定CPU来说是好的，例如，或者C++，作为一种可移植的方法来访问内置/内部函数（参见这个问题）。但是，对于没有硬件popcnt的目标CPU，编译器选择的回退可能并不适合您的用例。或者，您的语言（例如C）可能不会公开任何可移植函数，这些函数可能会在存在CPU特定popcount时使用CPU特定popcount

不需要（或受益于）任何硬件支持的可移植算法如果您的CPU有一个大的缓存，并且您在一个紧密的循环中执行大量这些操作，那么预填充的表查找方法可以非常快。但是，由于“缓存未命中”的代价，CPU必须从主内存中获取一些表，因此它可能会受到影响。（分别查找每个字节以保持表小。）如果希望popcount用于连续的数字范围，则对于256个数字的组，只有低字节在更改

如果您知道您的字节大部分是0或1，那么针对这些场景有一些有效的算法，例如，在循环中使用bithack清除最低的集合，直到它变为零

我相信一个非常好的通用算法如下，称为“并行”或“可变精度SWAR算法”。我用C语言表示了这一点，你可能需要调整它来适应特定语言（例如在爪哇使用C++和u>） GCC10和clang 10.0可以识别此模式/习惯用法，并在可用时将其编译为硬件popcnt或等效指令，从而让您两全其美。（）

int numberofsetbit（uint32\u t i）
{
//Java：使用int，并使用>>>而不是>>。或者使用Integer.bitCount（）
//C或C++：使用uint32\u t
i=i-（（i>>1）&0x5555）；//添加位对
i=（i&0x33333333）+（i>>2&0x33333333）；//四边形
i=（i+（i>>4））&0x0f0f；//一组8个
返回（i*0x01010101）>>24；//字节的水平和
}

对于JavaScript:with

|0

对于性能：将第一行更改为

i=（i | 0）-（i>>1）和0x5555）
这是所讨论的任何算法中最好的最坏情况行为，因此将有效地处理您向其抛出的任何使用模式或值。（它的性能不依赖于普通的CPU，在CPU中，包括乘法在内的所有整数运算都是常数时间。使用“简单”输入，它的速度不会更快，但仍然相当不错。）
参考资料：







SWAR bithack的工作原理：
第一步是屏蔽的优化版本，以隔离奇偶位，移动以对齐它们，然后添加。这在2位累加器中有效地进行了16次单独的加法（）。类似于（i&0x555555）+（（i>>1）&0x555555）

下一步将使用这些16x 2位累加器中的奇数/偶数八个累加器，然后再次相加，生成8 x 4位和。这次不可能进行i-…
优化，因此它只会在移位前/移位后屏蔽。在编译需要在寄存器中分别构造32位常量的ISA时，在移位之前使用相同的0x33…
常量，而不是0xccc…
，这是一件好事
（i+（i>>4））&0x0f0f
的最后移位和添加步骤扩展到4x 8位累加器。它在加法之后而不是之前屏蔽，因为如果设置了相应输入位的所有4位，则任何4位累加器中的最大值都是4
。4+4=8，仍然适合4位，因此在i+（i>>4）
中，半字节元素之间的进位是不可能的
到目前为止，这只是使用SWAR技术的非常普通的SIMD，并进行了一些巧妙的优化。继续使用相同的模式再执行两个步骤可以将计数扩大到2倍16位，然后是1倍32位。但在具有快速硬件乘法的机器上有一种更有效的方法：
一旦我们的“元素”足够少，一个带有魔法常数的乘法可以将所有元素相加到顶部元素中。在本例中为字节元素。乘法是通过左移和加法完成的，因此乘以x*0x01010101
会得到x+（x为什么不迭代除以2
count = 0
while n > 0
  if (n % 2) == 1
    count += 1
  n /= 2
计数=0
当n>0时
如果（n%2）==1
计数+=1
n/=2
我同意这不是最快的，但“最好”有点含糊不清。我认为“最好”应该有一个清晰的元素
如果您碰巧使用Java，内置的方法Integer.bitCount
可以做到这一点。
一些语言可移植地以一种可以使用有效硬件支持（如果可用）的方式公开操作，否则一些库可能会退却，这很不错
例如（来自）：

C++具有std:：bitset:：count（），或
Java有Java.lang.Integer.bitCount（）
（也适用于Long或BigInteger）
C#hasSystem.Numerics.BitOperations.PopCount（）
Python具有int.bit\u count（）

不是所有的编译器
count = 0
while n > 0
  if (n % 2) == 1
    count += 1
  n /= 2
       static final int[] BIT_COUNT = { 0, 1, 1, ... 256 values with a bitsize of a byte ... };
        static int bitCountOfByte( int value ){
            return BIT_COUNT[ value & 0xFF ];
        }

        static int bitCountOfInt( int value ){
            return bitCountOfByte( value ) 
                 + bitCountOfByte( value >> 8 ) 
                 + bitCountOfByte( value >> 16 ) 
                 + bitCountOfByte( value >> 24 );
        }

int pop(unsigned x)
{
    x = x - ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x + (x >> 4)) & 0x0F0F0F0F;
    x = x + (x >> 8);
    x = x + (x >> 16);
    return x & 0x0000003F;
}

unsigned int bitCount (unsigned int value) {
    unsigned int count = 0;
    while (value > 0) {           // until all bits are zero
        if ((value & 1) == 1)     // check lower bit
            count++;
        value >>= 1;              // shift bits, removing lower bit
    }
    return count;
}

// Lookup table for fast calculation of bits set in 8-bit unsigned char.

static unsigned char oneBitsInUChar[] = {
//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F (<- n)
//  =====================================================
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, // 0n
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, // 1n
    : : :
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, // Fn
};

// Function for fast calculation of bits set in 16-bit unsigned short.

unsigned char oneBitsInUShort (unsigned short x) {
    return oneBitsInUChar [x >>    8]
         + oneBitsInUChar [x &  0xff];
}

// Function for fast calculation of bits set in 32-bit unsigned int.

unsigned char oneBitsInUInt (unsigned int x) {
    return oneBitsInUShort (x >>     16)
         + oneBitsInUShort (x &  0xffff);
}

int bitcount(unsigned int num){
    int count = 0;
    static int nibblebits[] =
        {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
    for(; num != 0; num >>= 4)
        count += nibblebits[num & 0x0f];
    return count;
}

#define BITCOUNT(x)    (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255)
#define BX_(x)         ((x) - (((x)>>1)&0x77777777)
                             - (((x)>>2)&0x33333333)
                             - (((x)>>3)&0x11111111))
static unsigned char wordbits[65536] = { bitcounts of ints between 0 and 65535 };
static int popcount( unsigned int i )
{
    return( wordbits[i&0xFFFF] + wordbits[i>>16] );
}

inline int pop2(unsigned x, unsigned y)
{
    x = x - ((x >> 1) & 0x55555555);
    y = y - ((y >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    y = (y & 0x33333333) + ((y >> 2) & 0x33333333);
    x = (x + (x >> 4)) & 0x0F0F0F0F;
    y = (y + (y >> 4)) & 0x0F0F0F0F;
    x = x + (x >> 8);
    y = y + (y >> 8);
    x = x + (x >> 16);
    y = y + (y >> 16);
    return (x+y) & 0x000000FF;
}
inline int pop2(unsigned long x, unsigned long y)
{
    x = x - ((x >> 1) & 0x5555555555555555);
    y = y - ((y >> 1) & 0x5555555555555555);
    x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333);
    y = (y & 0x3333333333333333) + ((y >> 2) & 0x3333333333333333);
    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0F;
    y = (y + (y >> 4)) & 0x0F0F0F0F0F0F0F0F;
    x = x + y; 
    x = x + (x >> 8);
    x = x + (x >> 16);
    x = x + (x >> 32); 
    return x & 0xFF;
}
inline int pop4(unsigned long x, unsigned long y, 
                unsigned long u, unsigned long v)
{
  enum { m1 = 0x5555555555555555, 
         m2 = 0x3333333333333333, 
         m3 = 0x0F0F0F0F0F0F0F0F, 
         m4 = 0x000000FF000000FF };

    x = x - ((x >> 1) & m1);
    y = y - ((y >> 1) & m1);
    u = u - ((u >> 1) & m1);
    v = v - ((v >> 1) & m1);
    x = (x & m2) + ((x >> 2) & m2);
    y = (y & m2) + ((y >> 2) & m2);
    u = (u & m2) + ((u >> 2) & m2);
    v = (v & m2) + ((v >> 2) & m2);
    x = x + y; 
    u = u + v; 
    x = (x & m3) + ((x >> 4) & m3);
    u = (u & m3) + ((u >> 4) & m3);
    x = x + u; 
    x = x + (x >> 8);
    x = x + (x >> 16);
    x = x & m4; 
    x = x + (x >> 32);
    return x & 0x000001FF;
}
hitime b4 = rdtsc(); 
   for (unsigned long i = 10L * 1000*1000*1000; i < 11L * 1000*1000*1000; ++i) 
      sum += pop4 (i,  i^1, ~i, i|1); 
   hitime e4 = rdtsc();
count = 0
while n != 0
if ((n % 2) == 1 || (n % 2) == -1
    count += 1
  n /= 2  
return count

int bit_count(int num)
{
    int count=0;
    while(num)
    {
        num=(num)&(num-1);
        count++;
    }
    return count;
}

#define BitCount(X,Y)           \
                Y = X - ((X >> 1) & 033333333333) - ((X >> 2) & 011111111111); \
                Y = ((Y + (Y >> 3)) & 030707070707); \
                Y =  (Y + (Y >> 6)); \
                Y = (Y + (Y >> 12) + (Y >> 24)) & 077;

input output
AB    CD             Note
00    00             = AB
01    01             = AB
10    01             = AB - (A >> 1) & 0x1
11    10             = AB - (A >> 1) & 0x1

Integer.highestOneBit(n);
Integer.lowestOneBit(n);
Integer.numberOfLeadingZeros(n);
Integer.numberOfTrailingZeros(n);

//Beginning with the value 1, rotate left 16 times
     n = 1;
         for (int i = 0; i < 16; i++) {
            n = Integer.rotateLeft(n, 1);
            System.out.println(n);
         }

unsigned int v; // count the number of bits set in v
unsigned int c; // c accumulates the total bits set in v

// option 1, for at most 14-bit values in v:
c = (v * 0x200040008001ULL & 0x111111111111111ULL) % 0xf;

// option 2, for at most 24-bit values in v:
c =  ((v & 0xfff) * 0x1001001001001ULL & 0x84210842108421ULL) % 0x1f;
c += (((v & 0xfff000) >> 12) * 0x1001001001001ULL & 0x84210842108421ULL) 
     % 0x1f;

// option 3, for at most 32-bit values in v:
c =  ((v & 0xfff) * 0x1001001001001ULL & 0x84210842108421ULL) % 0x1f;
c += (((v & 0xfff000) >> 12) * 0x1001001001001ULL & 0x84210842108421ULL) % 
     0x1f;
c += ((v >> 24) * 0x1001001001001ULL & 0x84210842108421ULL) % 0x1f;

// recursive template to sum bits in an int
template <int BITS>
int countBits(int val) {
        // return the least significant bit plus the result of calling ourselves with
        // .. the shifted value
        return (val & 0x1) + countBits<BITS-1>(val >> 1);
}

// template specialisation to terminate the recursion when there's only one bit left
template<>
int countBits<1>(int val) {
        return val & 0x1;
}

// to count bits in a byte/char (this returns 8)
countBits<8>( 255 )

// another byte (this returns 7)
countBits<8>( 254 )

// counting bits in a word/short (this returns 1)
countBits<16>( 256 )

int countSetBits(int n) {
    return !n ? 0 : 1 + countSetBits(n & (n-1));
}

unsigned int count_bit(unsigned int x)
{
  x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
  x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
  x = (x & 0x0F0F0F0F) + ((x >> 4) & 0x0F0F0F0F);
  x = (x & 0x00FF00FF) + ((x >> 8) & 0x00FF00FF);
  x = (x & 0x0000FFFF) + ((x >> 16)& 0x0000FFFF);
  return x;
}

+-------------------------------+
| 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |  <- x
|  1 0  |  0 1  |  0 1  |  0 1  |  <- first time merge
|    0 0 1 1    |    0 0 1 0    |  <- second time merge
|        0 0 0 0 0 1 0 1        |  <- third time ( answer = 00000101 = 5)
+-------------------------------+

unsigned int f(unsigned int x)
{
    switch (x) {
        case 0:
            return 0;
        case 1:
            return 1;
        case 2:
            return 1;
        case 3:
            return 2;
        default:
            return f(x/4) + f(x%4);
    }
}

int popcount(int v) {
    v = v - ((v >> 1) & 0x55555555);                // put count of each 2 bits into those 2 bits
    v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // put count of each 4 bits into those 4 bits  
    return c = ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
}

v = v - ((v >> 1) & 0x55555555); 

 ---------------------------------------------
 |   v    |   (v >> 1) & 0b0101   |  v - x   |
 ---------------------------------------------
   0b00           0b00               0b00   
   0b01           0b00               0b01     
   0b10           0b01               0b01
   0b11           0b01               0b10

v = (v & 0x33333333) + ((v >> 2) & 0x33333333); 

v & 0b00110011         //masks out even two bits
(v >> 2) & 0b00110011  // masks out odd two bits

c = ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;

v + (v >> 4)

0b01000010 + 0b01000000

0b01100010 & 0xF0 = 0b01100000

unsigned int bitCount(unsigned int x)
{
  x = ((x >> 1) & 0b01010101010101010101010101010101)
     + (x       & 0b01010101010101010101010101010101);
  x = ((x >> 2) & 0b00110011001100110011001100110011)
     + (x       & 0b00110011001100110011001100110011); 
  x = ((x >> 4) & 0b00001111000011110000111100001111)
     + (x       & 0b00001111000011110000111100001111); 
  x = ((x >> 8) & 0b00000000111111110000000011111111)
     + (x       & 0b00000000111111110000000011111111); 
  x = ((x >> 16)& 0b00000000000000001111111111111111)
     + (x       & 0b00000000000000001111111111111111); 
  return x;
}

int NumberOfSetBits(int n)
{
    int count = 0;

    while (n){
        ++ count;
        n = (n - 1) & n;
    }

    return count;
}

public static class BitCount
{
    public static uint GetSetBitsCount(uint n)
    {
        var counts = BYTE_BIT_COUNTS;
        return n <= 0xff ? counts[n]
             : n <= 0xffff ? counts[n & 0xff] + counts[n >> 8]
             : n <= 0xffffff ? counts[n & 0xff] + counts[(n >> 8) & 0xff] + counts[(n >> 16) & 0xff]
             : counts[n & 0xff] + counts[(n >> 8) & 0xff] + counts[(n >> 16) & 0xff] + counts[(n >> 24) & 0xff];
    }

    public static readonly uint[] BYTE_BIT_COUNTS = 
    {
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
    };
}

#include <smmintrin.h>
#include <stdint.h>

const __m128i Z = _mm_set1_epi8(0x0);
const __m128i F = _mm_set1_epi8(0xF);
//Vector with pre-calculated bit count:
const __m128i T = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);

uint64_t BitCount(const uint8_t * src, size_t size)
{
    __m128i _sum =  _mm128_setzero_si128();
    for (size_t i = 0; i < size; i += 16)
    {
        //load 16-byte vector
        __m128i _src = _mm_loadu_si128((__m128i*)(src + i));
        //get low 4 bit for every byte in vector
        __m128i lo = _mm_and_si128(_src, F);
        //sum precalculated value from T
        _sum = _mm_add_epi64(_sum, _mm_sad_epu8(Z, _mm_shuffle_epi8(T, lo)));
        //get high 4 bit for every byte in vector
        __m128i hi = _mm_and_si128(_mm_srli_epi16(_src, 4), F);
        //sum precalculated value from T
        _sum = _mm_add_epi64(_sum, _mm_sad_epu8(Z, _mm_shuffle_epi8(T, hi)));
    }
    uint64_t sum[2];
    _mm_storeu_si128((__m128i*)sum, _sum);
    return sum[0] + sum[1];
}

#include <immintrin.h>
#include <stdint.h>

const __m256i Z = _mm256_set1_epi8(0x0);
const __m256i F = _mm256_set1_epi8(0xF);
//Vector with pre-calculated bit count:
const __m256i T = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 
                                   0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);

uint64_t BitCount(const uint8_t * src, size_t size)
{
    __m256i _sum =  _mm256_setzero_si256();
    for (size_t i = 0; i < size; i += 32)
    {
        //load 32-byte vector
        __m256i _src = _mm256_loadu_si256((__m256i*)(src + i));
        //get low 4 bit for every byte in vector
        __m256i lo = _mm256_and_si256(_src, F);
        //sum precalculated value from T
        _sum = _mm256_add_epi64(_sum, _mm256_sad_epu8(Z, _mm256_shuffle_epi8(T, lo)));
        //get high 4 bit for every byte in vector
        __m256i hi = _mm256_and_si256(_mm256_srli_epi16(_src, 4), F);
        //sum precalculated value from T
        _sum = _mm256_add_epi64(_sum, _mm256_sad_epu8(Z, _mm256_shuffle_epi8(T, hi)));
    }
    uint64_t sum[4];
    _mm256_storeu_si256((__m256i*)sum, _sum);
    return sum[0] + sum[1] + sum[2] + sum[3];
}

int countSetBits(unsigned int n) { 
    unsigned int n; // count the number of bits set in n
    unsigned int c; // c accumulates the total bits set in n
    for (c=0;n>0;n=n&(n-1)) c++; 
    return c; 
}

#include <bits/stdc++.h>

using namespace std;

int countOnes(int n) {
    bitset<32> b(n);
    return b.count();
}

while(n){
    n=n&(n-1);
    count++;
}

#include <bit>
#include <iostream>

int main() {
    std::cout << std::popcount(0x55) << std::endl;
}

namespace std {

  // 25.5.6, counting
  template<class T>
    constexpr int popcount(T x) noexcept;

template<class T>
  constexpr int popcount(T x) noexcept;