Algorithm 如何计算32位整数中的设置位数?
表示数字7的8位如下所示:Algorithm 如何计算32位整数中的设置位数?,algorithm,binary,bit-manipulation,hammingweight,iec10967,Algorithm,Binary,Bit Manipulation,Hammingweight,Iec10967,表示数字7的8位如下所示: 00000111 设置了三位 确定32位整数中设置位数的算法有哪些?这称为“”、“popcount”或“侧向加法” 一些CPU有一条内置指令来执行此操作,而另一些CPU有作用于位向量的并行指令。像x86这样的指令(在支持它的CPU上)对于单个整数几乎肯定是最快的。其他一些架构可能有一个使用微代码循环实现的慢指令,该循环每周期测试一位(需要引用-硬件popcount通常很快,如果它存在的话) “最佳”算法实际上取决于您使用的CPU以及您的使用模式 你的编译器可能知道如
00000111
设置了三位
确定32位整数中设置位数的算法有哪些?这称为“汉明权重”(Hamming weight)、“popcount”或“侧向加法”(sideways addition)
一些CPU有一条内置指令来执行此操作,而另一些CPU有作用于位向量的并行指令。像x86的popcnt这样的指令(在支持它的CPU上)对于单个整数几乎肯定是最快的。其他一些架构可能有一个使用微代码循环实现的慢指令,该循环每周期测试一位(需要引用——硬件popcount如果存在,通常都很快)
“最佳”算法实际上取决于您使用的CPU以及您的使用模式
您的编译器可能知道如何为正在编译的目标CPU生成合适的代码,例如C++20的std::popcount(),或者C++的std::bitset<32>::count(),作为一种可移植的方法来访问内置/内部函数(参见这个问题的其他回答)。但是,对于没有硬件popcnt的目标CPU,编译器选择的回退实现可能并不适合您的用例。或者,您的语言(例如C)可能没有公开任何可移植函数,能够在存在CPU特定popcount指令时使用它
不需要(或受益于)任何硬件支持的可移植算法 如果您的CPU有一个大的缓存,并且您在一个紧密的循环中执行大量这些操作,那么预填充的表查找方法可以非常快。但是,由于“缓存未命中”的代价(此时CPU必须从主内存中获取表的一部分),它可能会受到影响。(分别查找每个字节以保持表小。)如果希望对连续的数字范围计算popcount,则在每组256个数字中,只有低字节在变化 如果您知道您的字节大部分是0或大部分是1,那么针对这些场景有一些高效的算法,例如,在循环中使用bithack清除最低的置位(lowest set bit),直到值变为零 我相信一个非常好的通用算法如下,称为“并行”或“可变精度SWAR算法”。我用类C的伪语言表示了这一点,您可能需要调整它以适应特定语言(例如,在C++中使用uint32_t,在Java中使用>>>) GCC 10和clang 10.0可以识别此模式/习惯用法,并在可用时将其编译为硬件popcnt或等效指令,从而让您两全其美。
/*
 * Count the set bits of a 32-bit value with the parallel (SWAR) method.
 *
 * Each step adds neighbouring fields in place: 1-bit fields into 2-bit sums,
 * 2-bit into 4-bit, 4-bit into 8-bit, and finally one multiply adds the four
 * byte sums into the top byte.
 *
 * i: value to inspect.
 * Returns the number of 1 bits in i (0..32).
 */
int numberofsetbit(uint32_t i)
{
    // Java: use int, and use >>> instead of >>. Or use Integer.bitCount()
    // C or C++: use uint32_t
    i = i - ((i >> 1) & 0x55555555);                 // add pairs of bits
    i = (i & 0x33333333) + ((i >> 2) & 0x33333333);  // quads
    i = (i + (i >> 4)) & 0x0F0F0F0F;                 // groups of 8
    return (i * 0x01010101) >> 24;                   // horizontal sum of bytes
}
对于JavaScript:使用|0强制转换为整数以提高性能:
将第一行更改为 i = (i|0) - ((i >> 1) & 0x55555555);
这是所讨论的任何算法中最好的最坏情况行为,因此将有效地处理您向其抛出的任何使用模式或值。(它的性能不依赖于普通的CPU,在CPU中,包括乘法在内的所有整数运算都是常数时间。使用“简单”输入,它的速度不会更快,但仍然相当不错。)
参考资料:
SWAR bithack的工作原理:
第一步是屏蔽的优化版本,以隔离奇偶位,移动以对齐它们,然后相加。这在2位累加器中有效地进行了16次单独的加法(SWAR = SIMD Within A Register)。类似于 (i & 0x55555555) + ((i>>1) & 0x55555555)
下一步将使用这些16x 2位累加器中的奇数/偶数八个累加器,然后再次相加,生成8 x 4位和。这次不可能进行i-…
优化,因此它只会在移位前/移位后屏蔽。在编译需要在寄存器中分别构造32位常量的ISA时,在移位之前使用相同的0x33…
常量,而不是0xccc…
,这是一件好事
(i + (i >> 4)) & 0x0F0F0F0F
的最后移位和添加步骤扩展到4x 8位累加器。它在加法之后而不是之前屏蔽,因为如果设置了相应输入位的所有4位,则任何4位累加器中的最大值都是4
。4+4=8,仍然适合4位,因此在i+(i>>4)
中,半字节元素之间的进位是不可能的
到目前为止,这只是使用SWAR技术的非常普通的SIMD,并进行了一些巧妙的优化。继续使用相同的模式再执行两个步骤可以将计数扩大到2倍16位,然后是1倍32位。但在具有快速硬件乘法的机器上有一种更有效的方法:
一旦我们的“元素”足够少,一个带有魔法常数的乘法可以将所有元素相加到顶部元素中。在本例中为字节元素。乘法是通过左移和加法完成的,因此乘以x*0x01010101
会得到 x + (x<<8) + (x<<16) + (x<<24)。
为什么不迭代除以2?
count = 0
while n > 0
if (n % 2) == 1
count += 1
n /= 2
count = 0
while n > 0
if (n % 2) == 1
count += 1
n /= 2
我同意这不是最快的,但“最好”有点含糊不清。我认为“最好”应该有一个清晰的元素如果您碰巧使用Java,内置的方法Integer.bitCount
可以做到这一点。一些语言可移植地以一种可以使用有效硬件支持(如果可用)的方式公开操作,否则一些库可能会退却,这很不错
例如(来自):
- C++具有std::bitset::count()
,或
Java有Java.lang.Integer.bitCount()
(也适用于Long或BigInteger)
C#具有System.Numerics.BitOperations.PopCount()
Python具有int.bit_count()
不是所有的编译器
count = 0
while n > 0
if (n % 2) == 1
count += 1
n /= 2
static final int[] BIT_COUNT = { 0, 1, 1, ... 256 values with a bitsize of a byte ... };
/**
 * Looks up the number of set bits in the low 8 bits of {@code value}.
 *
 * @param value any int; only bits 0..7 are used
 * @return the BIT_COUNT table entry for the low byte
 */
static int bitCountOfByte( int value ){
    final int lowByte = value & 0xFF;
    return BIT_COUNT[ lowByte ];
}
/**
 * Counts the set bits of a 32-bit int by summing the per-byte counts of its
 * four bytes.
 *
 * @param value the int to inspect
 * @return sum of bitCountOfByte over the bytes at shifts 0, 8, 16 and 24
 */
static int bitCountOfInt( int value ){
    int total = 0;
    // bitCountOfByte masks to the low byte, so a signed shift is harmless.
    for ( int shift = 0; shift < 32; shift += 8 ){
        total += bitCountOfByte( value >> shift );
    }
    return total;
}
/*
 * Population count of a 32-bit unsigned value using shift-and-add folding.
 *
 * Field sums are widened from 2 bits to 4 to 8, then the four byte sums are
 * folded into the low byte with two shift-adds instead of a multiply.
 *
 * Returns the number of 1 bits in x (0..32).
 */
int pop(unsigned x)
{
    x -= (x >> 1) & 0x55555555;                      /* 2-bit sums */
    x = ((x >> 2) & 0x33333333) + (x & 0x33333333);  /* 4-bit sums */
    x = ((x >> 4) + x) & 0x0F0F0F0F;                 /* 8-bit sums */
    x += x >> 8;
    x += x >> 16;
    /* The total is at most 32, so only the low six bits are meaningful. */
    return x & 0x0000003F;
}
/*
 * Naive bit counter: examines the lowest bit, then shifts it out, until no
 * set bits remain.
 *
 * Returns the number of 1 bits in value.
 */
unsigned int bitCount (unsigned int value) {
    unsigned int ones = 0;
    for (; value != 0; value >>= 1) {
        ones += value & 1u;   /* adds 1 exactly when the low bit is set */
    }
    return ones;
}
// Lookup table for fast calculation of bits set in 8-bit unsigned char.
// Entry [i] holds the number of 1 bits in byte value i: the row label is the
// high nibble of i and the column header is the low nibble.
// NOTE: the rows between "1n" and "Fn" are elided (": : :") in this listing,
// so all 256 entries must be filled in before the table can compile.
static unsigned char oneBitsInUChar[] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F (<- n)
// =====================================================
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, // 0n
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, // 1n
: : :
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, // Fn
};
// Counts the bits set in an unsigned short by adding the table entries for
// its high part (x >> 8) and its low byte (x & 0xff).
unsigned char oneBitsInUShort (unsigned short x) {
    const int high = x >> 8;
    const int low = x & 0xff;
    return oneBitsInUChar [high] + oneBitsInUChar [low];
}
// Counts the bits set in an unsigned int by splitting it into an upper part
// (x >> 16) and a lower 16-bit part and summing oneBitsInUShort of each.
unsigned char oneBitsInUInt (unsigned int x) {
    const unsigned short upper = x >> 16;
    const unsigned short lower = x & 0xffff;
    return oneBitsInUShort (upper) + oneBitsInUShort (lower);
}
/*
 * Counts set bits four at a time: looks up the count of the low nibble in a
 * 16-entry table, then shifts that nibble out, until num is zero.
 *
 * Returns the number of 1 bits in num.
 */
int bitcount(unsigned int num){
    static int nibblebits[] =
        {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
    int total = 0;
    while (num != 0) {
        total += nibblebits[num & 0x0f];
        num >>= 4;
    }
    return total;
}
/*
 * BITCOUNT(x): number of set bits in a 32-bit value, as a single expression.
 *
 * BX_(x) turns every 4-bit nibble of x into the count of bits set in that
 * nibble (n - (n>>1) - (n>>2) - (n>>3), done on all eight nibbles at once).
 * BITCOUNT then adds adjacent nibbles into byte sums and uses "% 255" to add
 * the four bytes together.
 *
 * The argument is evaluated several times, so it must have no side effects.
 * Pass an unsigned value: the masks assume logical right shifts.
 */
#define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255)
#define BX_(x) ((x) - (((x)>>1)&0x77777777) \
                    - (((x)>>2)&0x33333333) \
                    - (((x)>>3)&0x11111111))
/* 64 KiB lookup table indexed by a 16-bit value.
 * NOTE: the initializer below is a prose placeholder, not valid C; it has to
 * be replaced by (or generated as) the 65536 actual bit counts. */
static unsigned char wordbits[65536] = { bitcounts of ints between 0 and 65535 };
/*
 * Table-driven bit count: splits i into its low 16 bits and the remaining
 * high part and adds the wordbits entries for both halves.
 */
static int popcount( unsigned int i )
{
    const unsigned int low_half = i & 0xFFFF;
    const unsigned int high_half = i >> 16;
    return( wordbits[low_half] + wordbits[high_half] );
}
// Combined population count of two 32-bit words.
// Both words go through the same fold (2-bit, 4-bit, 8-bit sums, then two
// shift-adds), with the steps for x and y interleaved; the two low-byte
// totals are added at the end.
// Returns popcount(x) + popcount(y), in the range 0..64.
inline int pop2(unsigned x, unsigned y)
{
    const unsigned pairs = 0x55555555;
    const unsigned quads = 0x33333333;
    const unsigned bytes = 0x0F0F0F0F;

    x -= (x >> 1) & pairs;
    y -= (y >> 1) & pairs;
    x = ((x >> 2) & quads) + (x & quads);
    y = ((y >> 2) & quads) + (y & quads);
    x = ((x >> 4) + x) & bytes;
    y = ((y >> 4) + y) & bytes;
    x += x >> 8;
    y += y >> 8;
    x += x >> 16;
    y += y >> 16;
    return (x + y) & 0x000000FF;
}
// Combined population count of two words held in unsigned long.
// The 64-bit masks and the final ">> 32" assume unsigned long is 64 bits wide.
// x and y are reduced to per-byte sums separately, merged while every byte is
// still at most 16, and the merged word is then folded into its low byte.
// Returns popcount(x) + popcount(y), in the range 0..128.
inline int pop2(unsigned long x, unsigned long y)
{
    x -= (x >> 1) & 0x5555555555555555;
    y -= (y >> 1) & 0x5555555555555555;
    x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333);
    y = ((y >> 2) & 0x3333333333333333) + (y & 0x3333333333333333);
    x = ((x >> 4) + x) & 0x0F0F0F0F0F0F0F0F;
    y = ((y >> 4) + y) & 0x0F0F0F0F0F0F0F0F;
    x += y;          // merge the two words before the shared fold
    x += x >> 8;
    x += x >> 16;
    x += x >> 32;
    return x & 0xFF;
}
// Combined population count of four words held in unsigned long.
// The 64-bit masks assume unsigned long is 64 bits wide.
// Words are merged pairwise as soon as the field width leaves room for the
// larger sums: x+y and u+v after the 4-bit step, then (x+y)+(u+v) after the
// 8-bit step. The last mask keeps one byte per 32-bit half before the halves
// are added.
// Returns popcount(x) + popcount(y) + popcount(u) + popcount(v), 0..256.
inline int pop4(unsigned long x, unsigned long y,
                unsigned long u, unsigned long v)
{
    enum { kPairs   = 0x5555555555555555,
           kQuads   = 0x3333333333333333,
           kBytes   = 0x0F0F0F0F0F0F0F0F,
           kLowByte = 0x000000FF000000FF };

    x -= (x >> 1) & kPairs;
    y -= (y >> 1) & kPairs;
    u -= (u >> 1) & kPairs;
    v -= (v >> 1) & kPairs;

    x = ((x >> 2) & kQuads) + (x & kQuads);
    y = ((y >> 2) & kQuads) + (y & kQuads);
    u = ((u >> 2) & kQuads) + (u & kQuads);
    v = ((v >> 2) & kQuads) + (v & kQuads);

    x += y;          // nibbles now hold up to 8
    u += v;

    x = ((x >> 4) & kBytes) + (x & kBytes);
    u = ((u >> 4) & kBytes) + (u & kBytes);

    x += u;          // bytes now hold up to 32
    x += x >> 8;
    x += x >> 16;
    x &= kLowByte;
    x += x >> 32;
    // The total can reach 256, which needs nine bits.
    return x & 0x000001FF;
}
// Timing fragment: takes an rdtsc() reading before and after a loop of
// pop4() calls and accumulates the results into sum.
// NOTE(review): hitime, rdtsc and sum are not defined in this fragment;
// presumably a tick-count type, a cycle-counter helper and an accumulator
// declared elsewhere. Confirm before reuse.
hitime b4 = rdtsc();
// i runs over [10e9, 11e9); the 10L / 11L factors make the bounds long.
for (unsigned long i = 10L * 1000*1000*1000; i < 11L * 1000*1000*1000; ++i)
sum += pop4 (i, i^1, ~i, i|1);
hitime e4 = rdtsc();
count = 0
while n != 0
if ((n % 2) == 1 || (n % 2) == -1)
count += 1
n /= 2
return count
/*
 * Count the set bits in num by repeatedly clearing the lowest set bit.
 *
 * The loop runs once per set bit. The work is done on an unsigned copy of
 * the bit pattern: with a signed operand, `num - 1` overflows (undefined
 * behaviour) when num is INT_MIN.
 *
 * Returns the number of 1 bits in num's representation.
 */
int bit_count(int num)
{
    unsigned int bits = (unsigned int)num;
    int count = 0;

    while (bits != 0) {
        bits &= bits - 1;   /* clears the lowest set bit */
        count++;
    }
    return count;
}
/*
 * BitCount(X, Y): store in Y the number of bits set in the 32-bit value X.
 *
 * Octal-mask variant: the first line turns each 3-bit group of X into the
 * count of bits set in that group; the remaining lines add neighbouring
 * groups into 6-bit fields and then sum those fields into the low six bits.
 *
 * Both arguments are parenthesized so expressions such as `a | b` can be
 * passed, and the body is wrapped in do { } while (0) so the macro acts as
 * one statement (e.g. under an unbraced `if`). Invoke it with a trailing
 * semicolon. X is evaluated more than once, so it must have no side effects;
 * Y must be an assignable unsigned object.
 */
#define BitCount(X,Y) \
    do { \
        (Y) = (X) - (((X) >> 1) & 033333333333) - (((X) >> 2) & 011111111111); \
        (Y) = (((Y) + ((Y) >> 3)) & 030707070707); \
        (Y) = ((Y) + ((Y) >> 6)); \
        (Y) = ((Y) + ((Y) >> 12) + ((Y) >> 24)) & 077; \
    } while (0)
input output
AB CD Note
00 00 = AB
01 01 = AB
10 01 = AB - (A >> 1) & 0x1
11 10 = AB - (A >> 1) & 0x1
// Other bit-level helpers on java.lang.Integer; the results are discarded
// here, so these four calls are shown for illustration only.
Integer.highestOneBit(n);
Integer.lowestOneBit(n);
Integer.numberOfLeadingZeros(n);
Integer.numberOfTrailingZeros(n);
//Beginning with the value 1, rotate left 16 times
n = 1;
for (int i = 0; i < 16; i++) {
// Rotate by one bit position and print the value after each rotation.
n = Integer.rotateLeft(n, 1);
System.out.println(n);
}
// Three alternative ways to count bits using 64-bit multiply/mask/modulus.
// Each option starts with "c = ...", so they are alternatives, not steps to
// run in sequence. v is declared but never assigned in this fragment; the
// caller is expected to supply it. The ULL constants force the arithmetic to
// be carried out in unsigned long long.
unsigned int v; // count the number of bits set in v
unsigned int c; // c accumulates the total bits set in v
// option 1, for at most 14-bit values in v:
c = (v * 0x200040008001ULL & 0x111111111111111ULL) % 0xf;
// option 2, for at most 24-bit values in v:
// (handles v in two 12-bit pieces: bits 0..11, then bits 12..23)
c = ((v & 0xfff) * 0x1001001001001ULL & 0x84210842108421ULL) % 0x1f;
c += (((v & 0xfff000) >> 12) * 0x1001001001001ULL & 0x84210842108421ULL)
% 0x1f;
// option 3, for at most 32-bit values in v:
// (same as option 2 plus a third term for the remaining bits, v >> 24)
c = ((v & 0xfff) * 0x1001001001001ULL & 0x84210842108421ULL) % 0x1f;
c += (((v & 0xfff000) >> 12) * 0x1001001001001ULL & 0x84210842108421ULL) %
0x1f;
c += ((v >> 24) * 0x1001001001001ULL & 0x84210842108421ULL) % 0x1f;
// Sums the lowest BITS bits of val through template recursion.
// Each level peels off the least significant bit and delegates the rest to
// the instantiation with one bit fewer.
template <int BITS>
int countBits(int val) {
    const int lowest = val & 0x1;
    const int rest = val >> 1;
    return lowest + countBits<BITS - 1>(rest);
}
// Base case: with a single bit left, the count is that bit itself.
template<>
int countBits<1>(int val) {
    return val & 0x1;
}
// to count bits in a byte/char (this returns 8)
countBits<8>( 255 )
// another byte (this returns 7)
countBits<8>( 254 )
// counting bits in a word/short (this returns 1)
countBits<16>( 256 )
/*
 * Count the set bits in n by clearing the lowest set bit until none remain.
 *
 * The work is done on an unsigned copy: the signed expression `n - 1`
 * overflows (undefined behaviour) for INT_MIN. A loop replaces the
 * original recursion, so no stack frame is spent per set bit.
 *
 * Returns the number of 1 bits in n's representation.
 */
int countSetBits(int n) {
    unsigned int remaining = (unsigned int)n;
    int count = 0;

    while (remaining != 0) {
        remaining &= remaining - 1;   /* drops the lowest set bit */
        ++count;
    }
    return count;
}
/*
 * Divide-and-conquer bit count for a 32-bit value.
 *
 * Each pass adds neighbouring fields of width `shift` into fields twice as
 * wide (1 -> 2 -> 4 -> 8 -> 16 -> 32 bits), masking both operands so no
 * field can spill into its neighbour.
 *
 * Returns the number of 1 bits in x.
 */
unsigned int count_bit(unsigned int x)
{
    static const unsigned int field_masks[5] = {
        0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF
    };
    unsigned int shift = 1;
    unsigned int step;

    for (step = 0; step < 5; ++step) {
        const unsigned int mask = field_masks[step];
        x = (x & mask) + ((x >> shift) & mask);
        shift <<= 1;
    }
    return x;
}
+-------------------------------+
| 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | <- x
| 1 0 | 0 1 | 0 1 | 0 1 | <- first time merge
| 0 0 1 1 | 0 0 1 0 | <- second time merge
| 0 0 0 0 0 1 0 1 | <- third time ( answer = 00000101 = 5)
+-------------------------------+
/*
 * Recursive bit count working two bits at a time.
 *
 * Values below 4 are answered from a small table; anything larger is split
 * into its low two bits (x % 4) and the remaining high part (x / 4), and the
 * two counts are added.
 */
unsigned int f(unsigned int x)
{
    static const unsigned int two_bit_counts[4] = { 0, 1, 1, 2 };

    if (x < 4) {
        return two_bit_counts[x];
    }
    return f(x / 4) + f(x % 4);
}
/*
 * Parallel (SWAR) population count of a 32-bit int.
 *
 * The arithmetic is done on an unsigned copy of v: with a signed operand the
 * first subtraction and the final multiplication can overflow, which is
 * undefined behaviour. The original also assigned the result to `c`, which
 * was never declared; the value is now returned directly.
 *
 * Assumes int/unsigned int are 32 bits wide, as the masks already imply.
 *
 * Returns the number of 1 bits in v's representation (0..32).
 */
int popcount(int v) {
    unsigned int u = (unsigned int)v;

    u = u - ((u >> 1) & 0x55555555u);                  /* count of each 2 bits into those 2 bits */
    u = (u & 0x33333333u) + ((u >> 2) & 0x33333333u);  /* count of each 4 bits into those 4 bits */
    u = (u + (u >> 4)) & 0x0F0F0F0Fu;                  /* count of each 8 bits into those 8 bits */
    /* The multiply adds all four byte counts into the top byte. */
    return (int)((u * 0x01010101u) >> 24);
}
v = v - ((v >> 1) & 0x55555555);
---------------------------------------------
| v | (v >> 1) & 0b0101 | v - x |
---------------------------------------------
0b00 0b00 0b00
0b01 0b00 0b01
0b10 0b01 0b01
0b11 0b01 0b10
v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
v & 0b00110011 //masks out even two bits
(v >> 2) & 0b00110011 // masks out odd two bits
c = ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
v + (v >> 4)
0b01000010 + 0b01000000
0b01100010 & 0xF0 = 0b01100000
/*
 * Divide-and-conquer bit count for a 32-bit value.
 *
 * Each line adds the upper and lower halves of every field, doubling the
 * field width (1, 2, 4, 8, 16 bits) until a single 32-bit sum is left.
 * The masks are the hexadecimal forms of the alternating bit patterns.
 *
 * Returns the number of 1 bits in x.
 */
unsigned int bitCount(unsigned int x)
{
    const unsigned int every_other_bit  = 0x55555555;  /* 0101... */
    const unsigned int every_other_pair = 0x33333333;  /* 0011... */
    const unsigned int low_nibbles      = 0x0F0F0F0F;
    const unsigned int low_bytes        = 0x00FF00FF;
    const unsigned int low_half         = 0x0000FFFF;

    x = (x & every_other_bit)  + ((x >> 1)  & every_other_bit);
    x = (x & every_other_pair) + ((x >> 2)  & every_other_pair);
    x = (x & low_nibbles)      + ((x >> 4)  & low_nibbles);
    x = (x & low_bytes)        + ((x >> 8)  & low_bytes);
    x = (x & low_half)         + ((x >> 16) & low_half);
    return x;
}
/*
 * Count the set bits in n; the loop body runs once per set bit.
 *
 * `(n - 1) & n` clears the lowest set bit. It is evaluated on an unsigned
 * copy because the signed `n - 1` overflows (undefined behaviour) when n is
 * INT_MIN.
 *
 * Returns the number of 1 bits in n's representation.
 */
int NumberOfSetBits(int n)
{
    unsigned int bits = (unsigned int)n;
    int count = 0;

    while (bits != 0) {
        ++count;
        bits = (bits - 1) & bits;
    }
    return count;
}
/// <summary>
/// Table-driven population count for 32-bit unsigned integers.
/// </summary>
public static class BitCount
{
    /// <summary>
    /// Returns the number of set bits in <paramref name="n"/> by summing the
    /// table entries of its bytes, stopping once the remaining high bytes
    /// are all zero.
    /// </summary>
    /// <param name="n">Value to inspect.</param>
    /// <returns>Number of 1 bits in <paramref name="n"/> (0..32).</returns>
    public static uint GetSetBitsCount(uint n)
    {
        uint[] table = BYTE_BIT_COUNTS;
        uint total = table[n & 0xff];
        for (uint rest = n >> 8; rest != 0; rest >>= 8)
        {
            total += table[rest & 0xff];
        }
        return total;
    }

    /// <summary>
    /// Number of set bits for every byte value 0..255, indexed by the byte.
    /// </summary>
    public static readonly uint[] BYTE_BIT_COUNTS =
    {
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
    };
}
#include <smmintrin.h>
#include <stdint.h>
// Z: vector with every byte zero (second operand for the byte-sum step).
const __m128i Z = _mm_set1_epi8(0x0);
// F: vector with every byte 0x0F, used to isolate one nibble per byte.
const __m128i F = _mm_set1_epi8(0xF);
//Vector with pre-calculated bit count:
// (entry k is the number of set bits in the 4-bit value k, for k = 0..15)
const __m128i T = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
// Adds the per-byte bit counts of `v` to the two 64-bit lanes of `acc`.
// Each nibble is looked up in T with a byte shuffle, and the looked-up
// counts are summed per 8-byte half with a sum-of-absolute-differences
// against the zero vector Z.
static inline __m128i BitCountAccumulate128(__m128i acc, __m128i v)
{
    //get low 4 bit for every byte in vector
    const __m128i lo = _mm_and_si128(v, F);
    acc = _mm_add_epi64(acc, _mm_sad_epu8(Z, _mm_shuffle_epi8(T, lo)));
    //get high 4 bit for every byte in vector
    const __m128i hi = _mm_and_si128(_mm_srli_epi16(v, 4), F);
    acc = _mm_add_epi64(acc, _mm_sad_epu8(Z, _mm_shuffle_epi8(T, hi)));
    return acc;
}

// Counts the set bits in the `size` bytes starting at `src`.
//
// Whole 16-byte chunks are loaded directly (unaligned loads). A trailing
// partial chunk is first copied into a zero-padded local buffer, so the
// function never reads past src + size; the original loop loaded a full
// vector even when fewer than 16 bytes remained. The accumulator is created
// with _mm_setzero_si128 (the previously used _mm128_setzero_si128 is not an
// existing intrinsic).
//
// src:  first byte of the buffer (not read when size is 0).
// size: number of bytes to inspect.
// Returns the total number of 1 bits in the buffer.
uint64_t BitCount(const uint8_t * src, size_t size)
{
    __m128i _sum = _mm_setzero_si128();
    const size_t whole = size - size % 16;   // bytes covered by full vectors
    size_t i = 0;
    for (; i < whole; i += 16)
    {
        //load 16-byte vector
        _sum = BitCountAccumulate128(_sum, _mm_loadu_si128((const __m128i*)(src + i)));
    }
    if (i < size)
    {
        // Zero padding contributes no set bits to the total.
        uint8_t tail[16] = { 0 };
        for (size_t k = 0; k < size - i; ++k)
            tail[k] = src[i + k];
        _sum = BitCountAccumulate128(_sum, _mm_loadu_si128((const __m128i*)tail));
    }
    uint64_t sum[2];
    _mm_storeu_si128((__m128i*)sum, _sum);
    return sum[0] + sum[1];
}
#include <immintrin.h>
#include <stdint.h>
// Z: vector with every byte zero (second operand for the byte-sum step).
const __m256i Z = _mm256_set1_epi8(0x0);
// F: vector with every byte 0x0F, used to isolate one nibble per byte.
const __m256i F = _mm256_set1_epi8(0xF);
//Vector with pre-calculated bit count:
// (the 16-entry nibble table is written out twice, once per 16-byte half)
const __m256i T = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
// Adds the per-byte bit counts of `v` to the four 64-bit lanes of `acc`.
// Each nibble is looked up in T with a byte shuffle, and the looked-up
// counts are summed per 8-byte group with a sum-of-absolute-differences
// against the zero vector Z.
static inline __m256i BitCountAccumulate256(__m256i acc, __m256i v)
{
    //get low 4 bit for every byte in vector
    const __m256i lo = _mm256_and_si256(v, F);
    acc = _mm256_add_epi64(acc, _mm256_sad_epu8(Z, _mm256_shuffle_epi8(T, lo)));
    //get high 4 bit for every byte in vector
    const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), F);
    acc = _mm256_add_epi64(acc, _mm256_sad_epu8(Z, _mm256_shuffle_epi8(T, hi)));
    return acc;
}

// Counts the set bits in the `size` bytes starting at `src`.
//
// Whole 32-byte chunks are loaded directly (unaligned loads). A trailing
// partial chunk is first copied into a zero-padded local buffer, so the
// function never reads past src + size; the original loop loaded a full
// vector even when fewer than 32 bytes remained.
//
// src:  first byte of the buffer (not read when size is 0).
// size: number of bytes to inspect.
// Returns the total number of 1 bits in the buffer.
uint64_t BitCount(const uint8_t * src, size_t size)
{
    __m256i _sum = _mm256_setzero_si256();
    const size_t whole = size - size % 32;   // bytes covered by full vectors
    size_t i = 0;
    for (; i < whole; i += 32)
    {
        //load 32-byte vector
        _sum = BitCountAccumulate256(_sum, _mm256_loadu_si256((const __m256i*)(src + i)));
    }
    if (i < size)
    {
        // Zero padding contributes no set bits to the total.
        uint8_t tail[32] = { 0 };
        for (size_t k = 0; k < size - i; ++k)
            tail[k] = src[i + k];
        _sum = BitCountAccumulate256(_sum, _mm256_loadu_si256((const __m256i*)tail));
    }
    uint64_t sum[4];
    _mm256_storeu_si256((__m256i*)sum, _sum);
    return sum[0] + sum[1] + sum[2] + sum[3];
}
/*
 * Count the bits set in n by clearing the lowest set bit until n is zero;
 * the loop runs once per set bit.
 *
 * The original body declared a second `unsigned int n`, which clashes with
 * the parameter of the same name (a redeclaration error in C) and would have
 * left the value uninitialised; that declaration is removed.
 *
 * Returns the number of 1 bits in n.
 */
int countSetBits(unsigned int n) {
    unsigned int c = 0; /* c accumulates the total bits set in n */

    for (; n != 0; n &= n - 1) {
        c++;
    }
    return (int)c;
}
#include <bits/stdc++.h>
using namespace std;
// Counts the set bits in the low 32 bits of n by loading them into a
// 32-bit bitset and asking it for its number of set positions.
int countOnes(int n) {
    return bitset<32>(n).count();
}
// Each pass clears the lowest set bit of n, so the body runs once per set
// bit and count ends up increased by the number of bits that were set.
// n and count are declared outside this fragment.
while(n){
n=n&(n-1);
count++;
}
#include <bit>
#include <iostream>
// Prints the number of set bits in 0x55 (binary 01010101), i.e. 4.
// std::popcount only accepts unsigned integer types, so the literal carries
// a `u` suffix; a plain int argument does not compile.
int main() {
    std::cout << std::popcount(0x55u) << std::endl;
}
namespace std {
// 25.5.6, counting
template<class T>
constexpr int popcount(T x) noexcept;
template<class T>
constexpr int popcount(T x) noexcept;