Optimization 统计[1..N]中前导1以下恰好有K个零位的整数个数(在没有硬件POPCNT时对连续范围做popcount)

Optimization 计算[1..N]中的整数,其中K个零位在前导1以下?(不带HW POPCNT的连续范围的POPCUNT),optimization,x86,nasm,micro-optimization,hammingweight,Optimization,X86,Nasm,Micro Optimization,Hammingweight,我有以下任务: 计算1和N之间有多少个数字正好有K个非前导位。(例如,710=1112将有0个,4个将有2个) N和K满足条件0≤ K、 N≤ 100000000 此版本使用POPCNT,在我的机器上速度足够快: %include "io.inc" section .bss n resd 1 k resd 1 ans resd 1 section .text global CMAIN CMAIN: GET_DEC 4,n GET_DE

我有以下任务: 计算1和N之间有多少个数字恰好有K个非前导零位。(例如,7₁₀ = 111₂ 有0个非前导零位,而4₁₀ = 100₂ 有2个)

N和K满足条件0≤ K、 N≤ 100000000

此版本使用POPCNT,在我的机器上速度足够快:

%include "io.inc"

section .bss
    n   resd 1                  ; upper bound N (read at runtime)
    k   resd 1                  ; required number of non-leading zero bits
    ans resd 1                  ; answer accumulator (.bss is zero-initialized)
section .text
global CMAIN
;---------------------------------------------------------------------
; Count integers x in [1..N] whose number of zero bits below the
; leading 1 equals K, i.e.  32 - clz(x) - popcnt(x) == K.
; Requires a CPU with the POPCNT instruction.
;---------------------------------------------------------------------
CMAIN:
    GET_DEC 4,n
    GET_DEC 4,k
    mov ecx,1                   ; ecx = current number, counts 1..N
    cmp ecx,dword[n]
    ja  print_ans               ; N == 0: empty range, ans stays 0

loop_:
    mov eax, ecx
    popcnt eax,eax              ; eax = popcount(x) = number of set bits

    mov edx, 32
    sub edx, eax                ; edx = 32 - popcount(x) = total zero bits

    mov eax, ecx
    bsr eax, eax                ; eax = bit index of leading 1 = 31 - clz(x); defined since ecx >= 1
    xor eax, 0x1f               ; eax = clz(x)
    sub edx, eax                ; edx = 32 - clz(x) - popcount(x) = non-leading zero bits
    mov eax, edx
    ; equivalent C (gcc):
    ; eax = 32 - __builtin_clz(x) - _mm_popcnt_u32(x);

    cmp eax,[k]                 ; does x have exactly K non-leading zero bits?
    jnz notk

    mov edx,[ans]               ; yes: ans++
    add edx,1
    mov [ans],edx
notk:
    inc ecx                     ; next number; loop while ecx <= N (unsigned)
    cmp ecx,dword[n]
    jna loop_

print_ans:
    PRINT_DEC 4,ans
    xor  eax, eax               ; return 0
    ret
就速度而言(约0.8秒),这应该是可以接受的,但它并没有被接受,因为(我猜)测试服务器上使用的CPU太旧,所以它表明发生了运行时错误

我尝试对64K*4字节的查找表使用预计算技巧,但速度不够快:

%include "io.inc"
section .bss
    n resd 1
    k resd 1
    ans resd 1
    wordbits resd 65536         ; wordbits[i] = popcount(i) for i in 0..65535
section .text
global CMAIN
;---------------------------------------------------------------------
; Same task as the POPCNT version, but popcount is computed with a
; 64K-entry lookup table so it runs on CPUs without POPCNT.
;---------------------------------------------------------------------
CMAIN:
    mov ebp, esp                ; for debugging only; EBP is reused as the answer below
    mov ecx,0                   ; ecx = table index 0..65535
precount_:
    mov eax,ecx
    xor ebx,ebx
    ; Kernighan popcount:
    ;   for (c = 0; v; c++) v &= v - 1;  // clear lowest set bit
lloop_:
    mov edx,eax
    dec edx
    and eax,edx
    inc ebx
    test eax,eax
    jnz lloop_

    mov dword[wordbits+4*ecx],ebx   ; store popcount(ecx)

    inc ecx
    cmp ecx,65536
    jb precount_                ; bugfix: `jna` ran one extra iteration and stored out of bounds at wordbits[65536]

    ; the do-while above computed 1 for index 0; popcount(0) is 0
    mov dword[wordbits],0

    GET_DEC 4,edi               ; edi = N
    GET_DEC 4,esi               ; esi = K

    mov ecx,1                   ; ecx = current number, 1..N
    xor ebp,ebp                 ; ebp = answer
    cmp ecx,edi
    ja  print_ans               ; N == 0: empty range, answer is 0

loop_:
    mov eax, ecx
    mov edx,ecx
    and eax,0xFFFF
    shr edx,16
    mov eax,dword[wordbits+4*eax]
    add eax,dword[wordbits+4*edx]
    ; eax = wordbits[x & 0xFFFF] + wordbits[x >> 16] = popcount(x)
    mov edx, 32
    sub edx, eax                ; edx = 32 - popcount(x) = total zero bits
    mov eax, ecx
    bsr eax, eax                ; eax = 31 - clz(x); defined since ecx >= 1
    xor eax, 0x1f               ; eax = clz(x)
    sub edx, eax                ; edx = 32 - clz(x) - popcount(x) = non-leading zero bits

    cmp edx, esi                ; does x have exactly K non-leading zero bits?
    jnz notk

    mov edx, ebp                ; yes: answer++
    add edx, 1
    mov ebp, edx
notk:
    inc ecx
    cmp ecx, edi                ; loop while ecx <= N (unsigned)
    jna loop_

print_ans:
    PRINT_DEC 4, ebp            ; print the answer held in EBP
    xor  eax, eax               ; return 0
    ret
(>1秒)


我应该加快第二个程序的速度(如果是,那么怎么做?)还是以某种方式用其他(哪些?)指令替换POPCNT(我想SSE2和更早的指令应该是可用的)?

首先,一个太旧而没有
POPCNT
的服务器在其他方面也会明显更慢,并且有不同的瓶颈。考虑到它有pshufb但没有popcnt,它应该是第一代或第二代Core 2(Conroe或Penryn)。请参阅Agner Fog的微架构(microarch)PDF(agner.org/optimize)。它的时钟频率也更低,因此您在该CPU上所能做的最好优化,可能仍不足以让蛮力方法通过

可能有一些算法改进可以节省大量时间,比如注意每4个增量通过00、01、10、11模式循环低2位:每4个增量2个零发生一次,1个零发生两次,没有零发生一次。对于每一个>=4的数字,这2位低于前导位,因此是计数的一部分。将其推广到1和log2(N)之间的每个MSB位置的组合数学公式中,可能会大大减少工作量。处理2^M和N之间的数字不太明显


此处的版本:

  • 已清理popcnt版本,i7-6700k@3.9GHz上536ms,迭代期间无算法优化。对于k=8,N=100000000
  • 朴素的LUT版本(每次迭代加载2次,无迭代间优化):良好运行时约595毫秒,对于k=8,N=100000000,更常见的情况是约610毫秒。Core2Duo(Conroe)@2.4GHz:1.69秒。(在编辑历史记录中有几个更糟糕的版本,第一个版本在核心2上有部分寄存器暂停。)
  • 未完成,未编写清理代码)优化LUT版本(展开,高半/MSB BSR工作提升,每次迭代只留下1次查找(cmp/jcc),Skylake上210毫秒,Core 2上0.58秒@2.4GHz。时间应该是现实的;我们完成了所有的工作,只是错过了MSB处于低位16的最后2^16次迭代。在外循环中处理任何必要的拐角情况以及清理,对速度的影响不应超过1%
  • (更未完成):使用
    pcmpeqb
    /
    psubb
    (使用
    psadbw
    在外循环中对优化的LUT版本进行矢量化,如图所示-内循环减少到在固定大小的数组中对匹配在外循环中计算的值的字节元素进行计数。就像标量版本一样)。天湖18毫秒,核心2约0.036秒。这些时间现在可能包括相当大的启动开销。但正如预期/希望的那样,两者的速度都快了16倍
  • 对 wordbits
    表做一次直方图统计(可以在生成该表时顺便完成)
    。与其在64kiB中搜索匹配的字节,不如直接为每次外循环迭代查出答案!这将使您在大N时的速度提高数千倍(尽管您仍然需要处理低1..64K的部分,以及N不是64K倍数时的部分范围)
为了有效地测量更快的版本,您可以在整个过程中重复循环,这样整个过程仍然需要一些可测量的时间,比如半秒钟。(因为它是asm,没有编译器会通过重复执行相同的N,k来优化工作。)或者,如果您知道TSC频率,您可以在程序内部使用
rdtsc
进行计时。但是能够在整个过程中轻松地使用
perf stat
,这很好,所以我会继续这样做(取出printf并生成一个静态可执行文件,以进一步减少启动开销)


你似乎在问关于微观优化蛮力方法的问题,该方法仍然单独检查每个数字。(不过,对于如何实现
32-clz-popcnt==k
有一些重要的优化。)

还有其他通常更快的popcnt方法,例如像中的Bithack。但是,当您在一个紧密循环中有很多popcounting要做时(足以使查找表在缓存中保持热状态),LUT可能是好的

如果您有fast SSSE3
pshufb
,则值得使用它在XMM寄存器(自动矢量化循环)中并行对四个DWORD执行SIMD popcount,或者在带有AVX2的YMM寄存器中执行更好的功能。(第一代Core2拥有
pshufb
,但在第二代Core2之前,它不是单一uop。可能仍然值得一试。)

或者更好的方法是,使用SIMD来计算与我们所寻找的匹配的LUT元素,即给定的高半个数字


强力检查连续的数字范围为LUT策略打开了一个主要的优化:数字的上n位每2^n增量只更改一次。因此,您可以将这些位的计数从最内部的循环中提升出来。这也使得使用较小的表(适合L1d缓存)变得值得

说到这里,您的64k*4表是256KiB,即二级缓存的大小。这意味着它可能必须在每次循环通过它时从L3进入。您的桌面CPU应该有足够的L3带宽(由于增量,访问模式是连续的),现代服务器有更大的L2,但几乎没有理由不使用字节LUT(popcnt(-1)只有32),而
movzx
字节加载与
mov
dword加载一样便宜

; General LUT lookup with two 16-bit halves (byte table: popcnt(-1) is only 32, fits in a byte)
    movzx  edx, cx            ; edx = x & 0xFFFF (low 16 bits)
    mov    eax, ecx
    shr    eax, 16            ; eax = x >> 16 (high 16 bits)
    movzx  edx, byte [wordbits + edx]   ; popcount of the low half; movzx byte load is as cheap as a dword mov
    add     dl,      [wordbits + eax]   ; dl += popcount of the high half
      ; no partial-reg stall for reading EDX after this, on Intel Sandybridge and later
      ; on Core 2, set up so you can cmp al,dl later to avoid it
在英特尔CPU上,因此
%use SMARTALIGN
alignmode p6, 64

section .bss
 wordbits: resb 65536          ; byte LUT: wordbits[i] = popcount(i) for i in 0..65535
;    n resd 1
;    k resd 1
    ans resd 1
section .rodata
  n: dd 1000000000
  k: dd 8
  print_fmt: db `ans: %d\n`, 0

section .text

global main
main:            ; no-popcnt version: naive per-iteration byte-LUT popcount
    push  ebp
    push  edi    ; save some call-preserved registers
    push  esi
    push  ebx

    mov   edi, wordbits
%define wordbits edi             ; dirty hack, use indexed addressing modes instead of reg+disp32.
                                 ; Avoids Skylake JCC erratum problems, and is slightly better on Core2 with good instruction scheduling
    ;fill in wordbits, ecx is wordbits array index
    mov   ecx, 1     ; leave wordbits[0] = 0 (.bss is zero-filled)
.init_loop:
    mov   eax,ecx
    xor   ebx,ebx
  .popc_loop:
      lea   edx, [eax-1]
      inc   ebx
      and   eax,edx         ; v &= v - 1; // blsr: clear lowest set bit (Kernighan popcount)
      jnz  .popc_loop

    ;computed bits set: store popcount(ecx) as a byte
    mov [wordbits + ecx], bl

    inc ecx
    cmp ecx,65536
    jb .init_loop       ; bugfix: array out of bounds with jna: stores to wordbits[65536]


;    GET_DEC 4,n
;    GET_DEC 4,k
    mov   ecx, [n]      ; ecx counts from n down to 1
;    mov   esi, [k]
    xor   ebx, ebx      ; ebx = ans

    mov   esi, 1
    sub   esi, [k]      ; esi = 1-k, folded into the BSR result so the loop compares al vs dl directly
align 32
.loop:
    ;popcnt eax, ecx
    movzx  eax, cx
    mov    ebp, ecx         ; using an extra register (EBP) to schedule instructions better(?) for Core2 decode
    movzx  edx, byte [wordbits + eax]
    shr    ebp, 16
;    xor eax, eax        ; break false dependency, or just let OoO exec hide it after breaking once per iter
    bsr    eax, ecx         ; eax = 31-lzcnt for non-zero ecx
;    sub    edx, esi         ; sub now avoids partial-reg stuff.  Could have just used EBX to allow BL.
    add eax, esi           ; Add to BSR result seems slightly better on Core2 than sub from popcnt
    add     dl,      [wordbits + ebp]   ; we don't read EDX, no partial-register stall even on P6-family

                        ;; want: k == 32-__builtin_clz(x)-_mm_popcnt_u32(x)
    cmp  al, dl         ; 31-clz+(1-k)  == popcount.  or  31-clz == popcnt - (1-k)
    je .yesk          ; not-taken is the more common fast path
 .done_inc:
    dec ecx
    jnz .loop         ; }while(--n >= 0U)

.print_and_exit:
    ;print ans (held in EBX) via C printf
;    PRINT_DEC 4,ans
    push  ebx
    push  print_fmt
extern printf
    call  printf
    add   esp, 8            ; pop the two printf args

    pop  ebx                ; restore call-preserved registers in reverse push order
    pop  esi
    pop  edi
    pop  ebp
    xor  eax, eax           ; return 0
    ret

align 8
.yesk:
   inc  ebx
;   jmp  .done_inc           ; tail duplication is a *tiny* bit faster
   dec  ecx                  ; duplicated loop tail: avoids a jmp back to .done_inc
   jnz  .loop
   jmp  .print_and_exit
# Results from version 1, not the Core2-friendly version.
# Version 3 sometimes runs this fast, but more often ~610ms
# Event counts are near identical for both, except cycles, but uops_issue and executed are mysteriously lower, like 9,090,858,203 executed.
$ nasm -felf32 foo.asm -l/dev/stdout &&
    gcc -m32 -no-pie -fno-pie -fno-plt foo.o 
$ taskset -c 3 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,branches,branch-misses,instructions,uops_issued.any,uops_executed.thread -r 2 ./a.out 
ans: 12509316
ans: 12509316

 Performance counter stats for './a.out' (2 runs):

            597.78 msec task-clock                #    0.999 CPUs utilized            ( +-  0.12% )
                 0      context-switches          #    0.000 K/sec                  
                 0      cpu-migrations            #    0.000 K/sec                  
                62      page-faults               #    0.103 K/sec                    ( +-  0.81% )
     2,328,038,096      cycles                    #    3.894 GHz                      ( +-  0.12% )
     2,000,637,322      branches                  # 3346.789 M/sec                    ( +-  0.00% )
         1,719,724      branch-misses             #    0.09% of all branches          ( +-  0.02% )
    11,015,217,584      instructions              #    4.73  insn per cycle           ( +-  0.00% )
     9,148,164,159      uops_issued.any           # 15303.609 M/sec                   ( +-  0.00% )
     9,102,818,982      uops_executed.thread      # 15227.753 M/sec                   ( +-  0.00% )

   (from a separate run):
      9,204,430,548      idq.dsb_uops              # 15513.249 M/sec                   ( +-  0.00% )
         1,008,922      idq.mite_uops             #    1.700 M/sec                    ( +- 20.51% )


          0.598156 +- 0.000760 seconds time elapsed  ( +-  0.13% )
; Cleaned-up POPCNT inner loop (excerpt: prologue / table init / print are the
; same as the LUT version; ESI = k, EBX = answer, ECX counts N down to 1).
align 32
.loop:
    mov    eax, ecx
    popcnt eax,eax
    lea    edx, [dword eax - 32 + 31]  ; popcnt - 32  =  -(bits not set); +31 pre-combines with the BSR result below
                   ; dword displacement pads the cmp/jnz location to avoid the JCC erratum penalty on Intel

;    xor eax, eax         ; break false dependency, or just let OoO exec hide it after breaking once per iter
    bsr eax, ecx         ; eax = 31-lzcnt (BSR defined since ecx >= 1)
;    xor eax, 0x1f        ; eax = lzcnt (for non-zero x)
    ; want:  32-__builtin_clz(x)-_mm_popcnt_u32(x)  = (31-clz) + 1-popcnt = (31-clz) - (popcnt-1)
    sub eax, edx

    cmp eax, esi  ;is there k non-leading bits in ecx?
%if 0
    jnz .notk
    inc ebx       ;if so, then increment ans
.notk:
%else
    jz .yesk      ; not-taken is the more common fast path
 .done_inc:
%endif
    dec ecx
    jnz .loop   ; }while(--n >= 0U)
    
    ;print ans (in EBX) via C printf
;    PRINT_DEC 4,ans
    push  ebx
    push  print_fmt
extern printf
    call  printf
    add   esp, 8

    pop  ebx
    pop  esi
    xor  eax, eax
    ret

.yesk:
   inc  ebx
   jmp  .done_inc         ;; TODO: tail duplication
%use SMARTALIGN
alignmode p6, 64

section .bss
align 4096
 wordbits: resb 65536          ; byte LUT: wordbits[i] = popcount(i)
;    n resd 1
;    k resd 1
;    ans resd 1
section .rodata
  ;n: dd 0x40000000        ; low half zero, maybe useful to test correctness for a version that doesn't handle that.
  n:  dd 1000000000 ; = 0x3b9aca00
  k: dd 8
  print_fmt: db `ans: %d\n`, 0

section .text
global main

; Optimized LUT version: BSR and the high-half popcount are hoisted out of
; the inner loop, which does only 1 cmp/jcc LUT lookup per number (unrolled x4).
align 16
main:
main_1lookup:
    push  ebp
    push  edi    ; save some call-preserved registers
    push  esi
    push  ebx

    mov   edi, wordbits
;%define wordbits edi             ; dirty hack, use indexed addressing modes instead of reg+disp32.
                                 ; actually slightly worse on Skylake: causes un-lamination of cmp bl, [reg+reg],
                                 ; although the front-end isn't much of a bottleneck anymore
                                 ; also seems pretty much neutral to use disp32+reg on Core 2, maybe reg-read stalls or just not a front-end bottleneck
    ;fill in wordbits, ecx is wordbits array index
    mov   ecx, 1     ; leave wordbits[0] = 0
.init_loop:
    mov   eax,ecx
    xor   ebx,ebx
  .popc_loop:
      lea   edx, [eax-1]
      inc   ebx
      and   eax,edx         ; v &= v - 1; // blsr: clear lowest set bit
      jnz  .popc_loop

    ;computed bits set
    mov [wordbits + ecx], bl

    inc ecx
    cmp ecx,65536
    jb .init_loop


;    GET_DEC 4,n
;    GET_DEC 4,k
    mov   ecx, [n]      ; ecx counts from n down to 1
;    mov   esi, [k]
    xor   esi, esi      ; esi = ans

    mov   ebp, 1
    sub   ebp, [k]      ; ebp = 1-k, folded into the per-64K-block target byte
align 32
.outer:
    mov    eax, ecx         ; using an extra register (EBP) to schedule instructions better(?) for Core2 decode
    shr    eax, 16
;    xor eax, eax        ; break false dependency, or just let OoO exec hide it after breaking once per iter
    bsr    ebx, ecx         ; ebx = 31-lzcnt for non-zero ecx
         ;; want: k == 32-__builtin_clz(x)-_mm_popcnt_u32(x)
         ; 31-clz+(1-k)  == popcount.  or  31-clz == popcnt - (1-k)
         ; 31-clz+(1-k) - popcount(hi(x)) == popcount(lo(x))
    add    ebx, ebp
    sub     bl, byte [wordbits + eax]   ; bl = the low-half popcount we need to match

    ;movzx  edx, cx
    lea    edx, [ecx - 4]   ; TODO: handle cx < 4 making this wrap
    movzx  edx, dx
    and    ecx, -65536      ; clear low 16 bits, which we're processing with the inner loop.
align 16
  .low16:
    ; unrolled x4: count low halves whose LUT popcount equals bl
    cmp   bl, [wordbits + edx + 0]
    je    .yesk0
  .done_inc0:
    cmp   bl, [wordbits + edx + 1]
    je    .yesk1
  .done_inc1:
    cmp   bl, [wordbits + edx + 2]
    je    .yesk2
  .done_inc2:
    cmp   bl, [wordbits + edx + 3]
    je    .yesk3
  .done_inc3:

; TODO: vectorize with pcmpeqb / psubb / psadbw!!
; perhaps over fewer low bits to only use 16kiB of L1d cache
    
    sub  edx, 4
    jae  .low16        ; }while(lowhalf-=4 doesn't wrap)

   sub   ecx, 65536
   ja   .outer
; TODO: handle ECX < 65536 initially or after handling leading bits.  Probably with BSR in the inner loop


.print_and_exit:
    ;print ans (in ESI) via C printf
;    PRINT_DEC 4,ans
    push  esi
    push  print_fmt
extern printf
    call  printf
    add   esp, 8

    pop  ebx
    pop  esi
    pop  edi
    pop  ebp
    xor  eax, eax
    ret

; out-of-line increment stubs, one per unrolled compare (generated by %rep)
align 16
%assign i 0
%rep 4
;align 4
.yesk%+i:
   inc  esi
   jmp  .done_inc%+i
%assign i  i+1
%endrep
  ; could use a similar %rep block for the inner loop

     ; attempt tail duplication?
     ; TODO: skip the next cmp/jcc when jumping back.
     ; Two in a row will never both be equal

;   dec  ecx
;   jnz  .loop
;   jmp  .print_and_exit
(update after outer-loop over-count on first iter bugfix, ans: 12497876)

ans: 12498239        # This is too low by a bit vs. 12509316
                     # looks reasonable given skipping cleanup

            209.46 msec task-clock                #    0.992 CPUs utilized          
                 0      context-switches          #    0.000 K/sec                  
                 0      cpu-migrations            #    0.000 K/sec                  
                62      page-faults               #    0.296 K/sec                  
       813,311,333      cycles                    #    3.883 GHz                    
     1,263,086,089      branches                  # 6030.123 M/sec                  
           824,103      branch-misses             #    0.07% of all branches        
     2,527,743,287      instructions              #    3.11  insn per cycle         
     1,300,567,770      uops_issued.any           # 6209.065 M/sec                  
     2,299,321,355      uops_executed.thread      # 10977.234 M/sec                 

(from another run)
        37,150,918      idq.dsb_uops              #  174.330 M/sec                  
     1,266,487,977      idq.mite_uops             # 5942.976 M/sec

       0.211235157 seconds time elapsed

       0.209838000 seconds user
       0.000000000 seconds sys
;;;;;  Just the loop from main_SSE2, same init stuff and print as main_1lookup
; SIMD version: instead of scalar cmp/jcc per LUT byte, compare 32 LUT bytes
; per iteration against the broadcast target and accumulate match counts.
align 32
.outer:
    mov    eax, ecx         ; using an extra register (EBP) to schedule instructions better(?) for Core2 decode
    shr    eax, 16-2        ; keep 2 extra bits: inner loop covers only the low 14 bits

;    xor eax, eax        ; break false dependency, or just let OoO exec hide it after breaking once per iter
    bsr    ebx, ecx         ; ebx = 31-lzcnt for non-zero ecx
         ;; want: k == 32-__builtin_clz(x)-_mm_popcnt_u32(x)
         ; 31-clz+(1-k)  == popcount.  or  31-clz == popcnt - (1-k)
         ; 31-clz+(1-k) - popcount(hi(x)) == popcount(lo(x))
    add    ebx, ebp
    movzx  edx, al
;    movzx  edx, byte [wordbits + edx]
    sub    bl, byte [wordbits + edx]
    shr    eax, 8            ; high part is more than 16 bits if low is 14, needs to be broken up
    sub    bl, byte [wordbits + eax]
;    movzx  eax, byte [wordbits + eax]
;    add    eax, edx
;    sub    ebx, eax

    movzx  eax,  bl
    movd   xmm7, eax
    pxor   xmm0, xmm0
    pxor   xmm1, xmm1    ; 2 accumulators
    pshufb xmm7, xmm0    ; broadcast byte to search for.
      ;;  Actually SSSE3, but it only takes a few more insns to broadcast a byte with just SSE2.  
      ;; e.g. imul eax, 0x01010101 / movd / pshufd

    ;movzx  edx, cx
;    lea    edx, [ecx - 4]   ; TODO: handle cx < 4 making this wrap
;    movzx  edx, dx
    and    ecx, -16384      ; clear low bits, which we're processing with the inner loop.

    mov    edx, wordbits     ; quick and dirty, just loop forward over the array
     ;; FIXME: handle non-zero CX on first outer loop iteration, maybe loop backwards so we can go downwards toward 0,
     ;; or calculate an end-pointer if we can use that without register-read stalls on Core 2.
     ;; Also need to handle the leftover part not being a multiple of 32 in size
     ;; So maybe just make a more-flexible copy of this loop and peel the first outer iteration (containing that inner loop)
     ;;  if the cleanup for that slows down the common case of doing exactly 16K 
align 16
  .low14:
    movdqa  xmm2, [edx]
    movdqa  xmm3, [edx + 16]
 ds   pcmpeqb xmm2, xmm7           ; extra prefixes for padding for Skylake JCC erratum: 18ms vs. 25ms
 ds   psubb   xmm0, xmm2           ; psubb of 0/-1 compare masks == per-byte match counter
    ds add     edx, 32
 cs   pcmpeqb xmm3, xmm7
 cs   psubb   xmm1, xmm3

  ; hits are rare enough to not wrap counters?
  ; TODO: may need an inner loop to accumulate after 256 steps if every other 32nd element is a match overflowing some SIMD element
    cmp    edx, wordbits + 16384
    jb   .low14

   pxor   xmm7, xmm7
   psadbw xmm0, xmm7
   psadbw xmm1, xmm7       ; byte -> qword horizontal sum
   paddd  xmm0, xmm1       ; reduce to 1 vector
   movhlps xmm1, xmm0
   paddd  xmm0, xmm1       ; hsum the low/high counts
   movd   eax, xmm0
   add    esi, eax         ; sum in scalar (could sink this out)

   sub   ecx, 16384
   ja   .outer
; TODO: handle ECX < 65536 initially or after handling leading bits.  Probably with BSR in the inner loop
n! / (k! * (n - k)!)
#include <stdio.h>
#include <chrono>

template<typename T>
struct Coefficients {
  static constexpr unsigned size_v = sizeof(T) * 8;

  // Zero-initialized table of binomial coefficients,
  // indexed as value[number_of_zeros][number_of_bits] = C(bits, zeros).
  // Entries with zeros > bits stay 0.
  T value[size_v][size_v] = {};

  constexpr Coefficients() {
    // C(n, k) = n! / (k! * (n - k)!), built per row with the
    // multiplicative recurrence C(n, k) = C(n, k-1) * (n - k + 1) / k
    // (the division is always exact at each step).

    value[0][0] = 1;
    value[0][1] = 1;
    value[1][1] = 1;

    for(unsigned n = 2; n < size_v; ++n) {
      value[0][n] = 1;          // C(n, 0)
      value[1][n] = n;          // C(n, 1)

      T running = n;            // running = C(n, k-1), starting at C(n, 1)

      for(unsigned k = 2; k < n; ++k) {
        running = (running * (n - k + 1)) / k;
        value[k][n] = running;
      }

      value[n][n] = 1;          // C(n, n)
    }
  }
};


// Closed-form (O(bit width)) count of integers x in [1, max_value] with
// exactly `zero_bits` zero bits below the leading 1, using precomputed
// binomial coefficients instead of testing every number.
// NOTE(review): __builtin_clz takes unsigned int; for a 64-bit T this
// would need __builtin_clzll — TODO confirm intended instantiations.
template<typename T>
__attribute__((noinline)) // To make it easier to benchmark
T count_combinations(T max_value, T zero_bits) {
  if( max_value == 0 )
    return 0;

  constexpr int size = sizeof(T) * 8;
  constexpr Coefficients<T> coefs;  // compile-time table: coefs.value[k][n] = C(n, k)
  // assert(zeros_bits < size)

  // bits = position of the leading 1 of max_value, plus one.
  int bits = size - __builtin_clz(max_value);

  T total = 0;

  // Numbers with a shorter bit-length than max_value: for each leading-bit
  // position i there are C(i, zero_bits) placements of the zeros.
#pragma clang loop vectorize(disable)
  for(int i = 0; i < bits - 1; ++i) {
    total += coefs.value[zero_bits][i];
  }

  // Count interval [2**bits, max_value]
  bits -= 1;
  T mask = T(1) << bits;
  max_value &= ~mask;      // Remove leading bit
  mask = mask >> 1;

  // Walk the remaining bits of max_value from high to low, spending the
  // zero budget as forced zeros are encountered.
#pragma clang loop vectorize(disable)
  while( zero_bits && zero_bits < bits ) {
    if( max_value & mask ) {
      // If current bit is one, then we can pretend that it is zero
      // (which would only make the value smaller, which means that
      // it would still be < max_value) and grab all combinations of
      // zeros within the remaining bits.
      total += coefs.value[zero_bits - 1][bits - 1];

      // And then stop pretending it's zero and continue as normal.
    } else {
      // If current bit is zero, we can't do anything about it, just
      // have to spend a zero from our budget.

      zero_bits--;
    }

    max_value &= ~mask;
    mask = mask >> 1;
    bits--;
  }

  // At this point we don't have any more zero bits, or we don't
  // have any more bits at all.

  // Include max_value itself if it exactly matches the remaining pattern
  // (all remaining bits zero, or no zeros left and all remaining bits one).
  if( (zero_bits == bits) ||
      (zero_bits == 0 && max_value == ((mask << 1) - 1)) ) {
    total++;
  }

  return total;
}

// Benchmark driver: run the closed-form counter 1000 times and report the
// average per-call time in nanoseconds. OR-ing the results into `acc`
// keeps the compiler from discarding the calls.
int main() {
  using namespace std::chrono;

  unsigned acc = 0;
  const auto start = high_resolution_clock::now();

  for(int iter = 0; iter < 1000; ++iter) {
    acc |= count_combinations<unsigned>(1'000'000'000, 8);
  }
  const auto stop = high_resolution_clock::now();

  // Total nanoseconds / 1000 iterations = average ns per call.
  const auto total_ns = duration_cast<nanoseconds>(stop - start).count();

  printf("result = %u, time = %lld ns\n", acc, total_ns / 1000);

  return 0;
}
result = 12509316, time = 35 ns