Optimization:计算 [1..N] 中在前导 1 以下恰有 K 个零位的整数个数(在没有硬件 POPCNT 的情况下对连续范围做 popcount)
我有以下任务:计算 1 和 N 之间有多少个数字恰好有 K 个非前导零位。(例如,7₁₀ = 111₂ 有 0 个,4₁₀ = 100₂ 有 2 个。)N 和 K 满足条件 0 ≤ K, N ≤ 100000000。此版本使用 POPCNT,在我的机器上速度足够快:
%include "io.inc"
section .bss
n resd 1                        ; upper bound N (read at runtime)
k resd 1                        ; required count of zero bits below the leading 1
ans resd 1                      ; answer accumulator (.bss is zero-initialized)
section .text
global CMAIN
;-----------------------------------------------------------------------
; Count how many x in [1..N] have exactly K zero bits below the leading
; one, i.e. 32 - clz(x) - popcount(x) == K.  Brute force using HW POPCNT.
; Registers: ecx = x (loop counter), eax/edx = scratch.
;-----------------------------------------------------------------------
CMAIN:
GET_DEC 4,n
GET_DEC 4,k
mov ecx,1                       ; x = 1
cmp ecx,dword[n]
ja .print                       ; handle N == 0: loop body must not run
.loop:
popcnt eax,ecx                  ; eax = popcount(x)  (was: mov eax,ecx / popcnt eax,eax)
mov edx,32
sub edx,eax                     ; edx = 32 - popcount = total zero bits
bsr eax,ecx                     ; eax = index of leading 1 = 31 - clz (x != 0 here)
xor eax,0x1f                    ; eax = clz(x)
sub edx,eax                     ; edx = 32 - popcount - clz = zeros below leading 1
cmp edx,[k]                     ; exactly K non-leading zero bits?
jnz .notk
inc dword [ans]                 ; yes: count it (was a 3-insn load/add/store)
.notk:
inc ecx
cmp ecx,dword[n]
jna .loop                       ; while (x <= N), unsigned compare
.print:
PRINT_DEC 4,ans
xor eax, eax
ret
就速度而言(约 0.8 秒)这应该是可以接受的,但它并没有被接受——我猜是因为评测服务器上使用的 CPU 太旧不支持 POPCNT,所以报告发生了运行时错误(非法指令)。
我尝试对64K*4字节的查找表使用预计算技巧,但速度不够快:
%include "io.inc"
section .bss
n resd 1
k resd 1
ans resd 1
wordbits resd 65536             ; wordbits[i] = popcount(i) for 0 <= i < 65536
section .text
global CMAIN
;-----------------------------------------------------------------------
; Same count as the POPCNT version, but popcount comes from a 64K-entry
; lookup table so it also runs on CPUs without hardware POPCNT.
; Registers in the main loop: ecx = x, edi = N, esi = K, ebp = answer.
;-----------------------------------------------------------------------
CMAIN:
mov ebp, esp                    ; for correct debugging (EBP is reused as a counter below)
xor ecx,ecx                     ; ecx = LUT index, 0..65535
.precount:
mov eax,ecx
xor ebx,ebx
; Kernighan bit count:  for (c = 0; v; c++) v &= v - 1;
.lloop:
mov edx,eax
dec edx
and eax,edx                     ; clear lowest set bit
inc ebx
test eax,eax
jnz .lloop
mov dword[wordbits+4*ecx],ebx
inc ecx
cmp ecx,65536
jb .precount                    ; BUGFIX: was 'jna', which stored to wordbits[65536] out of bounds
; the loop above yields 1 for input 0 (increments before testing); fix up
mov dword[wordbits],0
GET_DEC 4,edi                   ; edi = N
GET_DEC 4,esi                   ; esi = K
mov ecx,1                       ; x = 1
xor ebp,ebp                     ; ebp = answer
cmp ecx,edi
ja .print                       ; handle N == 0
.loop:
mov eax, ecx
mov edx, ecx
and eax,0xFFFF
shr edx,16
mov eax,dword[wordbits+4*eax]
add eax,dword[wordbits+4*edx]   ; eax = wordbits[x & 0xFFFF] + wordbits[x >> 16] = popcount(x)
mov edx, 32
sub edx, eax                    ; edx = 32 - popcount
bsr eax, ecx                    ; eax = 31 - clz (x != 0 here)
xor eax, 0x1f                   ; eax = clz(x)
sub edx, eax                    ; edx = zeros below the leading 1
cmp edx, esi                    ; == K ?
jnz .notk
inc ebp                         ; count it
.notk:
inc ecx
cmp ecx, edi
jna .loop
.print:
PRINT_DEC 4, ebp
xor eax, eax
ret
(>1秒)
我应该加快第二个程序的速度吗(如果是,那么怎么做?)?还是以某种方式用其他(哪些?)指令替换 POPCNT(我想 SSE2 和更早的指令应该是可用的)?
首先,一个太旧而没有 POPCNT 的服务器在其他方面也会明显变慢,并且有不同的瓶颈。考虑到它有 pshufb 但没有 popcnt,它应该是第一代或第二代 Core 2(Conroe 或 Penryn)。请参阅 Agner Fog 的微架构指南(microarch PDF)。它的时钟频率也更低,因此你在该 CPU 上能做到的最好结果可能也只是勉强让蛮力法通过。
可能有一些算法改进可以节省大量时间,比如注意每4个增量通过00、01、10、11模式循环低2位:每4个增量2个零发生一次,1个零发生两次,没有零发生一次。对于每一个>=4的数字,这2位低于前导位,因此是计数的一部分。将其推广到1和log2(N)之间的每个MSB位置的组合数学公式中,可能会大大减少工作量。处理2^M和N之间的数字不太明显
此处的版本:
- 已清理popcnt版本,i7-6700k@3.9GHz上536ms,迭代期间无算法优化。对于k=8,N=100000000
- 朴素的LUT版本(每次迭代加载2次,无迭代间优化):良好运行时约595毫秒,对于k=8,N=100000000,更常见的情况是约610毫秒。Core2Duo(Conroe)@2.4GHz:1.69秒。(在编辑历史记录中有几个更糟糕的版本,第一个版本在核心2上有部分寄存器暂停。)
- (未完成,未编写清理代码)优化LUT版本(展开,高半/MSB BSR工作提升,每次迭代只留下1次查找(cmp/jcc),Skylake上210毫秒,Core 2上0.58秒@2.4GHz。时间应该是现实的;我们完成了所有的工作,只是错过了MSB处于低位16的最后2^16次迭代。在外循环中处理任何必要的拐角情况以及清理,对速度的影响不应超过1%
- (更未完成):使用 pcmpeqb / psubb / psadbw 对优化的 LUT 版本进行矢量化(如下所示)——内循环变为:在固定大小的数组中统计与外循环算出的目标值相匹配的字节元素个数(逻辑与标量版本相同)。Skylake 约 18 毫秒,Core 2 约 0.036 秒。这些时间现在可能包含相当大的启动开销,但正如预期/希望的那样,两者都快了约 16 倍。
- 对 wordbits 表做一次直方图统计(可以在生成表时顺便完成)。与其在 64KiB 中搜索匹配的字节,不如直接查出每次外循环迭代的答案!这会让大 N 时的速度再快几千倍(不过你仍需处理 1..64K 的低端部分,以及 N 不是 64K 倍数时的残余范围)。
计时可以用 rdtsc,但能在整个程序上方便地使用 perf stat 也很好,所以我会继续这样做(去掉 printf 并生成静态可执行文件,可进一步减少启动开销)。
你似乎在问关于微观优化蛮力方法的问题,该方法仍然单独检查每个数字。(不过,对于如何实现
32-clz-popcnt==k
有一些重要的优化。)
还有其他通常更快的popcnt方法,例如像中的Bithack。但是,当您在一个紧密循环中有很多popcounting要做时(足以使查找表在缓存中保持热状态),LUT可能是好的
如果你有快速的 SSSE3 pshufb,值得用它在 XMM 寄存器中对四个 DWORD 并行做 SIMD popcount(自动矢量化的循环),或在带 AVX2 的 YMM 寄存器中做得更好。(第一代 Core 2 有 pshufb,但直到第二代 Core 2 它才是单 uop 指令。可能仍然值得一试。)
或者更好的方法是,使用SIMD来计算与我们所寻找的匹配的LUT元素,即给定的高半个数字
强力检查连续的数字范围为LUT策略打开了一个主要的优化:数字的上n位每2^n增量只更改一次。因此,您可以将这些位的计数从最内部的循环中提升出来。这也使得使用较小的表(适合L1d缓存)变得值得 说到这里,您的64k*4表是256KiB,即二级缓存的大小。这意味着它可能必须在每次循环通过它时从L3进入。您的桌面CPU应该有足够的L3带宽(由于增量,访问模式是连续的),现代服务器有更大的L2,但几乎没有理由不使用字节LUT(popcnt(-1)只有32),而
movzx
字节加载与mov
dword加载一样便宜
; General LUT lookup with two 16-bit halves.
; Uses a byte LUT (64KiB instead of 256KiB for a dword LUT);
; movzx byte loads cost the same as mov dword loads.
movzx edx, cx ; low 16 bits
mov eax, ecx
shr eax, 16 ; high 16 bits
movzx edx, byte [wordbits + edx] ; edx = popcount(low half), zero-extended
add dl, [wordbits + eax] ; dl += popcount(high half); byte-sized add
; no partial-reg stall for reading EDX after this, on Intel Sandybridge and later
; on Core 2, set up so you can cmp al,dl later to avoid it
(上述写法在英特尔 CPU 上避免了部分寄存器停顿。)完整版本如下:
;-----------------------------------------------------------------------
; Cleaned-up LUT version (no POPCNT): one byte-LUT load per 16-bit half.
; The condition is rearranged to 31-clz + (1-k) == popcount, so K folds
; into the loop-invariant constant ESI.  Counts ECX down from N to 1;
; the answer accumulates in EBX.  Prints via cdecl printf.
;-----------------------------------------------------------------------
%use SMARTALIGN
alignmode p6, 64
section .bss
wordbits: resb 65536 ; byte LUT: wordbits[i] = popcount(i) for i < 65536
; n resd 1
; k resd 1
ans resd 1
section .rodata
n: dd 1000000000
k: dd 8
print_fmt: db `ans: %d\n`, 0
section .text
global main
main: ; no popcnt version
push ebp
push edi ; save some call-preserved registers
push esi
push ebx
mov edi, wordbits
%define wordbits edi ; dirty hack, use indexed addressing modes instead of reg+disp32.
; Avoids Skylake JCC erratum problems, and is slightly better on Core2 with good instruction scheduling
;fill in wordbits, ecx is wordbits array index
mov ecx, 1 ; leave wordbits[0] = 0
.init_loop:
mov eax,ecx
xor ebx,ebx
.popc_loop:
lea edx, [eax-1]
inc ebx
and eax,edx ; v &= v - 1; // blsr
jnz .popc_loop
;computed bits set
mov [wordbits + ecx], bl
inc ecx
cmp ecx,65536
jb .init_loop ; bugfix: array out of bounds with jna: stores to wordbits[65536]
; GET_DEC 4,n
; GET_DEC 4,k
mov ecx, [n] ; ecx counts from n down to 1
; mov esi, [k]
xor ebx, ebx ; ebx = ans
mov esi, 1
sub esi, [k] ; esi = 1-k, loop-invariant
align 32
.loop:
;popcnt eax, ecx
movzx eax, cx ; low 16 bits of x
mov ebp, ecx ; using an extra register (EBP) to schedule instructions better(?) for Core2 decode
movzx edx, byte [wordbits + eax]
shr ebp, 16 ; high 16 bits of x
; xor eax, eax ; break false dependency, or just let OoO exec hide it after breaking once per iter
bsr eax, ecx ; eax = 31-lzcnt for non-zero ecx
; sub edx, esi ; sub now avoids partial-reg stuff. Could have just used EBX to allow BL.
add eax, esi ; Add to BSR result seems slightly better on Core2 than sub from popcnt
add dl, [wordbits + ebp] ; we don't read EDX, no partial-register stall even on P6-family
;; want: k == 32-__builtin_clz(x)-_mm_popcnt_u32(x)
cmp al, dl ; 31-clz+(1-k) == popcount. or 31-clz == popcnt - (1-k)
je .yesk ; not-taken is the more common fast path
.done_inc:
dec ecx
jnz .loop ; }while(--n >= 0U)
.print_and_exit:
;print ans
; PRINT_DEC 4,ans
push ebx ; cdecl: printf(print_fmt, ans), args pushed right-to-left
push print_fmt
extern printf
call printf
add esp, 8
pop ebx
pop esi
pop edi
pop ebp
xor eax, eax
ret
align 8
.yesk:
inc ebx ; ans++
; jmp .done_inc ; tail duplication is a *tiny* bit faster
dec ecx
jnz .loop
jmp .print_and_exit
# Results from version 1, not the Core2-friendly version.
# Version 3 sometimes runs this fast, but more often ~610ms
# Event counts are near identical for both, except cycles, but uops_issue and executed are mysteriously lower, like 9,090,858,203 executed.
$ nasm -felf32 foo.asm -l/dev/stdout &&
gcc -m32 -no-pie -fno-pie -fno-plt foo.o
$ taskset -c 3 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,branches,branch-misses,instructions,uops_issued.any,uops_executed.thread -r 2 ./a.out
ans: 12509316
ans: 12509316
Performance counter stats for './a.out' (2 runs):
597.78 msec task-clock # 0.999 CPUs utilized ( +- 0.12% )
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
62 page-faults # 0.103 K/sec ( +- 0.81% )
2,328,038,096 cycles # 3.894 GHz ( +- 0.12% )
2,000,637,322 branches # 3346.789 M/sec ( +- 0.00% )
1,719,724 branch-misses # 0.09% of all branches ( +- 0.02% )
11,015,217,584 instructions # 4.73 insn per cycle ( +- 0.00% )
9,148,164,159 uops_issued.any # 15303.609 M/sec ( +- 0.00% )
9,102,818,982 uops_executed.thread # 15227.753 M/sec ( +- 0.00% )
(from a separate run):
9,204,430,548 idq.dsb_uops # 15513.249 M/sec ( +- 0.00% )
1,008,922 idq.mite_uops # 1.700 M/sec ( +- 20.51% )
0.598156 +- 0.000760 seconds time elapsed ( +- 0.13% )
; POPCNT-version inner loop (init/print framing shown in the LUT listing).
; Condition rearranged as (31-clz) - (popcnt-1) == k, which equals
; 32 - clz - popcnt == k.  ECX = x counting down, EBX = answer, ESI = k.
align 32
.loop:
mov eax, ecx
popcnt eax,eax
lea edx, [dword eax - 32 + 31] ; edx = popcnt - 1.  (popcnt - 32 = -(bits not set))
; dword displacement pads the cmp/jnz location to avoid the JCC erratum penalty on Intel
; xor eax, eax ; break false dependency, or just let OoO exec hide it after breaking once per iter
bsr eax, ecx ; eax = 31-lzcnt
; xor eax, 0x1f ; eax = lzcnt (for non-zero x)
; want: 32-__builtin_clz(x)-_mm_popcnt_u32(x) = (31-clz) + 1-popcnt = (31-clz) - (popcnt-1)
sub eax, edx
cmp eax, esi ;is there k non-leading bits in ecx?
%if 0
jnz .notk
inc ebx ;if so, then increment ans
.notk:
%else
jz .yesk ; not-taken is the more common fast path
.done_inc:
%endif
dec ecx
jnz .loop ; }while(--n >= 0U)
;print ans
; PRINT_DEC 4,ans
push ebx
push print_fmt
extern printf
call printf
add esp, 8
pop ebx
pop esi
xor eax, eax
ret
.yesk:
inc ebx
jmp .done_inc ;; TODO: tail duplication
;-----------------------------------------------------------------------
; Optimized LUT version: the BSR / high-half popcount work is hoisted
; out of the inner loop.  The inner loop only compares a precomputed
; byte (BL = required popcount of the low half) against 4 LUT entries
; per iteration, walking the low 16 bits downward.
; NOTE(review): intentionally incomplete per the TODOs below — the low
; 2^16 iterations (MSB in the low 16 bits) and small-N corner cases are
; not handled, so the printed answer is slightly low.
;-----------------------------------------------------------------------
%use SMARTALIGN
alignmode p6, 64
section .bss
align 4096
wordbits: resb 65536
; n resd 1
; k resd 1
; ans resd 1
section .rodata
;n: dd 0x40000000 ; low half zero, maybe useful to test correctness for a version that doesn't handle that.
n: dd 1000000000 ; = 0x3b9aca00
k: dd 8
print_fmt: db `ans: %d\n`, 0
section .text
global main
align 16
main:
main_1lookup:
push ebp
push edi ; save some call-preserved registers
push esi
push ebx
mov edi, wordbits
;%define wordbits edi ; dirty hack, use indexed addressing modes instead of reg+disp32.
; actually slightly worse on Skylake: causes un-lamination of cmp bl, [reg+reg],
; although the front-end isn't much of a bottleneck anymore
; also seems pretty much neutral to use disp32+reg on Core 2, maybe reg-read stalls or just not a front-end bottleneck
;fill in wordbits, ecx is wordbits array index
mov ecx, 1 ; leave wordbits[0] = 0
.init_loop:
mov eax,ecx
xor ebx,ebx
.popc_loop:
lea edx, [eax-1]
inc ebx
and eax,edx ; v &= v - 1; // blsr
jnz .popc_loop
;computed bits set
mov [wordbits + ecx], bl
inc ecx
cmp ecx,65536
jb .init_loop
; GET_DEC 4,n
; GET_DEC 4,k
mov ecx, [n] ; ecx counts from n down to 1
; mov esi, [k]
xor esi, esi ; ans
mov ebp, 1
sub ebp, [k] ; 1-k
align 32
.outer:
mov eax, ecx ; using an extra register (EBP) to schedule instructions better(?) for Core2 decode
shr eax, 16 ; eax = high half, constant for the whole inner loop
; xor eax, eax ; break false dependency, or just let OoO exec hide it after breaking once per iter
bsr ebx, ecx ; ebx = 31-lzcnt for non-zero ecx
;; want: k == 32-__builtin_clz(x)-_mm_popcnt_u32(x)
; 31-clz+(1-k) == popcount. or 31-clz == popcnt - (1-k)
; 31-clz+(1-k) - popcount(hi(x)) == popcount(lo(x))
add ebx, ebp
sub bl, byte [wordbits + eax] ; bl = required popcount of the low half
;movzx edx, cx
lea edx, [ecx - 4] ; TODO: handle cx < 4 making this wrap
movzx edx, dx
and ecx, -65536 ; clear low 16 bits, which we're processing with the inner loop.
align 16
.low16:
cmp bl, [wordbits + edx + 0]
je .yesk0
.done_inc0:
cmp bl, [wordbits + edx + 1]
je .yesk1
.done_inc1:
cmp bl, [wordbits + edx + 2]
je .yesk2
.done_inc2:
cmp bl, [wordbits + edx + 3]
je .yesk3
.done_inc3:
; TODO: vectorize with pcmpeqb / psubb / psadbw!!
; perhaps over fewer low bits to only use 16kiB of L1d cache
sub edx, 4
jae .low16 ; }while(lowhalf-=4 doesn't wrap)
sub ecx, 65536
ja .outer
; TODO: handle ECX < 65536 initially or after handling leading bits. Probably with BSR in the inner loop
.print_and_exit:
;print ans
; PRINT_DEC 4,ans
push esi
push print_fmt
extern printf
call printf
add esp, 8
pop ebx
pop esi
pop edi
pop ebp
xor eax, eax
ret
align 16
; Out-of-line increment stubs, one per unrolled compare, generated by %rep.
%assign i 0
%rep 4
;align 4
.yesk%+i:
inc esi
jmp .done_inc%+i
%assign i i+1
%endrep
; could use a similar %rep block for the inner loop
; attempt tail duplication?
; TODO: skip the next cmp/jcc when jumping back.
; Two in a row will never both be equal
; dec ecx
; jnz .loop
; jmp .print_and_exit
(update after outer-loop over-count on first iter bugfix, ans: 12497876)
ans: 12498239 # This is too low by a bit vs. 12509316
# looks reasonable given skipping cleanup
209.46 msec task-clock # 0.992 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
62 page-faults # 0.296 K/sec
813,311,333 cycles # 3.883 GHz
1,263,086,089 branches # 6030.123 M/sec
824,103 branch-misses # 0.07% of all branches
2,527,743,287 instructions # 3.11 insn per cycle
1,300,567,770 uops_issued.any # 6209.065 M/sec
2,299,321,355 uops_executed.thread # 10977.234 M/sec
(from another run)
37,150,918 idq.dsb_uops # 174.330 M/sec
1,266,487,977 idq.mite_uops # 5942.976 M/sec
0.211235157 seconds time elapsed
0.209838000 seconds user
0.000000000 seconds sys
;;;;; Just the loop from main_SSE2, same init stuff and print as main_1lookup
; SIMD inner loop: instead of scalar compares, broadcast the required
; low-half popcount byte into XMM7 and count matching bytes in a 16KiB
; slice of the LUT with pcmpeqb (match = 0xFF = -1) accumulated by psubb,
; then horizontally summed with psadbw per outer iteration.
align 32
.outer:
mov eax, ecx ; using an extra register (EBP) to schedule instructions better(?) for Core2 decode
shr eax, 16-2 ; high 18 bits (inner loop now covers only the low 14 bits)
; xor eax, eax ; break false dependency, or just let OoO exec hide it after breaking once per iter
bsr ebx, ecx ; ebx = 31-lzcnt for non-zero ecx
;; want: k == 32-__builtin_clz(x)-_mm_popcnt_u32(x)
; 31-clz+(1-k) == popcount. or 31-clz == popcnt - (1-k)
; 31-clz+(1-k) - popcount(hi(x)) == popcount(lo(x))
add ebx, ebp
movzx edx, al
; movzx edx, byte [wordbits + edx]
sub bl, byte [wordbits + edx]
shr eax, 8 ; high part is more than 16 bits if low is 14, needs to be broken up
sub bl, byte [wordbits + eax]
; movzx eax, byte [wordbits + eax]
; add eax, edx
; sub ebx, eax
movzx eax, bl ; required popcount of the low 14 bits
movd xmm7, eax
pxor xmm0, xmm0
pxor xmm1, xmm1 ; 2 accumulators
pshufb xmm7, xmm0 ; broadcast byte to search for.
;; Actually SSSE3, but it only takes a few more insns to broadcast a byte with just SSE2.
;; e.g. imul eax, 0x01010101 / movd / pshufd
;movzx edx, cx
; lea edx, [ecx - 4] ; TODO: handle cx < 4 making this wrap
; movzx edx, dx
and ecx, -16384 ; clear low bits, which we're processing with the inner loop.
mov edx, wordbits ; quick and dirty, just loop forward over the array
;; FIXME: handle non-zero CX on first outer loop iteration, maybe loop backwards so we can go downwards toward 0,
;; or calculate an end-pointer if we can use that without register-read stalls on Core 2.
;; Also need to handle the leftover part not being a multiple of 32 in size
;; So maybe just make a more-flexible copy of this loop and peel the first outer iteration (containing that inner loop)
;; if the cleanup for that slows down the common case of doing exactly 16K
align 16
.low14:
movdqa xmm2, [edx]
movdqa xmm3, [edx + 16]
ds pcmpeqb xmm2, xmm7 ; extra prefixes for padding for Skylake JCC erratum: 18ms vs. 25ms
ds psubb xmm0, xmm2 ; accumulator -= (-1) per matching byte
ds add edx, 32
cs pcmpeqb xmm3, xmm7
cs psubb xmm1, xmm3
; hits are rare enough to not wrap counters?
; TODO: may need an inner loop to accumulate after 256 steps if every other 32nd element is a match overflowing some SIMD element
cmp edx, wordbits + 16384
jb .low14
pxor xmm7, xmm7
psadbw xmm0, xmm7
psadbw xmm1, xmm7 ; byte -> qword horizontal sum
paddd xmm0, xmm1 ; reduce to 1 vector
movhlps xmm1, xmm0
paddd xmm0, xmm1 ; hsum the low/high counts
movd eax, xmm0
add esi, eax ; sum in scalar (could sink this out)
sub ecx, 16384
ja .outer
; TODO: handle ECX < 65536 initially or after handling leading bits. Probably with BSR in the inner loop
n! / (k! * (n - k)!)
#include <stdio.h>
#include <chrono>
template<typename T>
struct Coefficients {
static constexpr unsigned size_v = sizeof(T) * 8;
// Zero-initialize.
// Indexed by [number_of_zeros][number_of_bits]: value[k][n] = C(n, k).
T value[size_v][size_v] = {};
// Build the table of binomial coefficients C(n, k) for n, k < size_v.
//
// Uses Pascal's rule  C(n, k) = C(n-1, k-1) + C(n-1, k)  instead of the
// multiplicative formula r = r * (n - k + 1) / k: that form's intermediate
// product overflows T for upper-middle entries even when C(n, k) itself
// fits (e.g. C(31,15) * 16 > 2^32 for T = unsigned, corrupting
// value[16][31] and beyond).  Addition only overflows when the final
// coefficient itself doesn't fit in T.
constexpr Coefficients() {
value[0][0] = 1;
for(unsigned i = 1; i < size_v; ++i) {
value[0][i] = 1; // C(i, 0) = 1
for(unsigned j = 1; j < i; ++j)
value[j][i] = value[j - 1][i - 1] + value[j][i - 1];
value[i][i] = 1; // C(i, i) = 1
}
}
};
template<typename T>
__attribute__((noinline)) // To make it easier to benchmark
// Closed-form count of x in [1..max_value] whose number of zero bits
// below the leading 1 equals zero_bits, using the precomputed binomial
// table (value[k][n] = C(n, k)).
// NOTE(review): __builtin_clz takes unsigned int; for T wider than 32
// bits this would need __builtin_clzll — confirm before reusing.
T count_combinations(T max_value, T zero_bits) {
if( max_value == 0 )
return 0;
constexpr int size = sizeof(T) * 8;
constexpr Coefficients<T> coefs;
// assert(zeros_bits < size)
int bits = size - __builtin_clz(max_value); // position of leading 1, plus one
T total = 0;
// Count all values shorter than max_value: a number whose leading 1 is
// at bit i has i free lower bits, so C(i, zero_bits) of them qualify.
#pragma clang loop vectorize(disable)
for(int i = 0; i < bits - 1; ++i) {
total += coefs.value[zero_bits][i];
}
// Count interval [2**bits, max_value]
bits -= 1;
T mask = T(1) << bits;
max_value &= ~mask; // Remove leading bit
mask = mask >> 1;
// Walk the remaining bits of max_value from high to low.
#pragma clang loop vectorize(disable)
while( zero_bits && zero_bits < bits ) {
if( max_value & mask ) {
// If current bit is one, then we can pretend that it is zero
// (which would only make the value smaller, which means that
// it would still be < max_value) and grab all combinations of
// zeros within the remaining bits.
total += coefs.value[zero_bits - 1][bits - 1];
// And then stop pretending it's zero and continue as normal.
} else {
// If current bit is zero, we can't do anything about it, just
// have to spend a zero from our budget.
zero_bits--;
}
max_value &= ~mask;
mask = mask >> 1;
bits--;
}
// At this point we don't have any more zero bits, or we don't
// have any more bits at all.  Include max_value itself if it matches:
// either its remaining suffix is all zeros of the exact budget, or the
// budget is spent and the suffix is all ones.
if( (zero_bits == bits) ||
(zero_bits == 0 && max_value == ((mask << 1) - 1)) ) {
total++;
}
return total;
}
int main() {
using namespace std::chrono;
// OR-ing the results together stops the compiler from discarding the
// calls while keeping the final value equal to a single call's result.
unsigned accumulated = 0;
const auto started = high_resolution_clock::now();
// 1000 repetitions so the per-call cost is measurable.
for(int iteration = 0; iteration < 1000; ++iteration)
accumulated |= count_combinations<unsigned>(1'000'000'000, 8);
const auto finished = high_resolution_clock::now();
const auto total_ns = duration_cast<nanoseconds>(finished - started).count();
printf("result = %u, time = %lld ns\n", accumulated, total_ns / 1000);
return 0;
}
result = 12509316, time = 35 ns