Optimization:计算 [1..N] 中在前导 1 以下恰有 K 个零位的整数个数(在没有硬件 POPCNT 的情况下对连续范围做 popcount)
我有以下任务:计算 1 和 N 之间有多少个数字恰好有 K 个非前导零位。(例如,7₁₀ = 111₂ 有 0 个,4₁₀ = 100₂ 有 2 个。)N 和 K 满足条件 0 ≤ K, N ≤ 100000000。此版本使用 POPCNT,在我的机器上速度足够快:
%include "io.inc"
section .bss
n resd 1                        ; upper bound N (read at runtime)
k resd 1                        ; required count of zero bits below the leading 1
ans resd 1                      ; answer accumulator (.bss is zero-initialized)
section .text
global CMAIN
;-----------------------------------------------------------------------
; Count how many x in [1..N] have exactly K zero bits below the leading
; one, i.e. 32 - clz(x) - popcount(x) == K.  Brute force using HW POPCNT.
; Registers: ecx = x (loop counter), eax/edx = scratch.
;-----------------------------------------------------------------------
CMAIN:
GET_DEC 4,n
GET_DEC 4,k
mov ecx,1                       ; x = 1
cmp ecx,dword[n]
ja .print                       ; handle N == 0: loop body must not run
.loop:
popcnt eax,ecx                  ; eax = popcount(x)  (was: mov eax,ecx / popcnt eax,eax)
mov edx,32
sub edx,eax                     ; edx = 32 - popcount = total zero bits
bsr eax,ecx                     ; eax = index of leading 1 = 31 - clz (x != 0 here)
xor eax,0x1f                    ; eax = clz(x)
sub edx,eax                     ; edx = 32 - popcount - clz = zeros below leading 1
cmp edx,[k]                     ; exactly K non-leading zero bits?
jnz .notk
inc dword [ans]                 ; yes: count it (was a 3-insn load/add/store)
.notk:
inc ecx
cmp ecx,dword[n]
jna .loop                       ; while (x <= N), unsigned compare
.print:
PRINT_DEC 4,ans
xor eax, eax
ret
就速度而言(约 0.8 秒)这应该是可以接受的,但它并没有被接受——我猜是因为评测服务器上使用的 CPU 太旧不支持 POPCNT,所以报告发生了运行时错误(非法指令)。
我尝试对64K*4字节的查找表使用预计算技巧,但速度不够快:
%include "io.inc"
section .bss
n resd 1
k resd 1
ans resd 1
wordbits resd 65536             ; wordbits[i] = popcount(i) for 0 <= i < 65536
section .text
global CMAIN
;-----------------------------------------------------------------------
; Same count as the POPCNT version, but popcount comes from a 64K-entry
; lookup table so it also runs on CPUs without hardware POPCNT.
; Registers in the main loop: ecx = x, edi = N, esi = K, ebp = answer.
;-----------------------------------------------------------------------
CMAIN:
mov ebp, esp                    ; for correct debugging (EBP is reused as a counter below)
xor ecx,ecx                     ; ecx = LUT index, 0..65535
.precount:
mov eax,ecx
xor ebx,ebx
; Kernighan bit count:  for (c = 0; v; c++) v &= v - 1;
.lloop:
mov edx,eax
dec edx
and eax,edx                     ; clear lowest set bit
inc ebx
test eax,eax
jnz .lloop
mov dword[wordbits+4*ecx],ebx
inc ecx
cmp ecx,65536
jb .precount                    ; BUGFIX: was 'jna', which stored to wordbits[65536] out of bounds
; the loop above yields 1 for input 0 (increments before testing); fix up
mov dword[wordbits],0
GET_DEC 4,edi                   ; edi = N
GET_DEC 4,esi                   ; esi = K
mov ecx,1                       ; x = 1
xor ebp,ebp                     ; ebp = answer
cmp ecx,edi
ja .print                       ; handle N == 0
.loop:
mov eax, ecx
mov edx, ecx
and eax,0xFFFF
shr edx,16
mov eax,dword[wordbits+4*eax]
add eax,dword[wordbits+4*edx]   ; eax = wordbits[x & 0xFFFF] + wordbits[x >> 16] = popcount(x)
mov edx, 32
sub edx, eax                    ; edx = 32 - popcount
bsr eax, ecx                    ; eax = 31 - clz (x != 0 here)
xor eax, 0x1f                   ; eax = clz(x)
sub edx, eax                    ; edx = zeros below the leading 1
cmp edx, esi                    ; == K ?
jnz .notk
inc ebp                         ; count it
.notk:
inc ecx
cmp ecx, edi
jna .loop
.print:
PRINT_DEC 4, ebp
xor eax, eax
ret
(>1秒)
我应该加快第二个程序的速度吗(如果是,那么怎么做?)?还是以某种方式用其他(哪些?)指令替换 POPCNT(我想 SSE2 和更早的指令应该是可用的)?
首先,一个太旧而没有 POPCNT 的服务器在其他方面也会明显变慢,并且有不同的瓶颈。考虑到它有 pshufb 但没有 popcnt,它应该是第一代或第二代 Core 2(Conroe 或 Penryn)。请参阅 Agner Fog 的微架构指南(microarch PDF)。它的时钟频率也更低,因此你在该 CPU 上能做到的最好结果可能也只是勉强让蛮力法通过。
可能有一些算法改进可以节省大量时间,比如注意每4个增量通过00、01、10、11模式循环低2位:每4个增量2个零发生一次,1个零发生两次,没有零发生一次。对于每一个>=4的数字,这2位低于前导位,因此是计数的一部分。将其推广到1和log2(N)之间的每个MSB位置的组合数学公式中,可能会大大减少工作量。处理2^M和N之间的数字不太明显
此处的版本:
- 已清理popcnt版本,i7-6700k@3.9GHz上536ms,迭代期间无算法优化。对于k=8,N=100000000
- 朴素的LUT版本(每次迭代加载2次,无迭代间优化):良好运行时约595毫秒,对于k=8,N=100000000,更常见的情况是约610毫秒。Core2Duo(Conroe)@2.4GHz:1.69秒。(在编辑历史记录中有几个更糟糕的版本,第一个版本在核心2上有部分寄存器暂停。)
- (未完成,未编写清理代码)优化LUT版本(展开,高半/MSB BSR工作提升,每次迭代只留下1次查找(cmp/jcc),Skylake上210毫秒,Core 2上0.58秒@2.4GHz。时间应该是现实的;我们完成了所有的工作,只是错过了MSB处于低位16的最后2^16次迭代。在外循环中处理任何必要的拐角情况以及清理,对速度的影响不应超过1%
- (更未完成):使用 pcmpeqb / psubb / psadbw 对优化的 LUT 版本进行矢量化(如下所示)——内循环变为:在固定大小的数组中统计与外循环算出的目标值相匹配的字节元素个数(逻辑与标量版本相同)。Skylake 约 18 毫秒,Core 2 约 0.036 秒。这些时间现在可能包含相当大的启动开销,但正如预期/希望的那样,两者都快了约 16 倍。
- 对 wordbits 表做一次直方图统计(可以在生成表时顺便完成)。与其在 64KiB 中搜索匹配的字节,不如直接查出每次外循环迭代的答案!这会让大 N 时的速度再快几千倍(不过你仍需处理 1..64K 的低端部分,以及 N 不是 64K 倍数时的残余范围)。
计时可以用 rdtsc,但能在整个程序上方便地使用 perf stat 也很好,所以我会继续这样做(去掉 printf 并生成静态可执行文件,可进一步减少启动开销)。
你似乎在问关于微观优化蛮力方法的问题,该方法仍然单独检查每个数字。(不过,对于如何实现
32-clz-popcnt==k
有一些重要的优化。)
还有其他通常更快的popcnt方法,例如像中的Bithack。但是,当您在一个紧密循环中有很多popcounting要做时(足以使查找表在缓存中保持热状态),LUT可能是好的
如果你有快速的 SSSE3 pshufb,值得用它在 XMM 寄存器中对四个 DWORD 并行做 SIMD popcount(自动矢量化的循环),或在带 AVX2 的 YMM 寄存器中做得更好。(第一代 Core 2 有 pshufb,但直到第二代 Core 2 它才是单 uop 指令。可能仍然值得一试。)
或者更好的方法是,使用SIMD来计算与我们所寻找的匹配的LUT元素,即给定的高半个数字
强力检查连续的数字范围为LUT策略打开了一个主要的优化:数字的上n位每2^n增量只更改一次。因此,您可以将这些位的计数从最内部的循环中提升出来。这也使得使用较小的表(适合L1d缓存)变得值得 说到这里,您的64k*4表是256KiB,即二级缓存的大小。这意味着它可能必须在每次循环通过它时从L3进入。您的桌面CPU应该有足够的L3带宽(由于增量,访问模式是连续的),现代服务器有更大的L2,但几乎没有理由不使用字节LUT(popcnt(-1)只有32),而
movzx
字节加载与mov
dword加载一样便宜
; General LUT lookup with two 16-bit halves.
; Uses a byte LUT (64KiB instead of 256KiB for a dword LUT);
; movzx byte loads cost the same as mov dword loads.
movzx edx, cx ; low 16 bits
mov eax, ecx
shr eax, 16 ; high 16 bits
movzx edx, byte [wordbits + edx] ; edx = popcount(low half), zero-extended
add dl, [wordbits + eax] ; dl += popcount(high half); byte-sized add
; no partial-reg stall for reading EDX after this, on Intel Sandybridge and later
; on Core 2, set up so you can cmp al,dl later to avoid it
(上述写法在英特尔 CPU 上避免了部分寄存器停顿。)完整版本如下:
;-----------------------------------------------------------------------
; Cleaned-up LUT version (no POPCNT): one byte-LUT load per 16-bit half.
; The condition is rearranged to 31-clz + (1-k) == popcount, so K folds
; into the loop-invariant constant ESI.  Counts ECX down from N to 1;
; the answer accumulates in EBX.  Prints via cdecl printf.
;-----------------------------------------------------------------------
%use SMARTALIGN
alignmode p6, 64
section .bss
wordbits: resb 65536 ; byte LUT: wordbits[i] = popcount(i) for i < 65536
; n resd 1
; k resd 1
ans resd 1
section .rodata
n: dd 1000000000
k: dd 8
print_fmt: db `ans: %d\n`, 0
section .text
global main
main: ; no popcnt version
push ebp
push edi ; save some call-preserved registers
push esi
push ebx
mov edi, wordbits
%define wordbits edi ; dirty hack, use indexed addressing modes instead of reg+disp32.
; Avoids Skylake JCC erratum problems, and is slightly better on Core2 with good instruction scheduling
;fill in wordbits, ecx is wordbits array index
mov ecx, 1 ; leave wordbits[0] = 0
.init_loop:
mov eax,ecx
xor ebx,ebx
.popc_loop:
lea edx, [eax-1]
inc ebx
and eax,edx ; v &= v - 1; // blsr
jnz .popc_loop
;computed bits set
mov [wordbits + ecx], bl
inc ecx
cmp ecx,65536
jb .init_loop ; bugfix: array out of bounds with jna: stores to wordbits[65536]
; GET_DEC 4,n
; GET_DEC 4,k
mov ecx, [n] ; ecx counts from n down to 1
; mov esi, [k]
xor ebx, ebx ; ebx = ans
mov esi, 1
sub esi, [k] ; esi = 1-k, loop-invariant
align 32
.loop:
;popcnt eax, ecx
movzx eax, cx ; low 16 bits of x
mov ebp, ecx ; using an extra register (EBP) to schedule instructions better(?) for Core2 decode
movzx edx, byte [wordbits + eax]
shr ebp, 16 ; high 16 bits of x
; xor eax, eax ; break false dependency, or just let OoO exec hide it after breaking once per iter
bsr eax, ecx ; eax = 31-lzcnt for non-zero ecx
; sub edx, esi ; sub now avoids partial-reg stuff. Could have just used EBX to allow BL.
add eax, esi ; Add to BSR result seems slightly better on Core2 than sub from popcnt
add dl, [wordbits + ebp] ; we don't read EDX, no partial-register stall even on P6-family
;; want: k == 32-__builtin_clz(x)-_mm_popcnt_u32(x)
cmp al, dl ; 31-clz+(1-k) == popcount. or 31-clz == popcnt - (1-k)
je .yesk ; not-taken is the more common fast path
.done_inc:
dec ecx
jnz .loop ; }while(--n >= 0U)
.print_and_exit:
;print ans
; PRINT_DEC 4,ans
push ebx ; cdecl: printf(print_fmt, ans), args pushed right-to-left
push print_fmt
extern printf
call printf
add esp, 8
pop ebx
pop esi
pop edi
pop ebp
xor eax, eax
ret
align 8
.yesk:
inc ebx ; ans++
; jmp .done_inc ; tail duplication is a *tiny* bit faster
dec ecx
jnz .loop
jmp .print_and_exit
# Results from version 1, not the Core2-friendly version.
# Version 3 sometimes runs this fast, but more often ~610ms
# Event counts are near identical for both, except cycles, but uops_issue and executed are mysteriously lower, like 9,090,858,203 executed.
$ nasm -felf32 foo.asm -l/dev/stdout &&
gcc -m32 -no-pie -fno-pie -fno-plt foo.o
$ taskset -c 3 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,branches,branch-misses,instructions,uops_issued.any,uops_executed.thread -r 2 ./a.out
ans: 12509316
ans: 12509316
Performance counter stats for './a.out' (2 runs):
597.78 msec task-clock # 0.999 CPUs utilized ( +- 0.12% )
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
62 page-faults # 0.103 K/sec ( +- 0.81% )
2,328,038,096 cycles # 3.894 GHz ( +- 0.12% )
2,000,637,322 branches # 3346.789 M/sec ( +- 0.00% )
1,719,724 branch-misses # 0.09% of all branches ( +- 0.02% )
11,015,217,584 instructions # 4.73 insn per cycle ( +- 0.00% )
9,148,164,159 uops_issued.any # 15303.609 M/sec ( +- 0.00% )
9,102,818,982 uops_executed.thread # 15227.753 M/sec ( +- 0.00% )
(from a separate run):
9,204,430,548 idq.dsb_uops # 15513.249 M/sec ( +- 0.00% )
1,008,922 idq.mite_uops # 1.700 M/sec ( +- 20.51% )
0.598156 +- 0.000760 seconds time elapsed ( +- 0.13% )
; POPCNT-version inner loop (init/print framing shown in the LUT listing).
; Condition rearranged as (31-clz) - (popcnt-1) == k, which equals
; 32 - clz - popcnt == k.  ECX = x counting down, EBX = answer, ESI = k.
align 32
.loop:
mov eax, ecx
popcnt eax,eax
lea edx, [dword eax - 32 + 31] ; edx = popcnt - 1.  (popcnt - 32 = -(bits not set))
; dword displacement pads the cmp/jnz location to avoid the JCC erratum penalty on Intel
; xor eax, eax ; break false dependency, or just let OoO exec hide it after breaking once per iter
bsr eax, ecx ; eax = 31-lzcnt
; xor eax, 0x1f ; eax = lzcnt (for non-zero x)
; want: 32-__builtin_clz(x)-_mm_popcnt_u32(x) = (31-clz) + 1-popcnt = (31-clz) - (popcnt-1)
sub eax, edx
cmp eax, esi ;is there k non-leading bits in ecx?
%if 0
jnz .notk
inc ebx ;if so, then increment ans
.notk:
%else
jz .yesk ; not-taken is the more common fast path
.done_inc:
%endif
dec ecx
jnz .loop ; }while(--n >= 0U)
;print ans
; PRINT_DEC 4,ans
push ebx
push print_fmt
extern printf
call printf
add esp, 8
pop ebx
pop esi
xor eax, eax
ret
.yesk:
inc ebx
jmp .done_inc ;; TODO: tail duplication
;-----------------------------------------------------------------------
; Optimized LUT version: the BSR / high-half popcount work is hoisted
; out of the inner loop.  The inner loop only compares a precomputed
; byte (BL = required popcount of the low half) against 4 LUT entries
; per iteration, walking the low 16 bits downward.
; NOTE(review): intentionally incomplete per the TODOs below — the low
; 2^16 iterations (MSB in the low 16 bits) and small-N corner cases are
; not handled, so the printed answer is slightly low.
;-----------------------------------------------------------------------
%use SMARTALIGN
alignmode p6, 64
section .bss
align 4096
wordbits: resb 65536
; n resd 1
; k resd 1
; ans resd 1
section .rodata
;n: dd 0x40000000 ; low half zero, maybe useful to test correctness for a version that doesn't handle that.
n: dd 1000000000 ; = 0x3b9aca00
k: dd 8
print_fmt: db `ans: %d\n`, 0
section .text
global main
align 16
main:
main_1lookup:
push ebp
push edi ; save some call-preserved registers
push esi
push ebx
mov edi, wordbits
;%define wordbits edi ; dirty hack, use indexed addressing modes instead of reg+disp32.
; actually slightly worse on Skylake: causes un-lamination of cmp bl, [reg+reg],
; although the front-end isn't much of a bottleneck anymore
; also seems pretty much neutral to use disp32+reg on Core 2, maybe reg-read stalls or just not a front-end bottleneck
;fill in wordbits, ecx is wordbits array index
mov ecx, 1 ; leave wordbits[0] = 0
.init_loop:
mov eax,ecx
xor ebx,ebx
.popc_loop:
lea edx, [eax-1]
inc ebx
and eax,edx ; v &= v - 1; // blsr
jnz .popc_loop
;computed bits set
mov [wordbits + ecx], bl
inc ecx
cmp ecx,65536
jb .init_loop
; GET_DEC 4,n
; GET_DEC 4,k
mov ecx, [n] ; ecx counts from n down to 1
; mov esi, [k]
xor esi, esi ; ans
mov ebp, 1
sub ebp, [k] ; 1-k
align 32
.outer:
mov eax, ecx ; using an extra register (EBP) to schedule instructions better(?) for Core2 decode
shr eax, 16 ; eax = high half, constant for the whole inner loop
; xor eax, eax ; break false dependency, or just let OoO exec hide it after breaking once per iter
bsr ebx, ecx ; ebx = 31-lzcnt for non-zero ecx
;; want: k == 32-__builtin_clz(x)-_mm_popcnt_u32(x)
; 31-clz+(1-k) == popcount. or 31-clz == popcnt - (1-k)
; 31-clz+(1-k) - popcount(hi(x)) == popcount(lo(x))
add ebx, ebp
sub bl, byte [wordbits + eax] ; bl = required popcount of the low half
;movzx edx, cx
lea edx, [ecx - 4] ; TODO: handle cx < 4 making this wrap
movzx edx, dx
and ecx, -65536 ; clear low 16 bits, which we're processing with the inner loop.
align 16
.low16:
cmp bl, [wordbits + edx + 0]
je .yesk0
.done_inc0:
cmp bl, [wordbits + edx + 1]
je .yesk1
.done_inc1:
cmp bl, [wordbits + edx + 2]
je .yesk2
.done_inc2:
cmp bl, [wordbits + edx + 3]
je .yesk3
.done_inc3:
; TODO: vectorize with pcmpeqb / psubb / psadbw!!
; perhaps over fewer low bits to only use 16kiB of L1d cache
sub edx, 4
jae .low16 ; }while(lowhalf-=4 doesn't wrap)
sub ecx, 65536
ja .outer
; TODO: handle ECX < 65536 initially or after handling leading bits. Probably with BSR in the inner loop
.print_and_exit:
;print ans
; PRINT_DEC 4,ans
push esi
push print_fmt
extern printf
call printf
add esp, 8
pop ebx
pop esi
pop edi
pop ebp
xor eax, eax
ret
align 16
; Out-of-line increment stubs, one per unrolled compare, generated by %rep.
%assign i 0
%rep 4
;align 4
.yesk%+i:
inc esi
jmp .done_inc%+i
%assign i i+1
%endrep
; could use a similar %rep block for the inner loop
; attempt tail duplication?
; TODO: skip the next cmp/jcc when jumping back.
; Two in a row will never both be equal
; dec ecx
; jnz .loop
; jmp .print_and_exit
(update after outer-loop over-count on first iter bugfix, ans: 12497876)
ans: 12498239 # This is too low by a bit vs. 12509316
# looks reasonable given skipping cleanup
209.46 msec task-clock # 0.992 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
62 page-faults # 0.296 K/sec
813,311,333 cycles # 3.883 GHz
1,263,086,089 branches # 6030.123 M/sec
824,103 branch-misses # 0.07% of all branches
2,527,743,287 instructions # 3.11 insn per cycle
1,300,567,770 uops_issued.any # 6209.065 M/sec
2,299,321,355 uops_executed.thread # 10977.234 M/sec
(from another run)
37,150,918 idq.dsb_uops # 174.330 M/sec
1,266,487,977 idq.mite_uops # 5942.976 M/sec
0.211235157 seconds time elapsed
0.209838000 seconds user
0.000000000 seconds sys
;;;;; Just the loop from main_SSE2, same init stuff and print as main_1lookup
; SIMD inner loop: instead of scalar compares, broadcast the required
; low-half popcount byte into XMM7 and count matching bytes in a 16KiB
; slice of the LUT with pcmpeqb (match = 0xFF = -1) accumulated by psubb,
; then horizontally summed with psadbw per outer iteration.
align 32
.outer:
mov eax, ecx ; using an extra register (EBP) to schedule instructions better(?) for Core2 decode
shr eax, 16-2 ; high 18 bits (inner loop now covers only the low 14 bits)
; xor eax, eax ; break false dependency, or just let OoO exec hide it after breaking once per iter
bsr ebx, ecx ; ebx = 31-lzcnt for non-zero ecx
;; want: k == 32-__builtin_clz(x)-_mm_popcnt_u32(x)
; 31-clz+(1-k) == popcount. or 31-clz == popcnt - (1-k)
; 31-clz+(1-k) - popcount(hi(x)) == popcount(lo(x))
add ebx, ebp
movzx edx, al
; movzx edx, byte [wordbits + edx]
sub bl, byte [wordbits + edx]
shr eax, 8 ; high part is more than 16 bits if low is 14, needs to be broken up
sub bl, byte [wordbits + eax]
; movzx eax, byte [wordbits + eax]
; add eax, edx
; sub ebx, eax
movzx eax, bl ; required popcount of the low 14 bits
movd xmm7, eax
pxor xmm0, xmm0
pxor xmm1, xmm1 ; 2 accumulators
pshufb xmm7, xmm0 ; broadcast byte to search for.
;; Actually SSSE3, but it only takes a few more insns to broadcast a byte with just SSE2.
;; e.g. imul eax, 0x01010101 / movd / pshufd
;movzx edx, cx
; lea edx, [ecx - 4] ; TODO: handle cx < 4 making this wrap
; movzx edx, dx
and ecx, -16384 ; clear low bits, which we're processing with the inner loop.
mov edx, wordbits ; quick and dirty, just loop forward over the array
;; FIXME: handle non-zero CX on first outer loop iteration, maybe loop backwards so we can go downwards toward 0,
;; or calculate an end-pointer if we can use that without register-read stalls on Core 2.
;; Also need to handle the leftover part not being a multiple of 32 in size
;; So maybe just make a more-flexible copy of this loop and peel the first outer iteration (containing that inner loop)
;; if the cleanup for that slows down the common case of doing exactly 16K
align 16
.low14:
movdqa xmm2, [edx]
movdqa xmm3, [edx + 16]
ds pcmpeqb xmm2, xmm7 ; extra prefixes for padding for Skylake JCC erratum: 18ms vs. 25ms
ds psubb xmm0, xmm2 ; accumulator -= (-1) per matching byte
ds add edx, 32
cs pcmpeqb xmm3, xmm7
cs psubb xmm1, xmm3
; hits are rare enough to not wrap counters?
; TODO: may need an inner loop to accumulate after 256 steps if every other 32nd element is a match overflowing some SIMD element
cmp edx, wordbits + 16384
jb .low14
pxor xmm7, xmm7
psadbw xmm0, xmm7
psadbw xmm1, xmm7 ; byte -> qword horizontal sum
paddd xmm0, xmm1 ; reduce to 1 vector
movhlps xmm1, xmm0
paddd xmm0, xmm1 ; hsum the low/high counts
movd eax, xmm0
add esi, eax ; sum in scalar (could sink this out)
sub ecx, 16384
ja .outer
; TODO: handle ECX < 65536 initially or after handling leading bits. Probably with BSR in the inner loop
n! / (k! * (n - k)!)
#include <stdio.h>
#include <chrono>
template<typename T>
struct Coefficients {
static constexpr unsigned size_v = sizeof(T) * 8;
// Zero-initialize.
// Indexed by [number_of_zeros][number_of_bits]: value[k][n] = C(n, k).
T value[size_v][size_v] = {};
// Build the table of binomial coefficients C(n, k) for n, k < size_v.
//
// Uses Pascal's rule  C(n, k) = C(n-1, k-1) + C(n-1, k)  instead of the
// multiplicative formula r = r * (n - k + 1) / k: that form's intermediate
// product overflows T for upper-middle entries even when C(n, k) itself
// fits (e.g. C(31,15) * 16 > 2^32 for T = unsigned, corrupting
// value[16][31] and beyond).  Addition only overflows when the final
// coefficient itself doesn't fit in T.
constexpr Coefficients() {
value[0][0] = 1;
for(unsigned i = 1; i < size_v; ++i) {
value[0][i] = 1; // C(i, 0) = 1
for(unsigned j = 1; j < i; ++j)
value[j][i] = value[j - 1][i - 1] + value[j][i - 1];
value[i][i] = 1; // C(i, i) = 1
}
}
};
template<typename T>
__attribute__((noinline)) // To make it easier to benchmark
// Closed-form count of x in [1..max_value] whose number of zero bits
// below the leading 1 equals zero_bits, using the precomputed binomial
// table (value[k][n] = C(n, k)).
// NOTE(review): __builtin_clz takes unsigned int; for T wider than 32
// bits this would need __builtin_clzll — confirm before reusing.
T count_combinations(T max_value, T zero_bits) {
if( max_value == 0 )
return 0;
constexpr int size = sizeof(T) * 8;
constexpr Coefficients<T> coefs;
// assert(zeros_bits < size)
int bits = size - __builtin_clz(max_value); // position of leading 1, plus one
T total = 0;
// Count all values shorter than max_value: a number whose leading 1 is
// at bit i has i free lower bits, so C(i, zero_bits) of them qualify.
#pragma clang loop vectorize(disable)
for(int i = 0; i < bits - 1; ++i) {
total += coefs.value[zero_bits][i];
}
// Count interval [2**bits, max_value]
bits -= 1;
T mask = T(1) << bits;
max_value &= ~mask; // Remove leading bit
mask = mask >> 1;
// Walk the remaining bits of max_value from high to low.
#pragma clang loop vectorize(disable)
while( zero_bits && zero_bits < bits ) {
if( max_value & mask ) {
// If current bit is one, then we can pretend that it is zero
// (which would only make the value smaller, which means that
// it would still be < max_value) and grab all combinations of
// zeros within the remaining bits.
total += coefs.value[zero_bits - 1][bits - 1];
// And then stop pretending it's zero and continue as normal.
} else {
// If current bit is zero, we can't do anything about it, just
// have to spend a zero from our budget.
zero_bits--;
}
max_value &= ~mask;
mask = mask >> 1;
bits--;
}
// At this point we don't have any more zero bits, or we don't
// have any more bits at all.  Include max_value itself if it matches:
// either its remaining suffix is all zeros of the exact budget, or the
// budget is spent and the suffix is all ones.
if( (zero_bits == bits) ||
(zero_bits == 0 && max_value == ((mask << 1) - 1)) ) {
total++;
}
return total;
}
int main() {
using namespace std::chrono;
// OR-ing the results together stops the compiler from discarding the
// calls while keeping the final value equal to a single call's result.
unsigned accumulated = 0;
const auto started = high_resolution_clock::now();
// 1000 repetitions so the per-call cost is measurable.
for(int iteration = 0; iteration < 1000; ++iteration)
accumulated |= count_combinations<unsigned>(1'000'000'000, 8);
const auto finished = high_resolution_clock::now();
const auto total_ns = duration_cast<nanoseconds>(finished - started).count();
printf("result = %u, time = %lld ns\n", accumulated, total_ns / 1000);
return 0;
}
result = 12509316, time = 35 ns