为什么C++；测试Collatz猜想的代码比手写程序集运行得更快？我为汇编和C++编写了这两种解决方案。它们实现了相同的暴力方法来测试。组装解决方案由以下部件组装而成： nasm-felf64 p14.asm和gcc p14.o-o p14 C++编写： g++p14.cpp-op14_C++_Performance_Assembly_Optimization_X86

为什么用 C++ 编写的 Collatz 猜想测试代码比手写汇编运行得更快？我分别用汇编和 C++ 编写了这两种解决方案，它们实现了相同的暴力测试方法。汇编版本通过以下命令汇编并链接：`nasm -felf64 p14.asm && gcc p14.o -o p14`；C++ 版本通过以下命令编译：`g++ p14.cpp -o p14`

c++ performance assembly optimization x86

为什么C++；测试Collatz猜想的代码比手写程序集运行得更快？我为汇编和C++编写了这两种解决方案。它们实现了相同的暴力方法来测试。组装解决方案由以下部件组装而成： nasm-felf64 p14.asm和gcc p14.o-o p14 C++编写： g++p14.cpp-op14,c++,performance,assembly,optimization,x86,C++,Performance,Assembly,Optimization,X86,组件，p14.asm： section.data fmt数据库“%d”，10，0 全球主要外部打印第节.案文主要内容： mov rcx，1000000 异或rdi，rdi；马克斯一世 xor rsi，rsi；我 l1: 12月rcx 异或r10，r10；计数 mov-rax，rcx l2：测试rax，1 均价 mov rbx，3 mul rbx 拉克斯公司 jmp-c1 甚至： mov rbx，2 异或rdx，rdx 分区rbx c1: 公司r10 cmp-rax，1 jne l2 cm

组件，

p14.asm

：

section .data
    fmt db "%d", 10, 0          ; printf format: decimal integer followed by a newline

global main
extern printf

section .text

;-----------------------------------------------------------------------
; int main(void)
; Brute-force search over start values 999999 down to 2 for the one with
; the longest Collatz chain; prints that start value with printf.
; ABI:   SysV AMD64 (nasm -felf64, linked through gcc)
; Regs:  rcx = current start value      r10 = step count for it
;        rax = running Collatz value n  rbx = multiplier / divisor scratch
;        rdi = longest step count so far
;        rsi = start value that produced it (second printf argument)
;-----------------------------------------------------------------------
main:
    push rbx                    ; rbx is callee-saved; the push also makes rsp 16-byte aligned for the call below
    mov rcx, 1000000
    xor rdi, rdi                ; longest count so far
    xor rsi, rsi                ; start value with the longest count

l1:
    dec rcx
    xor r10, r10                ; count
    mov rax, rcx

l2:
    test rax, 1
    jpe even                    ; result is 0 or 1, so even parity <=> n is even (jz would be the conventional spelling)

    mov rbx, 3
    mul rbx                     ; rdx:rax = 3*n
    inc rax                     ; n = 3*n + 1
    jmp c1

even:
    mov rbx, 2
    xor rdx, rdx                ; zero the high half of the rdx:rax dividend
    div rbx                     ; n = n / 2

c1:
    inc r10
    cmp rax, 1
    jne l2

    cmp rdi, r10
    cmovl rdi, r10              ; new longest count
    cmovl rsi, rcx              ; ... and the start value that produced it

    cmp rcx, 2
    jne l1

    lea rdi, [rel fmt]          ; first printf argument; RIP-relative so the object also links as PIE
    xor rax, rax                ; al = 0: no vector registers used by this variadic call
    call printf
    pop rbx
    ret

C++，

p14.cpp

：

#include <iostream>
// Returns the number of terms in the Collatz sequence that starts at n and
// ends at 1 (so sequence(1) == 1). n must be positive.
int sequence(long n) {
    int count = 1;
    while (n != 1) {
        if (n % 2 == 0)
            n /= 2;
        else
            n = 3*n + 1;
        ++count;
    }
    return count;
}
int main() {
    int max = 0, maxi;
    for (int i = 999999; i > 0; --i) {
        int s = sequence(i);
        if (s > max) {
            max = s;
            maxi = i;
        }
    }
    std::cout << maxi << std::endl;
}

C++ 程序在从源代码生成机器码的过程中会先被翻译成汇编程序，因此笼统地说“汇编比 C++ 慢”是错误的。此外，生成的二进制代码因编译器而异，所以一个聪明的 C++ 编译器生成的二进制代码，可能比质量不佳的手写汇编代码更高效。
但是，我认为您的评测方法存在某些缺陷。以下是评测的一般指导原则：
确保系统处于正常/空闲状态。停止所有已启动或大量使用CPU（或通过网络轮询）的正在运行的进程（应用程序）
您的数据大小必须更大
您的测试必须运行5-10秒以上
不要只依赖一个样本。进行N次测试。收集结果并计算结果的平均值或中位数
如果您认为64位DIV指令是一种很好的除以2的方法，那么编译器的asm输出就不足为奇了，即使使用-O0
（快速编译，无需额外优化，并在每个C语句之后/之前将其存储/重新加载到内存中，以便调试器可以修改变量）
请参阅 Agner Fog 的汇编优化指南，了解如何编写高效的汇编代码。他还提供了指令表和微架构（microarchitecture）指南，介绍特定 CPU 的具体细节。更多性能相关链接，请参阅 x86 标签 wiki
另请参阅这个关于用手写汇编击败编译器的更一般性问题。TL;DR：如果你写得不对（就像这个问题中那样），那么手写汇编确实会更慢
通常你可以放心地让编译器去完成它的工作，**尤其是当你尽量编写能够被高效编译的 C++ 代码时**。另请参阅相关问题。其中一个答案链接的幻灯片展示了各种 C 编译器如何用巧妙的技巧优化一些非常简单的函数。**Matt Godbolt 在 CppCon 2017 上的演讲与此类似。**

在英特尔Haswell上，div r64为36 uops，延迟为32-96个周期，吞吐量为每21-74个周期一个。（加上设置RBX和零RDX的2 uops，但无序执行可以提前运行）。在这种情况下，延迟是最相关的因素，因为它是循环携带依赖链的一部分
shr rax, 1
执行相同的无符号除法：1 uop，1c延迟，每个时钟周期可以运行2次
相比之下，32 位除法速度更快，但与移位相比仍然很糟糕。在 Haswell 上，idiv r32 为 9 uops，延迟为 22-29 个周期，吞吐量为每 8-11 个周期一条

从 gcc -O0 的 asm 输出中可以看出，它只使用移位指令。clang -O0 则确实像你所想的那样天真地编译，甚至使用了两次 64 位 IDIV。（优化时，如果源代码用相同的操作数同时做除法和取模运算，编译器会同时利用 IDIV 的两个输出）
GCC 没有完全“幼稚”的编译模式：它始终会识别除以常量的除法，并使用移位（除数为 2 的幂时）或定点乘法逆元（除数不是 2 的幂时）来避免 IDIV（请参见上述 Godbolt 链接中的 div_by_13）
gcc-Os（针对大小进行优化）确实将IDIV用于非2次幂次除法，
不幸的是，即使在乘法逆码只是稍微大一点，但要快得多的情况下

帮助编译器
（本例总结：使用 uint64_t n）
首先，只需看看优化的编译器输出。（-O3
）。
查看 asm 输出（在 Godbolt 上查看，或参见相关问答）。当编译器一开始没有生成最优代码时，以能够引导编译器生成更好代码的方式来编写 C/C++ 源代码通常是最好的方法。你必须懂汇编，知道什么样的代码是高效的，但可以间接地运用这些知识。编译器也是很好的灵感来源：有时 clang 会做一些很酷的事情，而你可以手把手引导 gcc 做同样的事情（参见下面我对 @Veedrac 代码中未展开的循环所做的处理）。
这种方法是可移植的，在20年后，未来的编译器可以将其编译成任何在未来硬件（x86或非x86）上有效的东西，可能使用新的ISA扩展或自动矢量化。15年前手工编写的x86-64 asm通常不会针对Skylake进行优化调整。例如，当时不存在比较和分支宏融合。对于一个微体系结构而言，手工制作的asm现在的优化可能不是最佳的
; Even branch quoted from the question: halves n with a full 64-bit DIV.
even:
    mov rbx, 2          ; divisor
    xor rdx, rdx        ; zero the high half of the rdx:rax dividend
    div rbx             ; rax = n / 2; a one-bit right shift (shr rax, 1) gives the same unsigned quotient

 # from gcc5.4 -O3  plus my comments

 # edx= count=1
 # rax= uint64_t n

# Inner Collatz loop as emitted by gcc: both candidates (3*n+1 and n>>1) are
# computed every iteration and one is selected with cmove, so the loop body
# contains no branch on the parity of n.
.L9:                   # do{
    lea    rcx, [rax+1+rax*2]   # rcx = 3*n + 1
    mov    rdi, rax    # copy n so the halved candidate can be built separately
    shr    rdi         # rdi = n>>1;
    test   al, 1       # set flags based on n%2 (aka n&1)
    mov    rax, rcx    # tentatively n = 3*n+1 (mov leaves the flags from test untouched)
    cmove  rax, rdi    # n= (n%2) ? 3*n+1 : n/2;
    add    edx, 1      # ++count;
    cmp    rax, 1
    jne   .L9          #}while(n!=1)

  # cmp/branch to update max and maxi, and then do the next n

 ### Hand-optimized version of what gcc does
# Same loop with the parity test folded into the shift: shr leaves the bit it
# shifted out (n&1) in CF, and lea does not write flags, so cmovc can select
# between the two candidates without a separate test instruction.
# rax = n, edx = count, rcx = scratch for 3*n+1.
.L9:                       #do{
    lea     rcx, [rax+1+rax*2] # rcx = 3*n + 1
    shr     rax, 1         # n>>=1;    CF = n&1 = n%2
    cmovc   rax, rcx       # n= (n&1) ? 3*n+1 : n/2;
    inc     edx            # ++count;
    cmp     rax, 1         # loop exit test: has the sequence reached 1?
    jne     .L9            #}while(n!=1)

# starting with YMM0 = [ n_d, n_c, n_b, n_a ]  (64-bit elements)
# ymm4 = _mm256_set1_epi64x(1):  increment vector
# ymm5 = all-zeros:  count vector

# Sketch of a 4-wide AVX2 inner loop (pseudo-assembly: the set1_epi64(1)
# operands stand for a register or memory constant holding 1 in every qword,
# so this does not assemble as written).
#   ymm0 = four 64-bit n values, ymm4 = per-element increment (1 while the
#   element has not yet reached 1, then 0), ymm5 = per-element step counts.
.inner_loop:
    vpaddq    ymm1, ymm0, ymm0
    vpaddq    ymm1, ymm1, ymm0
    vpaddq    ymm1, ymm1, set1_epi64(1)     # ymm1= 3*n + 1.  Maybe could do this more efficiently?

    vpsllq    ymm3, ymm0, 63                # shift bit 0 (n & 1) up to the sign bit

    vpsrlq    ymm0, ymm0, 1                 # n /= 2

    # FP blend between integer insns may cost extra bypass latency, but integer blends don't have 1 bit controlling a whole qword.
    vblendvpd ymm0, ymm0, ymm1, ymm3        # variable blend controlled by the sign bit of each 64-bit element: mask set (odd n) takes ymm1 = 3*n+1, otherwise keeps ymm0 = n/2.

    # ymm0 = updated n  in each element.

    vpcmpeqq ymm1, ymm0, set1_epi64(1)
    vpandn   ymm4, ymm1, ymm4         # zero out elements of ymm4 where the compare was true

    vpaddq   ymm5, ymm5, ymm4         # count++ in elements where n has never been == 1

    vptest   ymm4, ymm4
    jnz  .inner_loop
    # Fall through when all the n values have reached 1 at some point, and our increment vector is all-zero

    vextracti128 xmm0, ymm5, 1        # high 128-bit lane of the counts (destination must be an xmm register)
    # vpmaxq would be the natural next instruction for a horizontal max, but no packed 64-bit max exists in AVX2.
    # Actually just delay doing a horizontal max until the very very end.  But you need some way to record max and maxi.

goto loop_entry;  // C++ structured like the asm, for illustration only
do {
   n = n*3 + 1;
  loop_entry:
   shift = _tzcnt_u64(n);
   n >>= shift;
   count += shift;
} while(n != 1);

    .seq:
        inc     esi                 ; counter
        lea     edx, [3*eax+1]      ; edx = 3*n+1
        shr     eax, 1              ; eax = n/2
        cmovc   eax, edx            ; if CF eax = edx
        jnz     .seq                ; jmp if n<>1

include "%lib%/freshlib.inc"
@BinaryType console, compact
options.DebugMode = 1
include "%lib%/freshlib.asm"

; Brute-force Collatz search: for every start value from 999999 down to 1,
; count the iterations of the .seq loop and remember the largest count and
; the start value that produced it. Results are reported through the
; OutputValue macro (presumably provided by the included FreshLib - confirm).
; Register roles: ecx = current start value, eax = running n, edx = 3*n+1,
;                 esi = iteration count, edi = best count, ebx = best start.
; NOTE(review): n and 3*n+1 live in 32-bit registers, so any intermediate
; value above 2^32-1 would wrap - confirm this range is safe for these inputs.
start:
        InitializeAll
        mov ecx, 999999
        xor edi, edi        ; best (largest) iteration count so far
        xor ebx, ebx        ; start value that produced the best count

    .main_loop:

        xor     esi, esi
        mov     eax, ecx

    .seq:
        inc     esi                 ; counter
        lea     edx, [3*eax+1]      ; edx = 3*n+1 (lea leaves the flags alone)
        shr     eax, 1              ; eax = n/2; CF = old low bit of n, ZF = 1 only when the result is 0
        cmovc   eax, edx            ; if CF (n was odd) eax = edx
        jnz     .seq                ; ZF still comes from shr: keep looping until n>>1 was 0, i.e. n had reached 1

        cmp     edi, esi
        cmovb   edi, esi            ; unsigned compare: new best count
        cmovb   ebx, ecx            ; ... and its start value

        dec     ecx
        jnz     .main_loop

        OutputValue "Max sequence: ", edi, 10, -1
        OutputValue "Max index: ", ebx, 10, -1

        FinalizeAll
        stdcall TerminateAll, 0

(N << 1) + N + 1:     (N >> 1) + N + 1:

        b10                    b1
         b1                     b
       +  1                   + 1
       ----                   ---
       bBb0                   bBb

/*
 * Scans the odd start values below `size` in descending order (down to 3)
 * and finds the one with the longest Collatz chain.
 *
 * Each pass of the inner loop advances two Collatz steps at once:
 *   - n divisible by 4:  n -> n / 4
 *   - otherwise:         n -> n + (n >> 1) + 1
 * The walk stops once n drops to 1 or 2; landing on 2 costs one extra step.
 *
 * Stores the longest step count in *path and returns the start value that
 * produced it (0 for both when no candidate exists).
 */
uint64_t sequence(uint64_t size, uint64_t *path) {
    uint64_t best_start = 0;
    uint64_t best_len = 0;
    uint64_t start = (size - 1) | 1;

    while (start > 2) {
        uint64_t value = start;
        uint64_t len = 2;

        for (;;) {
            if (value & 3)
                value += (value >> 1) + 1;
            else
                value >>= 2;
            if (value <= 2)
                break;
            len += 2;
        }
        if (value == 2)
            len++;

        if (len > best_len) {
            best_start = start;
            best_len = len;
        }
        start -= 2;
    }

    *path = best_len;
    return best_start;
}

/* Runs the search below 1,000,000 and prints "<start value>, <step count>". */
int main() {
    uint64_t maxi, maxc;

    maxi = sequence(1000000, &maxc);
    /* uint64_t need not be unsigned long long (it is unsigned long on LP64
       targets), so cast explicitly to match the %llu conversions. */
    printf("%llu, %llu\n", (unsigned long long)maxi, (unsigned long long)maxc);
    return 0;
}

; Brute-force search for the odd start value below 1,000,000 with the longest
; Collatz chain; the result is printed with raw Linux x86-64 syscalls.
; Search-loop register roles:
;   RCX = current candidate minus 1 (even counter, stepped down by 2)
;   RDI = n, the running Collatz value      RSI = step count for this candidate
;   RAX = longest step count so far         RBX = RCX value that produced it
;   RDX, R8 = scratch
MOV RCX, 1000000;

DEC RCX;
AND RCX, -2;            RCX is now even, so the first candidate RCX + 1 is odd
XOR RAX, RAX;
MOV RBX, RAX;

@main:
  XOR RSI, RSI;
  LEA RDI, [RCX + 1];   n = next odd candidate

  @loop:
    ADD RSI, 2;         every pass performs two Collatz steps
    LEA RDX, [RDI + RDI*2 + 2];
    SHR RDX, 1;         RDX = (3*n + 2) >> 1 = n + (n >> 1) + 1
    MOV R8, RDI;        keep n so its low two bits can still be tested after the shift
    SHR RDI, 2;         RDI = n >> 2, the result when n is a multiple of 4
    TEST R8B, 3;        well-defined on every CPU (a 2-bit SHRD/ROR leaves OF undefined)
    CMOVNZ RDI, RDX;    n not a multiple of 4: take the combined step from RDX
    CMP RDI, 2;
  JA @loop;

  LEA RDX, [RSI + 1];   LEA keeps the flags from CMP RDI, 2
  CMOVE RSI, RDX;       stopped on n == 2: one more step is needed to reach 1

  CMP RAX, RSI;
  CMOVB RAX, RSI;       new longest count
  CMOVB RBX, RCX;       ... and the counter value that produced it

  SUB RCX, 2;
JA @main;

; Output: push the characters onto the stack as qwords, then write them one
; byte at a time. RCX is 0 here, because the loop above exits when it hits 0.

MOV RDI, RCX;           RDI = 0
ADD RCX, 10;            RCX = 10: decimal base and the newline character
PUSH RDI;               zero terminator for the output loop
PUSH RCX;               newline that ends the first number pushed

@itoa:
  XOR RDX, RDX;
  DIV RCX;
  ADD RDX, '0';
  PUSH RDX;             least-significant digit first, so the most-significant ends on top
  TEST RAX, RAX;
JNE @itoa;

  PUSH RCX;             newline separating the two numbers
  LEA RAX, [RBX + 1];   RBX held candidate - 1; convert the start value on the second pass
  TEST RBX, RBX;
  MOV RBX, RDI;         RBX = 0 so the second pass falls through (MOV keeps the flags from TEST)
JNE @itoa;

POP RCX;                drop the extra newline pushed after the second pass
INC RDI;                RDI = 1: stdout file descriptor
MOV RDX, RDI;           RDX = 1: write one byte per call

@outp:
  MOV RSI, RSP;         buffer = low byte of the qword on top of the stack
  MOV RAX, RDI;         RAX = 1: sys_write
  SYSCALL;
  POP RAX;
  TEST RAX, RAX;        stop after the zero terminator has been popped (it is written as a NUL byte first)
JNE @outp;

LEA RAX, [RDI + 59];    RAX = 60: sys_exit
DEC RDI;                exit status 0
SYSCALL;

nasm -f elf64 file.asm
ld -o file file.o

test rax, 1
jpe even

{
   n = (n*3+1) >> 1;
   count += 2;
}

if (n & 1)
{
    n = (n*3 + 1) >> 1;
    count += 2;
}
else
{
    n >>= 1;
    ++count;
}

while (n % 2 == 0) n /= 2;
if (n > 1) for (;;) {
    n = (3*n + 1) / 2;
    if (n % 2 == 0) {
        do n /= 2; while (n % 2 == 0);
        if (n == 1) break;
    }
}

3n+1 -> ???? 0000 0100
/ 2  -> ???? ?000 0010
/ 2  -> ???? ??00 0001
3n+1 -> ???? ??00 0100
/ 2  -> ???? ???0 0010
/ 2  -> ???? ???? 0001
3n+1 -> ???? ???? 0100
/ 2  -> ???? ???? ?010
/ 2  -> ???? ???? ??01
3n+1 -> ???? ???? ??00
/ 2  -> ???? ???? ???0
/ 2  -> ???? ???? ????

k = n / 256;
m = n % 256;

switch (m) {
    case 0: n = 1 * k + 0; break;
    case 1: n = 81 * k + 1; break; 
    case 2: n = 81 * k + 1; break; 
    ...
    case 155: n = 729 * k + 425; break;
    ...
}

static const unsigned int multipliers [256] = { ... }
static const unsigned int adders [256] = { ... }

while (n > 128) {
    size_t lastBits = n % 256;
    n = (n >> 8) * multipliers [lastBits] + adders [lastBits];
}

import sys

# Module-wide counter of loop iterations performed by collatz_sequence.
inner_loop = 0


def collatz_sequence(N, cache):
    """Walk the Collatz sequence from N until it hits 1 (or less) or a cached value.

    Returns the list of visited values, ending with the value that stopped
    the walk. For every visited value not yet in ``cache``, stores the part
    of the returned list that follows it, so later walks can stop early.
    Each loop iteration increments the module-level ``inner_loop`` counter.
    """
    global inner_loop

    visited = []
    current = N
    while True:
        inner_loop += 1
        visited.append(current)
        if current <= 1 or current in cache:
            break
        current = 3 * current + 1 if current % 2 else current // 2

    # position is the index just past value, so the slice is the remainder of the walk.
    for position, value in enumerate(visited, start=1):
        if value not in cache:
            cache[value] = visited[position:]

    return visited

def gen_sequence(l, cache):
    """Yield the full sequence described by ``l``, expanding cached tails.

    Elements of ``l`` are yielded in order. As soon as a yielded element is a
    key of ``cache``, the rest of ``l`` is dropped and iteration continues
    with ``cache[element]``, repeating until a list is exhausted.

    The generator finishes with a plain ``return``: raising StopIteration
    inside a generator is turned into RuntimeError since Python 3.7 (PEP 479).
    Following the cache in a loop rather than by recursion keeps long chains
    clear of the interpreter's recursion limit.
    """
    remaining = l
    while True:
        for elem in remaining:
            yield elem
            if elem in cache:
                remaining = cache[elem]
                break
        else:
            # Ran off the end of the current list without hitting the cache.
            return

if __name__ == "__main__":
    # One cache shared by every start value, so later walks can stop early.
    shared_cache = {}

    for start in range(1, 4711, 5):
        prefix = collatz_sequence(start, shared_cache)
        full_length = sum(1 for _ in gen_sequence(prefix, shared_cache))
        print("{}: {}".format(start, full_length))

    # Total loop iterations actually executed across all walks.
    print("inner_loop = {}".format(inner_loop))