Assembly 如何在汇编语言中找到奇数整数？_Assembly

Assembly 如何在汇编语言中找到奇数整数？

assembly

Assembly 如何在汇编语言中找到奇数整数？,assembly,Assembly,编写一个汇编语言程序，通过加倍和减半将两个整数相乘。下面是使用这种方法将A和B相乘的伪代码 Multiply A and B and store result in C: C = 0 ; Initialize to 0 while B is not equal to 0: if B is odd then C = C+A ; Add copy of A (even number left) A = 2*A ; Can be done quickly by shifting left B = B/

编写一个汇编语言程序，通过加倍和减半将两个整数相乘。下面是使用这种方法将A和B相乘的伪代码

Multiply A and B and store result in C:
C = 0 ; Initialize to 0
while B is not equal to 0:
if B is odd then C = C+A ; Add copy of A (even number left)
A = 2*A ; Can be done quickly by shifting left
B = B/2 ; Can be done quickly by shifting right

我已经完成了大部分工作，但应该如何用 shl 来测试一个整数是否为奇数？

可以使用 `shr` 指令：右移 1 位会把最低位移入进位标志 CF，CF = 1 表示该数为奇数，CF = 0 表示为偶数。

; Parity test with SHR: shifting right by one moves bit 0 into CF,
; so CF = 1 means the value was odd. Note that AL itself is halved.
; Binary literals take either the 'b' suffix or the '0b' prefix, never '0x'.
mov al, 10010010b      ; bit 0 clear
shr al, 1              ; CF = 0   even

mov al, 10010011b      ; bit 0 set
shr al, 1              ; CF = 1   odd

; Branch on the carry left by the SHR above. The two jumps below are
; alternatives: use whichever one matches the case you want to handle.
jnb __lable            ; jump if CF = 0 (even); same opcode as jnc
; or
jb  __lable            ; jump if CF = 1 (odd);  same opcode as jc

对于 x86_64，可以用一条 `shr` 指令同时完成奇数测试（看 CF）和零测试（看 ZF）。

例如：

要在中构建和测试，我使用了命令（在64b linux上）：

资料来源：

    section .text

;-----------------------------------------------------------------------
; uint64_t mulByBits(uint32_t a, uint32_t b)
; Multiplies by doubling a and halving b (shift-and-add).
; ABI:   System V AMD64
; In:    edi = a, esi = b (upper 32 bits of rdi/rsi are ignored)
; Out:   rax = a * b as a full 64-bit product
; Clobb: rcx, rdi, rsi, flags
;-----------------------------------------------------------------------
mulByBits:
    mov     edi, edi            ; 32-bit write zero-extends: rdi = (uint64_t)a
    xor     eax, eax            ; running sum starts at 0
.nextBit:
    lea     rcx, [rdi + rax]    ; candidate sum = sum + a (lea keeps flags intact)
    add     rdi, rdi            ; a <<= 1, ready for the next bit of b
    shr     esi, 1              ; CF = bit shifted out of b, ZF = (b is now 0)
    cmovc   rax, rcx            ; accept the candidate only if that bit was 1
    jnz     .nextBit            ; cmov does not write flags, so ZF is still from shr
    ret

global _start

%define SYS_EXIT 60             ; Linux x86-64 exit syscall number

;-----------------------------------------------------------------------
; _start: self-checking smoke test for mulByBits.
; After every call the expected product is loaded into rbx and compared
; with rax, so both values can still be inspected in a debugger.
; Exit status: 0 if every product matches, 1 on the first mismatch.
;-----------------------------------------------------------------------
_start:
    mov     edi, 143254         ; "normal" values test
    mov     esi, 43526
    call    mulByBits
    mov     rbx, 6235273604     ; expected result (needs a 64-bit immediate)
    cmp     rax, rbx
    jne     .fail

    xor     edi, edi            ; 0 * 0
    xor     esi, esi
    call    mulByBits
    xor     ebx, ebx
    cmp     rax, rbx
    jne     .fail

    mov     edi, 43257432       ; A * 0
    xor     esi, esi
    call    mulByBits
    xor     ebx, ebx
    cmp     rax, rbx
    jne     .fail

    xor     edi, edi            ; 0 * B
    mov     esi, 432543
    call    mulByBits
    xor     ebx, ebx
    cmp     rax, rbx
    jne     .fail

    mov     edi, 3276547234     ; A * 1, A has bit 31 set
    mov     esi, 1
    call    mulByBits
    mov     ebx, 3276547234     ; 32-bit write zero-extends into rbx
    cmp     rax, rbx
    jne     .fail

    mov     edi, 1              ; 1 * B, B has bit 31 set
    mov     esi, 3276547234
    call    mulByBits
    mov     ebx, 3276547234
    cmp     rax, rbx
    jne     .fail

    mov     edi, ~0             ; UINT_MAX * UINT_MAX
    mov     esi, ~0
    call    mulByBits
    mov     rbx, 0xFFFFFFFE00000001
    cmp     rax, rbx
    jne     .fail

    mov     rdi, 0xE00000004    ; garbage in upper 32 bits of inputs
    mov     rsi, 0xE00000004    ; should be multiplied as 4*4
    call    mulByBits
    mov     ebx, 0x10
    cmp     rax, rbx
    jne     .fail

    xor     edi, edi            ; status 0: every test passed
.exit:
    ; exit back to linux, status already in edi
    mov     eax, SYS_EXIT
    syscall

.fail:
    mov     edi, 1              ; status 1: rax (actual) != rbx (expected)
    jmp     .exit

在大多数 CPU 上，加法是左移 1 位最高效的方式：`add same, same`（例如 `add rdi, rdi`）可以在比 `shl rdi, 1` 更多的执行端口上运行，允许更多的指令级并行，从而可能提高循环的吞吐量。

请把您的问题排版得更易读。“我已经做了很多工作”——那您应该把它展示给我们。“我该如何使用 shl”——我不确定您是否应该用它，但 `shr` 可以很好地工作。另外请标明您使用的架构。

通常，当 CF 不是来自比较结果时，应使用 `jc` 和 `jnc` 助记符（它们是 jb/jnb 的别名），这样更容易记忆。

`add rdi, rdi` 可以在比 `shl rdi, 1` 更多的端口上运行，并给出相同的整数结果和基本相同的标志结果。（`adc same, same` 与 `rcl` 移 1 位的关系也类似。）不幸的是，现代 Intel CPU 不会把“左移 1 位”的短格式操作码解码成可以在任意 ALU 端口上运行的类 add 微操作。

@PeterCordes 确实如此，谢谢，我已修改了源代码，因为这个答案本来就是为了展示“正确的” x86_64 写法，而不是逐字照搬作业要求。（不过 `mul` 指令本身就存在，而且比这种逐位运算更快，所以这仍然只是为了练习而练习。）

`add same, same` 就是左移，而 `shl` 移 1 位也并不算慢；但大多数 CPU 最多只有 2 个整数移位端口。不管怎样，看起来不错，我明天会投票，今天的票已经用光了。顺便说一句，在支持 mov 消除的 Intel CPU 上，`mov edx, edi` 的延迟比 `mov edi, edi` 低 1 个周期。但我认为不值得为此重写你这个函数的寄存器分配。

是的，值不值得取决于数据分布。如果在任何工作完成之前就发生分支预测失误，就会回到依赖链的起点；而如果后面的指令主要依赖乘法结果，那么在约 log2(n) 次迭代后离开循环时的预测失误也不是什么大问题。（单周期的 `shr` 依赖链可以跑在循环其余部分的前面。）但如果最高置位的位置经常相差很大，提前退出可以节省大量迭代。用可变计数移位和 `tzcnt` 只遍历被置位的比特也是可行的，并且在置位较少时可能更快。
    section .text

;-----------------------------------------------------------------------
; uint64_t mulByBits(uint32_t a, uint32_t b)
; Multiplies by doubling a and halving b (shift-and-add).
; ABI:   System V AMD64
; In:    edi = a, esi = b (upper 32 bits of rdi/rsi are ignored)
; Out:   rax = a * b as a full 64-bit product
; Clobb: rcx, rdi, rsi, flags
;-----------------------------------------------------------------------
mulByBits:
    mov     edi, edi            ; 32-bit write zero-extends: rdi = (uint64_t)a
    xor     eax, eax            ; running sum starts at 0
.nextBit:
    lea     rcx, [rdi + rax]    ; candidate sum = sum + a (lea keeps flags intact)
    add     rdi, rdi            ; a <<= 1, ready for the next bit of b
    shr     esi, 1              ; CF = bit shifted out of b, ZF = (b is now 0)
    cmovc   rax, rcx            ; accept the candidate only if that bit was 1
    jnz     .nextBit            ; cmov does not write flags, so ZF is still from shr
    ret

global _start

%define SYS_EXIT 60             ; Linux x86-64 exit syscall number

;-----------------------------------------------------------------------
; _start: self-checking smoke test for mulByBits.
; After every call the expected product is loaded into rbx and compared
; with rax, so both values can still be inspected in a debugger.
; Exit status: 0 if every product matches, 1 on the first mismatch.
;-----------------------------------------------------------------------
_start:
    mov     edi, 143254         ; "normal" values test
    mov     esi, 43526
    call    mulByBits
    mov     rbx, 6235273604     ; expected result (needs a 64-bit immediate)
    cmp     rax, rbx
    jne     .fail

    xor     edi, edi            ; 0 * 0
    xor     esi, esi
    call    mulByBits
    xor     ebx, ebx
    cmp     rax, rbx
    jne     .fail

    mov     edi, 43257432       ; A * 0
    xor     esi, esi
    call    mulByBits
    xor     ebx, ebx
    cmp     rax, rbx
    jne     .fail

    xor     edi, edi            ; 0 * B
    mov     esi, 432543
    call    mulByBits
    xor     ebx, ebx
    cmp     rax, rbx
    jne     .fail

    mov     edi, 3276547234     ; A * 1, A has bit 31 set
    mov     esi, 1
    call    mulByBits
    mov     ebx, 3276547234     ; 32-bit write zero-extends into rbx
    cmp     rax, rbx
    jne     .fail

    mov     edi, 1              ; 1 * B, B has bit 31 set
    mov     esi, 3276547234
    call    mulByBits
    mov     ebx, 3276547234
    cmp     rax, rbx
    jne     .fail

    mov     edi, ~0             ; UINT_MAX * UINT_MAX
    mov     esi, ~0
    call    mulByBits
    mov     rbx, 0xFFFFFFFE00000001
    cmp     rax, rbx
    jne     .fail

    mov     rdi, 0xE00000004    ; garbage in upper 32 bits of inputs
    mov     rsi, 0xE00000004    ; should be multiplied as 4*4
    call    mulByBits
    mov     ebx, 0x10
    cmp     rax, rbx
    jne     .fail

    xor     edi, edi            ; status 0: every test passed
.exit:
    ; exit back to linux, status already in edi
    mov     eax, SYS_EXIT
    syscall

.fail:
    mov     edi, 1              ; status 1: rax (actual) != rbx (expected)
    jmp     .exit