X86 为什么mov ah、bh和mov al、bl加在一起比单指令mov ax、bx快得多？_X86_Assembly

X86 为什么mov ah、bh和mov al、bl加在一起比单指令mov ax、bx快得多？

x86 assembly

X86 为什么mov ah、bh和mov al、bl加在一起比单指令mov ax、bx快得多？,x86,assembly,X86,Assembly,我发现了 mov al, bl mov ah, bh 比以前快多了 mov ax, bx 谁能解释一下原因吗？我在WindowsXP下以32位模式运行Core2Duo3GHz。使用NASM编译，然后与VS2010链接。 Nasm编译命令： nasm -f coff -o triangle.o triangle.asm 下面是我用来渲染三角形的主循环： ; some variables on stack %define cr DWORD [ebp-20] %define dcr DWO

我发现了

mov al, bl
mov ah, bh

比下面这条指令快得多：

mov ax, bx

谁能解释一下原因吗？我在WindowsXP下以32位模式运行Core2Duo3GHz。使用NASM编译，然后与VS2010链接。 Nasm编译命令：

nasm -f coff -o triangle.o triangle.asm

下面是我用来渲染三角形的主循环：

; Inner loop from the question: one 32-bit store per iteration, built from
; three running accumulators.  The visible code establishes these roles:
;   esi  - accumulator advanced by dcg each iteration
;   edi  - accumulator advanced by dcb each iteration
;   cr   - accumulator kept in memory, advanced by dcr each iteration
;   edx  - destination pointer, advanced by 4 per iteration
;   ecx  - loop counter
; NOTE(review): the names suggest red/green/blue colour channels and
; per-pixel deltas in fixed point - confirm against the caller.
;
; some variables on stack
%define cr  DWORD [ebp-20]
%define dcr DWORD [ebp-24]
%define dcg DWORD [ebp-32]
%define dcb DWORD [ebp-40]

loop:

; esi += dcg; afterwards ah = bits 16..23 of esi
add esi, dcg
mov eax, esi
shr eax, 8

; edi += dcb; afterwards bl = bits 16..23 of edi, upper 16 bits of ebx = 0
add edi, dcb
mov ebx, edi
shr ebx, 16
; bx = { bits 16..23 of esi, bits 16..23 of edi }
mov bh, ah

; cr += dcr, updated in memory; eax keeps the new value
mov eax, cr
add eax, dcr
mov cr, eax

; Replace the low 16 bits of eax with bx while keeping the upper 16 bits of
; cr.  Two byte moves are used instead of the single 16-bit move below; this
; difference is what the question measures.
mov ah, bh  ; faster
mov al, bl
;mov ax, bx

; store the combined dword and advance the destination pointer
mov DWORD [edx], eax

add edx, 4

; jge tests the signed result of the decrement, so the body repeats until
; ecx goes negative (ecx+1 iterations for a non-negative starting ecx)
dec ecx
jge loop

我可以为整个VS项目提供测试源。

在32位代码中，

mov ax, bx

需要操作数大小前缀，而字节大小的移动则不需要。显然，现代处理器设计人员并没有花费太多的精力来快速解码操作数大小前缀，但让我惊讶的是，惩罚足以执行两个字节大小的移动。

在我的 Core 2 Duo CPU L9300 1.60GHz 上，这样写也更快。正如我在评论中所写，我认为这与部分寄存器的使用有关（

ah

，

al

，

ax

）。参见更多内容，如和（第88页）

我已经编写了一个小测试套件来尝试改进代码，虽然没有使用OP中提供的

ax

版本是最聪明的，但尝试消除部分寄存器的使用确实会提高速度（甚至比我快速释放另一个寄存器的尝试还要快）

要获得关于为什么一个版本比另一个版本快的更多信息，我认为需要更仔细地阅读源材料和/或使用英特尔VTune或AMD CodeAnalyst之类的工具。（结果可能是我错了）

更新，虽然下面oprofile的输出不能证明任何事情，但它确实表明两个版本中都存在大量的部分寄存器暂停，但最慢版本（triAsm2）中的部分寄存器暂停大约是“快速”版本（triAsm1）中的两倍

结果:

triC:7410.000000ms，a5afb9（asm代码的C实现）

triAsm1: 6690.000000 ms，a5afb9（来自OP的代码，使用al和ah）

triAsm2:9290.000000毫秒，a5afb9（来自OP的代码，使用ax）

triAsm3:5760.000000毫秒，a5afb9（直接将OPs代码转换为不使用部分寄存器的代码）

triAsm4:5640.000000毫秒，a5afb9（快速尝试加快速度）

这是我的测试套件，用

-std=c99 -ggdb -m32 -O3 -march=native -mtune=native 编译而成：
test.c：
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

/*
 * Common signature of every span filler under test.  All five
 * implementations are defined outside this file.
 *
 * dest  - output buffer the filler writes 32-bit values into
 * cnt   - count passed to the filler (0..SCANLINE_WIDTH-1 in this harness)
 * cr, cg, cb    - starting values of the three accumulators
 * dcr, dcg, dcb - per-step increments of the three accumulators
 *
 * NOTE(review): the values passed below (10<<16, 1<<14, ...) suggest 16.16
 * fixed-point colour channels - confirm against the implementations.
 */
typedef void tri_fn(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb);

extern tri_fn triC;
extern tri_fn triAsm1;
extern tri_fn triAsm2;
extern tri_fn triAsm3;
extern tri_fn triAsm4;

#define SCANLINE_WIDTH  640      /* number of 32-bit slots in the output buffer */
#define TEST_ITERATIONS 5000000  /* calls made per implementation */
#define TEST_SEED       60       /* rand() seed, reset before every run */
#define PROBE_INDEX     620      /* slot printed after each run */

uint32_t scanline[SCANLINE_WIDTH];

/*
 * Time TEST_ITERATIONS calls of `tri` and print "<name>: <ms> ms, <hex>".
 *
 * The seed is reset first, so every implementation receives the same
 * sequence of `cnt` values.  The printed hex value is scanline[PROBE_INDEX]
 * after the run, which lets the outputs of the implementations be compared.
 *
 * Wrapped in do { } while (0) so that `test(x);` is a single statement and
 * stays correct inside an unbraced if/else.  The uint32_t is cast to
 * unsigned because that is the type %x expects.
 */
#define test(tri) \
    do {\
        clock_t start = clock();\
        srand(TEST_SEED);\
        for (int i = 0; i < TEST_ITERATIONS; i++) {\
            tri(scanline, rand() % SCANLINE_WIDTH, 10<<16, 20<<16, 30<<16, 1<<14, 1<<14, 1<<14);\
        }\
        printf(#tri ": %f ms, %x\n",(clock()-start)*1000.0/CLOCKS_PER_SEC,(unsigned)scanline[PROBE_INDEX]);\
    } while (0)

/* Run every implementation in turn, C reference first. */
int main(void) {
    test(triC);
    test(triAsm1);
    test(triAsm2);
    test(triAsm3);
    test(triAsm4);
    return 0;
}

为什么速度慢

使用16位寄存器比使用8位寄存器昂贵的原因是16位寄存器指令在微码中解码。这意味着在解码过程中有一个额外的周期，并且在解码时无法配对。

此外，由于ax是一个部分寄存器，它需要额外的一个周期才能执行，因为寄存器的顶部需要与对下部的写入相结合。

8位写操作有专门的硬件来加快速度，但16位写操作没有。同样，在许多处理器上，16位指令需要2个周期而不是1个周期，并且它们不允许配对
这意味着，您现在不能在4个周期内处理12条指令（每个周期3条），而只能执行1条指令，因为将指令解码为微码时会出现暂停，处理微码时会出现暂停
我怎样才能让它更快？
mov al, bl
mov ah, bh

（此代码至少占用2个CPU周期，并且可能导致第二条指令暂停，因为在某些（较旧的）x86 CPU上，您会在EAX上获得锁）

下面是发生的情况：

EAX被读取。（第1周期）

EAX的低位字节已更改（仍为第1周期）
然后将完整值写回EAX。（第1周期）

EAX被锁定以进行写入，直到第一次写入完全解决。（可能等待多个周期）
对EAX中的高字节重复该过程。（周期2）

在最新的Core2 CPU上，这并不是什么大问题，因为已经安装了额外的硬件，它们知道bl
和bh
实际上永远不会相互妨碍
mov eax, ebx

它一次移动4个字节，单个指令将在1个cpu周期内运行（并且可以与其他并行指令配对）

如果您想要快速编码，请始终使用32位（EAX、EBX等）寄存器
尽量避免使用8位子寄存器，除非必须使用
切勿使用16位寄存器。即使您必须在32位模式下使用5条指令，也会更快
请使用 movzx reg, ...（或 movsx reg, ...）指令

加快代码速度

我看到了一些加速代码的机会
; Rewrite of the question's inner loop that avoids the byte/word moves into
; eax: the cr accumulator is kept in edx instead of memory, and the low 16
; bits are merged in with and/or on full 32-bit registers.
;
; Register roles as used below:
;   esi  - accumulator advanced by dcg each iteration
;   edi  - accumulator advanced by dcb each iteration
;   edx  - cr accumulator (loaded once before the loop)
;   ecx  - loop counter and store index, counting down
; `offset` is a placeholder for the displacement of the output buffer
; relative to ebp; it has to be defined by the surrounding code.
;
; some variables on stack
%define cr  DWORD [ebp-20]
%define dcr DWORD [ebp-24]
%define dcg DWORD [ebp-32]
%define dcb DWORD [ebp-40]

mov edx,cr

loop:

; esi += dcg; afterwards ah = bits 16..23 of esi
add esi, dcg
mov eax, esi
shr eax, 8

; edi += dcb; afterwards bl = bits 16..23 of edi
add edi, dcb
mov ebx, edi
shr ebx, 16   ;higher 16 bits in ebx will be empty.
; NOTE(review): this still writes bh and then reads all of ebx in the `or`
; below, i.e. a partial-register write followed by a wider read.
mov bh, ah

;mov eax, cr
;add eax, dcr
;mov cr, eax

; cr += dcr, kept in edx so no memory round trip is needed
add edx,dcr
mov eax,edx

and eax,0xFFFF0000  ; clear lower 16 bits in EAX
or eax,ebx          ; merge the two.
;mov ah, bh  ; faster
;mov al, bl


; The register here was misspelled `epb`, which is not a register name.
; Indexing by the down-counting ecx writes the buffer from its highest
; address to its lowest.
mov DWORD [ebp+offset+ecx*4], eax ; requires storing the data in reverse order.
;add edx, 4

sub ecx,1  ;dec ecx does not change the carry flag, which can cause
           ;a false dependency on previous instructions which do change CF
jge loop

摘要：16位指令本身并不是问题所在。问题在于写入部分寄存器之后又读取更宽的寄存器，这会在Core2上导致部分寄存器暂停。在Sandybridge及更高版本上这个问题要小得多，因为它们合并的成本要低得多。mov ax, bx
会导致一次额外的合并，但即使是OP的“快速”版本也会出现一些暂停。
请参阅此答案的结尾，了解一个替代标量内循环，它应该比其他两个答案更快，使用shld
在寄存器之间移动字节。将循环外的内容预移位8b，将我们想要的字节放在每个寄存器的顶部，这使得这非常便宜。它应该在32位core2上以略高于每4个时钟周期一次迭代的速度运行，并使所有三个执行端口饱和，而不会出现暂停。它应该在Haswell上每2.5c运行一次迭代
但是，要真正快速地完成这一点，请查看编译器自动向量化生成的输出，然后将其精简，或者使用向量内部函数重新实现。

与“16位操作数大小的指令速度慢”的说法相反，Core2理论上可以在交替执行 mov ax, bx
和 mov ecx, edx 时维持每个时钟周期3条指令。并不存在任何形式的“模式切换”。（正如每个人都指出的，“上下文切换”是一个很糟糕的用词选择。）
mov al, bl
mov ah, bh

mov eax, ebx

; Rewrite of the question's inner loop that avoids the byte/word moves into
; eax: the cr accumulator is kept in edx instead of memory, and the low 16
; bits are merged in with and/or on full 32-bit registers.
;
; Register roles as used below:
;   esi  - accumulator advanced by dcg each iteration
;   edi  - accumulator advanced by dcb each iteration
;   edx  - cr accumulator (loaded once before the loop)
;   ecx  - loop counter and store index, counting down
; `offset` is a placeholder for the displacement of the output buffer
; relative to ebp; it has to be defined by the surrounding code.
;
; some variables on stack
%define cr  DWORD [ebp-20]
%define dcr DWORD [ebp-24]
%define dcg DWORD [ebp-32]
%define dcb DWORD [ebp-40]

mov edx,cr

loop:

; esi += dcg; afterwards ah = bits 16..23 of esi
add esi, dcg
mov eax, esi
shr eax, 8

; edi += dcb; afterwards bl = bits 16..23 of edi
add edi, dcb
mov ebx, edi
shr ebx, 16   ;higher 16 bits in ebx will be empty.
; NOTE(review): this still writes bh and then reads all of ebx in the `or`
; below, i.e. a partial-register write followed by a wider read.
mov bh, ah

;mov eax, cr
;add eax, dcr
;mov cr, eax

; cr += dcr, kept in edx so no memory round trip is needed
add edx,dcr
mov eax,edx

and eax,0xFFFF0000  ; clear lower 16 bits in EAX
or eax,ebx          ; merge the two.
;mov ah, bh  ; faster
;mov al, bl


; The register here was misspelled `epb`, which is not a register name.
; Indexing by the down-counting ecx writes the buffer from its highest
; address to its lowest.
mov DWORD [ebp+offset+ecx*4], eax ; requires storing the data in reverse order.
;add edx, 4

sub ecx,1  ;dec ecx does not change the carry flag, which can cause
           ;a false dependency on previous instructions which do change CF
jge loop

# Vectorised inner loop (GAS Intel syntax): each iteration produces one
# 16-byte vector, i.e. four 32-bit results, and stores it at [eax + edx*16].
#   xmm3, xmm2, xmm4 - running accumulators, each advanced by a step vector
#                      held on the stack ([esp+16], [esp+32], [esp])
#   [esp+64], [esp+80], [esp+48] - per-lane base vectors added to the
#                      accumulators before shifting/masking
#   xmm7, xmm6, xmm5 - AND masks (their contents are set up outside this loop)
#   edx              - vector index, ebp - vector count (unsigned compare)
# NOTE(review): the three lanes presumably correspond to the green, blue and
# red channels of the scalar loop - confirm against the C source.
.L4:
        movdqa  xmm0, XMMWORD PTR [esp+64]
        mov     ecx, edx
        add     edx, 1
        # ecx = old index * 16 = byte offset of this iteration's vector
        sal     ecx, 4
        # first lane: (base + accumulator) >> 8, masked with xmm7
        paddd   xmm0, xmm3
        paddd   xmm3, XMMWORD PTR [esp+16]
        psrld   xmm0, 8
        movdqa  xmm1, xmm0
        movdqa  xmm0, XMMWORD PTR [esp+80]
        pand    xmm1, xmm7
        # second lane: (base + accumulator) >> 16, masked with xmm6
        paddd   xmm0, xmm2
        paddd   xmm2, XMMWORD PTR [esp+32]
        psrld   xmm0, 16
        pand    xmm0, xmm6
        por     xmm0, xmm1
        # third lane: (base + accumulator), unshifted, masked with xmm5
        movdqa  xmm1, XMMWORD PTR [esp+48]
        paddd   xmm1, xmm4
        paddd   xmm4, XMMWORD PTR [esp]
        pand    xmm1, xmm5
        por     xmm0, xmm1
        # movaps faults unless eax+ecx is 16-byte aligned
        movaps  XMMWORD PTR [eax+ecx], xmm0
        # continue while ebp > edx (unsigned)
        cmp     ebp, edx
        ja      .L4

; Stack-argument addresses for tri_pjc.
;
; use defines you can put [] around so it's clear they're memory refs
;
; All of them are esp-relative and valid only AFTER tri_pjc's four pushes
; (ebp, edi, esi, ebx).  tri_pjc never sets up a frame pointer and loads a
; data value into ebp, so ebp-relative addressing cannot be used there.
;
; Derivation: with a conventional frame (push ebp / mov ebp, esp) the first
; stack argument is at ebp+0x08.  Here three more registers are pushed after
; the point where ebp would have snapshotted esp, so every offset grows by
; 0xc: the first argument is at esp+0x14 (4 pushes * 4 bytes + 4-byte return
; address = 0x14).
; 32bit code is dumb anyway.  64bit passes args in regs.
;
; Argument order assumed to be (dest, cnt, cr, cg, cb, dcr, dcg, dcb), as in
; the C prototypes of the test harness - confirm for any other caller.

%define dest_arg  esp+0x14      ; frame-pointer equivalent: ebp+0x08
%define cnt_arg   esp+0x18      ;                           ebp+0x0c
%define cr        esp+0x1c      ;                           ebp+0x10
%define cg        esp+0x20      ;                           ebp+0x14
%define cb        esp+0x24      ;                           ebp+0x18
%define dcr       esp+0x28      ;                           ebp+0x1c
%define dcg       esp+0x2c      ;                           ebp+0x20
%define dcb       esp+0x30      ;                           ebp+0x24

;-----------------------------------------------------------------------
; tri_pjc - fill a run of 32-bit values from three running accumulators,
;           using shld instead of partial-register writes to combine bytes.
;
; ABI:    32-bit, arguments on the stack, read through the cr/cg/cb/
;         dcr/dcg/dcb/dest_arg/cnt_arg defines (each expands to an address
;         expression, hence the [] at every use).
; Out:    nothing in registers; one dword stored per iteration at [edx].
; Saves:  ebp, edi, esi, ebx (pushed on entry, popped before ret).
; Clobb:  eax, ecx, edx, flags.
; Stack:  overwrites its own argument slots: dcg and dcb are shifted left
;         by 8 in place, and dest_arg is replaced by the end pointer.
;
; Register roles inside the loop:
;   ebp = cr accumulator        esi = cg accumulator << 8
;   edi = cb accumulator << 8   edx = current destination pointer
;   eax, ecx = scratch
;
; NOTE(review): the loop tests its condition at the bottom, so the body
; runs once even when cnt is 0 - confirm that callers allow this.
;-----------------------------------------------------------------------
tri_pjc:
    push    ebp
    push    edi
    push    esi
    push    ebx  ; only these 4 need to be preserved in the normal 32bit calling convention

    mov     ebp, [cr]
    mov     esi, [cg]
    mov     edi, [cb]

    shl     esi,   8          ; put the bits we want at the high edge, so we don't have to mask after shifting in zeros
    shl     dword [dcg], 8    ; memory operand needs an explicit size; NASM rejects a bare [dcg]
    shl     edi,   8
    shl     dword [dcb], 8
       ; apparently the original code doesn't care if cr overflows into the top byte.

    mov     edx, [dest_arg]
    mov     ecx, [cnt_arg]
    lea     ecx, [edx + ecx*4] ; one-past the end, to be used as a loop boundary
    mov    [dest_arg], ecx    ; spill it back to the stack, where we only need to read it.

ALIGN 16
.loop: ; SEE BELOW, this inner loop can be even more optimized
    add     esi, [dcg]
    mov     eax, esi
    shr     eax, 24           ; eax bytes = { 0  0  0 cg }

    add     edi, [dcb]
    shld    eax, edi, 8       ; eax bytes = { 0  0 cg cb }

    add     ebp, [dcr]
    mov     ecx, ebp
    and     ecx, 0xffff0000
    or      eax, ecx          ; eax bytes = { x cr cg cb}  where x is overflow from cr.  Kill that by changing the mask to 0x00ff0000
    ; another shld to merge might be faster on other CPUs, but not core2
    ; merging with mov cx, ax   would also be possible on CPUs where that's cheap (AMD, and Intel IvB and later)

    mov    DWORD [edx], eax
    ; alternatively:
    ; mov    DWORD [edx], ebp
    ; mov     WORD [edx], ax    ; this insn replaces the mov/and/or merging

    add     edx, 4
    cmp     edx, [dest_arg]   ; core2 can macro-fuse cmp/unsigned condition, but not signed
    jb .loop

    pop     ebx
    pop     esi
    pop     edi
    pop     ebp
    ret
; Alternative inner loop: all three bytes are shifted into eax with shld, so
; no and/or merge and no scratch register besides eax is needed.
; Expected on entry, as read by the code below:
;   ebp, esi, edi - the three accumulators (advanced by dcr, dcg, dcb)
;   edx           - destination pointer
;   ebx           - loop bound compared against edx (unsigned); it must be
;                   loaded before the loop, which is not shown here
; The commented-out mov/db pairs are the IACA start/end markers.
ALIGN 16
;mov ebx, 111           ; IACA start
;db 0x64, 0x67, 0x90
.loop:
    add     ebp, [dcr]
    mov     eax, ebp
    shr     eax, 16           ; eax bytes = { 0  0  x cr}  where x is overflow from cr.  Kill that by pre-shifting cr and dcr like the others, and use shr 24 here

    add     esi, [dcg]
    shld    eax, esi, 8       ; eax bytes = { 0  x cr cg}
    add     edx, 4     ; this goes between the `shld`s to help with decoder throughput on pre-SnB, and to not break macro-fusion.
    add     edi, [dcb]
    shld    eax, edi, 8       ; eax bytes = { x cr cg cb}
    mov    DWORD [edx-4], eax ; edx was already advanced above, hence the -4

    cmp     edx, ebx      ; use our spare register here
    jb .loop     ; core2 can macro-fuse cmp/unsigned condition, but not signed.  Macro-fusion works in 32-bit mode only on Core2.

;mov ebx, 222           ; IACA end
;db 0x64, 0x67, 0x90