Delphi 用于将8位扩展为8个布尔字节(0或1)的英特尔x86汇编优化技术
标签:delphi, optimization, assembly, x86, basm
我学习汇编已经有一段时间了,正在尝试把一些简单的过程/函数用汇编重写,看看是否能带来性能上的好处(如果有的话)。我的主要开发工具是 Delphi 2007,第一个示例使用该语言,但也可以很容易地翻译成其他语言。
问题是:给定一个无符号字节值,其八位中的每一位表示屏幕某一行中的一个像素。每个像素可以是实心(1)或透明(0)。换句话说,一个字节值中打包了 8 个像素。我想把这些像素解包到一个 8 字节的数组中,使最低有效位(第 0 位)对应的像素位于数组的最低索引处,依此类推。以下是一个例子:
One byte value -----------> eight byte array
10011011 -----------------> [1][1][0][1][1][0][0][1]
Array index number -------> 0 1 2 3 4 5 6 7
下面我将介绍解决这个问题的五种方法。接下来,我将展示他们的时间比较,以及我是如何测量这些时间的
我的问题包括两部分:
1.
请针对方法 DecodePixels4a 和 DecodePixels4b 给出详细的解答:为什么方法 4b 比 4a 慢一些?
例如,如果因为我的代码没有正确对齐而速度较慢,那么请向我展示给定方法中的哪些指令可以更好地对齐,以及如何这样做以不破坏该方法
我想看到理论背后的真实例子。请记住,我正在学习汇编,我想从你的答案中获得知识,这使我将来能够编写更好的优化代码
2.
你能写出比 DecodePixels4a 更快的例程吗?如果可以,请给出代码并说明您所采取的优化步骤。
所谓更快的例程,我指的是在这里介绍的所有例程中,在测试环境中运行时间最短的例程
允许使用所有英特尔系列处理器以及与之兼容的处理器
以下是我编写的例程:
// Unpacks the eight bits of EncPixels into DecPixels, one byte (0 or 1) per bit,
// with bit 0 going to index 0 and bit 7 to index 7.
procedure DecodePixels1(EncPixels: Byte; var DecPixels: TDecodedPixels);
var
  BitPos: Integer;
  Remaining: Byte;
begin
  // Work on a local copy and peel bits off the low end one at a time.
  // (The author measured the alternative "(EncPixels shr index) and $01"
  // form as slower than shifting by one on every pass.)
  Remaining := EncPixels;
  for BitPos := 0 to 7 do
  begin
    DecPixels[BitPos] := Remaining and $01;
    Remaining := Remaining shr 1;
  end;
end;
//Lets unroll the loop and see if it will be faster.
// Fully unrolled variant of the bit-by-bit decoder: writes bit k of EncPixels
// (as 0 or 1) to DecPixels[k] for k = 0..7 without using a loop.
procedure DecodePixels2(EncPixels: Byte; var DecPixels: TDecodedPixels);
var
  Bits: Byte;
begin
  Bits := EncPixels;
  // Odd() tests the lowest bit; Ord() turns the Boolean into 0 or 1.
  DecPixels[0] := Ord(Odd(Bits));
  Bits := Bits shr 1;
  DecPixels[1] := Ord(Odd(Bits));
  Bits := Bits shr 1;
  DecPixels[2] := Ord(Odd(Bits));
  Bits := Bits shr 1;
  DecPixels[3] := Ord(Odd(Bits));
  Bits := Bits shr 1;
  DecPixels[4] := Ord(Odd(Bits));
  Bits := Bits shr 1;
  DecPixels[5] := Ord(Odd(Bits));
  Bits := Bits shr 1;
  DecPixels[6] := Ord(Odd(Bits));
  Bits := Bits shr 1;
  DecPixels[7] := Ord(Odd(Bits));
end;
// Assembly version of the bit-by-bit loop: stores bit k of the packed byte
// (as 0 or 1) into the k-th byte of the output array, for k = 0..7.
// The code reads the packed byte from AL and writes through EDX, so it relies
// on EncPixels / DecPixels arriving in those registers
// (NOTE(review): presumably the default register calling convention - confirm).
procedure DecodePixels3(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
asm
// Save the registers this block modifies.
push eax;
push ebx;
push ecx;
// Bit 0 is handled before the loop and stored at offset 0.
mov bl, al;
and bl, $01;
mov [edx], bl;
// ECX is the output index; the loop body runs for indices 1..7.
mov ecx, $00;
@@Decode:
inc ecx;
// Move the next bit down into bit 0 of AL, then isolate it in BL.
shr al, $01;
mov bl, al;
and bl, $01;
mov [edx + ecx], bl;
// Leave the loop once index 7 has been written.
cmp ecx, $07;
jnz @@Decode;
// Restore the saved registers in reverse order.
pop ecx;
pop ebx;
pop eax;
end;
end;
//Unrolled assembly loop
// Unrolled assembly decoder: for each of the 8 bits it copies AL to BL, masks
// BL down to its lowest bit, stores BL at [EDX + k], and only then shifts AL
// right by one to expose the next bit (order per step: mov/and/store/shr).
// The code reads the packed byte from AL and writes through EDX
// (NOTE(review): presumably the default register calling convention - confirm).
procedure DecodePixels4a(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
asm
// Save the registers this block modifies.
push eax;
push ebx;
// bit 0 -> offset 0
mov bl, al;
and bl, $01;
mov [edx], bl;
shr al, $01;
// bit 1 -> offset 1
mov bl, al;
and bl, $01;
mov [edx + $01], bl;
shr al, $01;
// bit 2 -> offset 2
mov bl, al;
and bl, $01;
mov [edx + $02], bl;
shr al, $01;
// bit 3 -> offset 3
mov bl, al;
and bl, $01;
mov [edx + $03], bl;
shr al, $01;
// bit 4 -> offset 4
mov bl, al;
and bl, $01;
mov [edx + $04], bl;
shr al, $01;
// bit 5 -> offset 5
mov bl, al;
and bl, $01;
mov [edx + $05], bl;
shr al, $01;
// bit 6 -> offset 6
mov bl, al;
and bl, $01;
mov [edx + $06], bl;
shr al, $01;
// bit 7 -> offset 7 (no further shift needed)
mov bl, al;
and bl, $01;
mov [edx + $07], bl;
// Restore the saved registers in reverse order.
pop ebx;
pop eax;
end;
end;
// it differs compared to 4a only in switching two instructions (but seven times)
// Unrolled assembly decoder that produces the same stores as DecodePixels4a
// but swaps two instructions in each step: AL is shifted right BEFORE BL is
// stored to [EDX + k] (order per step: mov/and/shr/store). The lines ending
// in "//" mark the swapped pair.
// The code reads the packed byte from AL and writes through EDX
// (NOTE(review): presumably the default register calling convention - confirm).
procedure DecodePixels4b(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
asm
// Save the registers this block modifies.
push eax;
push ebx;
// bit 0 -> offset 0
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx], bl; //
// bit 1 -> offset 1
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx + $01], bl; //
// bit 2 -> offset 2
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx + $02], bl; //
// bit 3 -> offset 3
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx + $03], bl; //
// bit 4 -> offset 4
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx + $04], bl; //
// bit 5 -> offset 5
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx + $05], bl; //
// bit 6 -> offset 6
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx + $06], bl; //
// bit 7 -> offset 7 (no further shift needed)
mov bl, al;
and bl, $01;
mov [edx + $07], bl;
// Restore the saved registers in reverse order.
pop ebx;
pop eax;
end;
end;
下面是如何测试它们:
// Benchmark driver: times each DecodePixels* routine with the Windows
// performance counter and prints the average time per batch in milliseconds.
// One batch is 100000 calls decoding the constant 155; every routine's batch
// is run 100 times and the accumulated time is divided by 100 when printed.
program Test;
{$APPTYPE CONSOLE}
uses
SysUtils, Windows;
type
// Output buffer type used by all decoders: eight bytes, indices 0..7.
TDecodedPixels = array[0..7] of Byte;
var
Pixels: TDecodedPixels;
Freq, TimeStart, TimeEnd :Int64;
// Accumulated batch times in ms. They are only ever added to below and are
// never assigned explicitly
// (NOTE(review): relies on global variables starting at zero - confirm).
Time1, Time2, Time3, Time4a, Time4b: Extended;
i, i2: Integer;
begin
// Nothing is measured or printed if no performance counter frequency is available.
if QueryPerformanceFrequency(Freq) then
begin
for i2 := 1 to 100 do
begin
// DecodePixels1
QueryPerformanceCounter(TimeStart);
for i := 1 to 100000 do
DecodePixels1(155, Pixels);
QueryPerformanceCounter(TimeEnd);
// Counter ticks / ticks-per-second * 1000 = elapsed milliseconds.
Time1 := Time1 + ((TimeEnd - TimeStart) / Freq * 1000);
// DecodePixels2
QueryPerformanceCounter(TimeStart);
for i := 1 to 100000 do
DecodePixels2(155, Pixels);
QueryPerformanceCounter(TimeEnd);
Time2 := Time2 + ((TimeEnd - TimeStart) / Freq * 1000);
// DecodePixels3
QueryPerformanceCounter(TimeStart);
for i := 1 to 100000 do
DecodePixels3(155, Pixels);
QueryPerformanceCounter(TimeEnd);
Time3 := Time3 + ((TimeEnd - TimeStart) / Freq * 1000);
// DecodePixels4a
QueryPerformanceCounter(TimeStart);
for i := 1 to 100000 do
DecodePixels4a(155, Pixels);
QueryPerformanceCounter(TimeEnd);
Time4a := Time4a + ((TimeEnd - TimeStart) / Freq * 1000);
// DecodePixels4b
QueryPerformanceCounter(TimeStart);
for i := 1 to 100000 do
DecodePixels4b(155, Pixels);
QueryPerformanceCounter(TimeEnd);
Time4b := Time4b + ((TimeEnd - TimeStart) / Freq * 1000);
end;
// Report the mean over the 100 repetitions.
Writeln('Time1 : ' + FloatToStr(Time1 / 100) + ' ms. <- Delphi loop.');
Writeln('Time2 : ' + FloatToStr(Time2 / 100) + ' ms. <- Delphi unrolled loop.');
Writeln('Time3 : ' + FloatToStr(Time3/ 100) + ' ms. <- BASM loop.');
Writeln('Time4a : ' + FloatToStr(Time4a / 100) + ' ms. <- BASM unrolled loop.');
Writeln('Time4b : ' + FloatToStr(Time4b / 100) + ' ms. <- BASM unrolled loop instruction switch.');
end;
// Wait for a line of input before the console program exits.
Readln;
end.
看看Time5的结果,很奇怪吧?
我想我有不同的Delphi版本,因为我生成的汇编代码与Wouter提供的不同
第二次主要编辑:
我知道为什么例程 5 在我的机器上运行得较慢了:我在编译器选项中勾选了“范围检查”和“溢出检查”。我还给例程 9 加上了 assembler 指令,看看它是否有帮助。看起来带有该指令的汇编例程与 Delphi 内联版本一样好,甚至稍好一些。
以下是最终结果:
Time1 : 1,22508325749317 ms. <- Delphi loop.
Time2 : 1,33004145373084 ms. <- Delphi unrolled loop.
Time3 : 1,1473583622526 ms. <- BASM loop.
Time4a : 0,77322594033463 ms. <- BASM unrolled loop.
Time4b : 0,846033593023372 ms. <- BASM unrolled loop instruction switch.
Time5 : 0,688689382044384 ms. <- Wouter van Nifterick, Delphi unrolled
Time6 : 0,503233741036693 ms. <- PhiS, table lookup Delphi
Time7 : 0,385254722925063 ms. <- PhiS, table lookup Delphi inline
Time8 : 0,432993919452751 ms. <- PhiS, table lookup BASM
Time9 : 0,362680491244212 ms. <- PhiS, table lookup BASM with assembler directive
我想说,这相当依赖于数据
下面是测试和结果。我做了四次测试以确保没有事故。
我还为GJ建议的例程添加了新的时间(Time10a,Time10b)
          Test1  Test2  Test3  Test4
Time1   : 1,211  1,210  1,220  1,213
Time2   : 1,280  1,258  1,253  1,332
Time3   : 1,129  1,138  1,130  1,160
Time4a  : 0,690  0,682  0,617  0,635
Time4b  : 0,707  0,698  0,706  0,659
Time4c  : 0,679  0,685  0,626  0,625
Time5   : 0,715  0,682  0,686  0,679
Time6   : 0,490  0,485  0,522  0,514
Time7   : 0,323  0,333  0,336  0,318
Time8   : 0,407  0,403  0,373  0,354
Time9   : 0,352  0,378  0,355  0,355
Time10a : 1,823  1,812  1,807  1,813
Time10b : 1,113  1,120  1,115  1,118
Time10c : 0,652  0,630  0,653  0,633
Time10d : 0,156  0,155  0,172  0,160
编译器在优化小例程方面做得非常好。
我将使用查找表优化您的代码。
由于对单个字节(256种不同的状态)进行解码,所以可以使用未打包的值预先计算256个数组
编辑:请注意,奔腾处理器可以并行执行某些特定的指令组合,这称为“配对”(pairing)。
我猜测原因在于:写入内存(实际上是写入缓存)比使用寄存器要慢。
所以,下面这种顺序
mov [edx+...], bl
shr al, $01;
mov bl, al;
在再次需要 bl 寄存器之前,给了处理器一些时间把 bl 写入内存;而
shr al, $01;
mov [edx], bl;
mov bl, al;
则立即又需要 bl,因此处理器必须停下来等待内存写入完成。
这让我很惊讶。现代英特尔处理器会进行非常激进的流水线处理和寄存器重命名,因此在我看来,如果说有什么不同的话,DecodePixels4b 应该更快才对,因为每条指令所依赖的结果都在更早之前就已经产生。除此之外,以下是我能提供的全部解释:
x86是一个糟糕的指令集,Intel做了惊人的、非常先进的hocus pocus来提高效率。如果我是你,我会调查别的事情。如今,PC机对MegaMcOptimized软件的需求非常少。我的友好建议是研究用于移动设备(主要是ARM)的处理器,因为在移动设备中,处理器速度、功耗和电池寿命问题意味着微优化软件更为重要。ARM的指令集优于x86。4b比4a快的可能原因是它的并行性更好。从4a开始:
mov bl, al;
and bl, $01; // data dep (bl)
mov [edx], bl; // data dep (bl)
shr al, $01;
mov bl, al; // data dep (al)
and bl, $01; // data dep (bl)
mov [edx + $01], bl; // data dep (bl)
标记为“data dep”的指令在前一条指令完成之前无法开始执行,我已经编写了导致此数据依赖性的寄存器。如果没有依赖关系,现代CPU能够在最后一条指令完成之前启动指令。但你命令这些行动的方式阻止了这一切
在4b中,数据依赖性更少:
mov bl, al;
and bl, $01; // data dep (bl)
shr al, $01;
mov [edx], bl;
mov bl, al;
and bl, $01; // data dep (bl)
shr al, $01;
mov [edx + $01], bl;
使用这种指令顺序,依赖于前一条指令的指令更少,因此有更多的并行机会
我不能保证这是造成速度差异的原因,但这很可能是一个候选因素。不幸的是,很难找到像你所寻找的那样绝对的答案;现代处理器有分支预测器、多级缓存、硬件预取器和各种其他协同处理器
Test1 Test2 Test3 Test4
Time1 : 1,211 1,210 1,220 1,213
Time2 : 1,280 1,258 1,253 1,332
Time3 : 1,129 1,138 1,130 1,160
Time4a : 0,690 0,682 0,617 0,635
Time4b : 0,707 0,698 0,706 0,659
Time4c : 0,679 0,685 0,626 0,625
Time5 : 0,715 0,682 0,686 0,679
Time6 : 0,490 0,485 0,522 0,514
Time7 : 0,323 0,333 0,336 0,318
Time8 : 0,407 0,403 0,373 0,354
Time9 : 0,352 0,378 0,355 0,355
Time10a : 1,823 1,812 1,807 1,813
Time10b : 1,113 1,120 1,115 1,118
Time10c : 0,652 0,630 0,653 0,633
Time10d : 0,156 0,155 0,172 0,160 <-- current winner!
mov [edx+...], bl
shr al, $01;
mov bl, al;
shr al, $01;
mov [edx], bl;
mov bl, al;
mov bl, al;
and bl, $01; // data dep (bl)
mov [edx], bl; // data dep (bl)
shr al, $01;
mov bl, al; // data dep (al)
and bl, $01; // data dep (bl)
mov [edx + $01], bl; // data dep (bl)
mov bl, al;
and bl, $01; // data dep (bl)
shr al, $01;
mov [edx], bl;
mov bl, al;
and bl, $01; // data dep (bl)
shr al, $01;
mov [edx + $01], bl;
BT ax,1
SETC [dx]
inc dx
BT ax,2
SETC [dx]
inc dx
// Unrolled decoder with eight independent statements: DecPixels[k] receives
// 1 when bit k of EncPixels is set and 0 otherwise. No statement depends on
// the result of another, and EncPixels itself is never modified.
procedure DecodePixels5(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
  // Each line tests a single-bit mask; Ord() maps False/True to 0/1.
  DecPixels[0] := Ord((EncPixels and $01) <> 0);
  DecPixels[1] := Ord((EncPixels and $02) <> 0);
  DecPixels[2] := Ord((EncPixels and $04) <> 0);
  DecPixels[3] := Ord((EncPixels and $08) <> 0);
  DecPixels[4] := Ord((EncPixels and $10) <> 0);
  DecPixels[5] := Ord((EncPixels and $20) <> 0);
  DecPixels[6] := Ord((EncPixels and $40) <> 0);
  DecPixels[7] := Ord((EncPixels and $80) <> 0);
end;
Results:
Time1 : 1,03096806151283 ms. <- Delphi loop.
Time2 : 0,740308641141395 ms. <- Delphi unrolled loop.
Time3 : 0,996602425688886 ms. <- BASM loop.
Time4a : 0,608267951561275 ms. <- BASM unrolled loop.
Time4b : 0,574162510648039 ms. <- BASM unrolled loop instruction switch.
Time5 : 0,499628206138524 ms. !!! <- Delphi unrolled loop 5.
push ebx;
// DecPixels[0] := (EncPixels shr 0) and 1;
movzx ecx,al
mov ebx,ecx
// shr ebx,$00
and bl,$01
mov [edx],bl
// DecPixels[1] := (EncPixels shr 1) and 1;
mov ebx,ecx
shr ebx,1
and bl,$01
mov [edx+$01],bl
// DecPixels[2] := (EncPixels shr 2) and 1;
mov ebx,ecx
shr ebx,$02
and bl,$01
mov [edx+$02],bl
// DecPixels[3] := (EncPixels shr 3) and 1;
mov ebx,ecx
shr ebx,$03
and bl,$01
mov [edx+$03],bl
// DecPixels[4] := (EncPixels shr 4) and 1;
mov ebx,ecx
shr ebx,$04
and bl,$01
mov [edx+$04],bl
// DecPixels[5] := (EncPixels shr 5) and 1;
mov ebx,ecx
shr ebx,$05
and bl,$01
mov [edx+$05],bl
// DecPixels[6] := (EncPixels shr 6) and 1;
mov ebx,ecx
shr ebx,$06
and bl,$01
mov [edx+$06],bl
// DecPixels[7] := (EncPixels shr 7) and 1;
shr ecx,$07
and cl,$01
mov [edx+$07],cl
pop ebx;
var
PixelLookup:Array[byte] of TDecodedPixels;
// You could precalculate, but the performance gain would hardly be worth it because you call this once only.
for I := 0 to 255 do
DecodePixels5b(I, PixelLookup[I]);
// Table-lookup decoder: copies the precomputed 8-byte entry for EncPixels
// from PixelLookup into DecPixels.
procedure DecodePixels7(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
  // Raw copy of one whole table entry.
  Move(PixelLookup[EncPixels], DecPixels, SizeOf(TDecodedPixels));
end;
Results:
Time1 : 1,03096806151283 ms. <- Delphi loop.
Time2 : 0,740308641141395 ms. <- Delphi unrolled loop.
Time3 : 0,996602425688886 ms. <- BASM loop.
Time4a : 0,608267951561275 ms. <- BASM unrolled loop.
Time4b : 0,574162510648039 ms. <- BASM unrolled loop instruction switch.
Time5 : 0,499628206138524 ms. !!! <- Delphi unrolled loop 5.
Time7 : 0,251533475182096 ms. <- simple table lookup
const Uint64DecPix : PACKED ARRAY [0..255] OF UINT64 =
( $0000000000000000, $0000000000000001, $0000000000000100, $0000000000000101, $0000000000010000, $0000000000010001, $0000000000010100, $0000000000010101, $0000000001000000, $0000000001000001, $0000000001000100, $0000000001000101, $0000000001010000, $0000000001010001, $0000000001010100, $0000000001010101,
$0000000100000000, $0000000100000001, $0000000100000100, $0000000100000101, $0000000100010000, $0000000100010001, $0000000100010100, $0000000100010101, $0000000101000000, $0000000101000001, $0000000101000100, $0000000101000101, $0000000101010000, $0000000101010001, $0000000101010100, $0000000101010101,
$0000010000000000, $0000010000000001, $0000010000000100, $0000010000000101, $0000010000010000, $0000010000010001, $0000010000010100, $0000010000010101, $0000010001000000, $0000010001000001, $0000010001000100, $0000010001000101, $0000010001010000, $0000010001010001, $0000010001010100, $0000010001010101,
$0000010100000000, $0000010100000001, $0000010100000100, $0000010100000101, $0000010100010000, $0000010100010001, $0000010100010100, $0000010100010101, $0000010101000000, $0000010101000001, $0000010101000100, $0000010101000101, $0000010101010000, $0000010101010001, $0000010101010100, $0000010101010101,
$0001000000000000, $0001000000000001, $0001000000000100, $0001000000000101, $0001000000010000, $0001000000010001, $0001000000010100, $0001000000010101, $0001000001000000, $0001000001000001, $0001000001000100, $0001000001000101, $0001000001010000, $0001000001010001, $0001000001010100, $0001000001010101,
$0001000100000000, $0001000100000001, $0001000100000100, $0001000100000101, $0001000100010000, $0001000100010001, $0001000100010100, $0001000100010101, $0001000101000000, $0001000101000001, $0001000101000100, $0001000101000101, $0001000101010000, $0001000101010001, $0001000101010100, $0001000101010101,
$0001010000000000, $0001010000000001, $0001010000000100, $0001010000000101, $0001010000010000, $0001010000010001, $0001010000010100, $0001010000010101, $0001010001000000, $0001010001000001, $0001010001000100, $0001010001000101, $0001010001010000, $0001010001010001, $0001010001010100, $0001010001010101,
$0001010100000000, $0001010100000001, $0001010100000100, $0001010100000101, $0001010100010000, $0001010100010001, $0001010100010100, $0001010100010101, $0001010101000000, $0001010101000001, $0001010101000100, $0001010101000101, $0001010101010000, $0001010101010001, $0001010101010100, $0001010101010101,
$0100000000000000, $0100000000000001, $0100000000000100, $0100000000000101, $0100000000010000, $0100000000010001, $0100000000010100, $0100000000010101, $0100000001000000, $0100000001000001, $0100000001000100, $0100000001000101, $0100000001010000, $0100000001010001, $0100000001010100, $0100000001010101,
$0100000100000000, $0100000100000001, $0100000100000100, $0100000100000101, $0100000100010000, $0100000100010001, $0100000100010100, $0100000100010101, $0100000101000000, $0100000101000001, $0100000101000100, $0100000101000101, $0100000101010000, $0100000101010001, $0100000101010100, $0100000101010101,
$0100010000000000, $0100010000000001, $0100010000000100, $0100010000000101, $0100010000010000, $0100010000010001, $0100010000010100, $0100010000010101, $0100010001000000, $0100010001000001, $0100010001000100, $0100010001000101, $0100010001010000, $0100010001010001, $0100010001010100, $0100010001010101,
$0100010100000000, $0100010100000001, $0100010100000100, $0100010100000101, $0100010100010000, $0100010100010001, $0100010100010100, $0100010100010101, $0100010101000000, $0100010101000001, $0100010101000100, $0100010101000101, $0100010101010000, $0100010101010001, $0100010101010100, $0100010101010101,
$0101000000000000, $0101000000000001, $0101000000000100, $0101000000000101, $0101000000010000, $0101000000010001, $0101000000010100, $0101000000010101, $0101000001000000, $0101000001000001, $0101000001000100, $0101000001000101, $0101000001010000, $0101000001010001, $0101000001010100, $0101000001010101,
$0101000100000000, $0101000100000001, $0101000100000100, $0101000100000101, $0101000100010000, $0101000100010001, $0101000100010100, $0101000100010101, $0101000101000000, $0101000101000001, $0101000101000100, $0101000101000101, $0101000101010000, $0101000101010001, $0101000101010100, $0101000101010101,
$0101010000000000, $0101010000000001, $0101010000000100, $0101010000000101, $0101010000010000, $0101010000010001, $0101010000010100, $0101010000010101, $0101010001000000, $0101010001000001, $0101010001000100, $0101010001000101, $0101010001010000, $0101010001010001, $0101010001010100, $0101010001010101,
$0101010100000000, $0101010100000001, $0101010100000100, $0101010100000101, $0101010100010000, $0101010100010001, $0101010100010100, $0101010100010101, $0101010101000000, $0101010101000001, $0101010101000100, $0101010101000101, $0101010101010000, $0101010101010001, $0101010101010100, $0101010101010101);
PUint64DecPix : pointer = @Uint64DecPix;
// Table-lookup decoder: writes the 64-bit entry Uint64DecPix[EncPixels] over
// the eight bytes of DecPixels in a single assignment.
procedure DecodePixelsPS1Pas (EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
  // View the 8-byte array as one 64-bit value and overwrite it in one go.
  UINT64(DecPixels) := Uint64DecPix[EncPixels];
end;
// Same table lookup as DecodePixelsPS1Pas, declared inline so the compiler may
// expand it at the call site.
procedure DecodePixelsPS1PasInline (EncPixels: Byte; var DecPixels: TDecodedPixels);
 inline;
var
  Unpacked: UINT64;
begin
  // Fetch the precomputed 64-bit pattern, then reinterpret it as the byte array.
  Unpacked := Uint64DecPix[EncPixels];
  DecPixels := TDecodedPixels(Unpacked);
end;
// Table-lookup decoder written in assembly: loads the 8-byte entry at
// Uint64DecPix + 8 * (byte in AL) into XMM0 and stores it to the address in EDX.
// NOTE(review): assumes EncPixels arrives in AL and EDX holds the address of
// DecPixels (default register calling convention) - confirm.
procedure DecodePixelsPS1Asm (EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
// ECX := address of the lookup table.
lea ecx, Uint64DecPix //[<-Added in EDIT 3]
//mov ecx, dword ptr PUint64DecPix - alternative to the above line (slower for me)
// Zero-extend the byte so the full EAX can be used as a scaled index.
movzx eax, al
movq xmm0, [8*eax+ecx] //Using XMM rather than MMX so we don't have to issue emms at the end
movq [edx], xmm0 //use MOVQ because it doesn't need mem alignment
end;
PACKED ARRAY [0..7] of byte
Time1 : 2.51638266874701 ms. <- Delphi loop.
Time2 : 2.11277620479698 ms. <- Delphi unrolled loop.
Time3 : 2.21972066282167 ms. <- BASM loop.
Time4a : 1.34093090043567 ms. <- BASM unrolled loop.
Time4b : 1.52222070123437 ms. <- BASM unrolled loop instruction switch.
Time5 : 1.17106364076999 ms. <- Wouter van Nifterick
TimePS1 : 0.633099318488802 ms. <- PS.Pas
TimePS2 : 0.551617593856202 ms. <- PS.Pas Inline
TimePS3 : 0.70921094720139 ms. <- PS.Asm (speed for version before 3rd EDIT)
// Decoder built on the carry flag: "add al, al" doubles AL, which pushes its
// current top bit into CF, and "rcl ecx, n" rotates that carry into ECX.
// Four bits are collected per 32-bit word, one bit per byte, and each word is
// written with a single 32-bit store. Bits are consumed from bit 7 downwards,
// so the upper half of the output (offset 4) is written first.
// NOTE(review): assumes EncPixels arrives in AL and that [DecPixels + n]
// addresses the output array - confirm.
procedure DecodePixels(EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
// --- bits 7..4 -> output bytes 7..4 ---
xor ecx, ecx
// CF := bit 7
add al, al
rcl ecx, 8
// CF := bit 6
add al, al
rcl ecx, 8
// CF := bit 5
add al, al
rcl ecx, 8
// CF := bit 4; the final rotate is by 1 so this bit lands in bit 0 of ECX and
// the three earlier bits end up at bit 0 of the three higher bytes.
add al, al
rcl ecx, 1
mov [DecPixels + 4], ecx
// --- bits 3..0 -> output bytes 3..0, same pattern ---
xor ecx, ecx
add al, al
rcl ecx, 8
add al, al
rcl ecx, 8
add al, al
rcl ecx, 8
add al, al
rcl ecx, 1
mov [DecPixels], ecx
end;
// Carry-flag decoder that uses the byte registers CH and CL: "add al, al"
// pushes the top bit of AL into CF and "rcl ch, 1" / "rcl cl, 1" rotate it
// into the lowest bit of CH / CL. "ror ecx, 16" then moves the filled pair to
// the upper half of ECX so CH and CL can be reused for the next two bits.
// Each half of the output is written with one 32-bit store, upper half first.
// NOTE(review): assumes EncPixels arrives in AL and that [DecPixels + n]
// addresses the output array - confirm.
procedure DecodePixelsI(EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
// --- bits 7..4 -> output bytes 7..4 ---
mov ecx, 0 //Faster than: xor ecx, ecx
// bit 7 -> CH, bit 6 -> CL
add al, al
rcl ch, 1
add al, al
rcl cl, 1
// Move that pair into the two high bytes of ECX; CH and CL become 0 again.
ror ecx, 16
// bit 5 -> CH, bit 4 -> CL
add al, al
rcl ch, 1
add al, al
rcl cl, 1
mov [DecPixels + 4], ecx
// --- bits 3..0 -> output bytes 3..0, same pattern ---
mov ecx, 0 //Faster than: xor ecx, ecx
add al, al
rcl ch, 1
add al, al
rcl cl, 1
ror ecx, 16
add al, al
rcl ch, 1
add al, al
rcl cl, 1
mov [DecPixels], ecx
end;
// Flag-based decoder: each "add al, al" doubles AL so that its current top bit
// falls into CF, and "setc" stores that flag (0 or 1) directly to one output
// byte. Bits are consumed from bit 7 down to bit 1.
// NOTE(review): assumes EncPixels arrives in AL and that [DecPixels + n]
// addresses the output array - confirm.
procedure DecodePixelsX(EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
// bit 7
add al, al
setc byte ptr[DecPixels + 7]
// bit 6
add al, al
setc byte ptr[DecPixels + 6]
// bit 5
add al, al
setc byte ptr[DecPixels + 5]
// bit 4
add al, al
setc byte ptr[DecPixels + 4]
// bit 3
add al, al
setc byte ptr[DecPixels + 3]
// bit 2
add al, al
setc byte ptr[DecPixels + 2]
// bit 1
add al, al
setc byte ptr[DecPixels + 1]
// bit 0: after seven doublings only the original bit 0 can still be set in AL
// (now at bit 7), so the zero flag left by the last add is clear exactly when
// bit 0 was 1. No eighth add is needed.
setnz byte ptr[DecPixels]
end;
const Uint32DecPix : array [0..15] of cardinal = (
$00000000, $00000001, $00000100, $00000101,
$00010000, $00010001, $00010100, $00010101,
$01000000, $01000001, $01000100, $01000101,
$01010000, $01010001, $01010100, $01010101
);
// Nibble-table decoder: looks up each 4-bit half of EncPixels in Uint32DecPix
// (16 entries, one 32-bit pattern with a 0/1 byte per bit) and writes the two
// patterns over DecPixels[0..3] and DecPixels[4..7] respectively.
procedure DecodePixelsY(EncPixels: byte; var DecPixels: TDecodedPixels); inline;
begin
  // Low nibble -> first four output bytes.
  pcardinal(@DecPixels[0])^ := Uint32DecPix[EncPixels and $0F];
  // High nibble -> last four output bytes. The address is taken from the
  // array element itself instead of casting the pointer to Cardinal and
  // adding 4, which would truncate the address where pointers are wider than
  // 32 bits. Shifting a Byte right by 4 already leaves only the high nibble.
  pcardinal(@DecPixels[4])^ := Uint32DecPix[EncPixels shr 4];
end;
// Benchmark: expands every bit of `size` random bytes into one output byte
// (0 or 1), first with a plain C++ loop and then with SSE2 inline assembly
// that handles four source bytes per iteration, and prints the elapsed
// performance-counter ticks for both plus a memcmp of the two results
// (0 means the outputs are identical).
int main ()
{
  const int
    size = 0x100000,
    // Every source byte expands to exactly 8 output bytes; both decoders
    // write this many bytes and no more.
    decoded_size = size * 8;
  unsigned char
    *source = new unsigned char [size],
    *dest,
    *dest1 = new unsigned char [decoded_size],
    *dest2 = new unsigned char [decoded_size];
  for (int i = 0 ; i < size ; ++i)
  {
    source [i] = rand () & 0xff;
  }
  LARGE_INTEGER
    start,
    middle,
    end;
  QueryPerformanceCounter (&start);
  // Reference implementation: one shift-and-mask per bit.
  dest = dest1;
  for (int i = 0 ; i < size ; ++i)
  {
    unsigned char
      v = source [i];
    for (int b = 0 ; b < 8 ; ++b)
    {
      *(dest++) = (v >> b) & 1;
    }
  }
  unsigned char
    // Per-lane bit masks for two source bytes side by side.
    bits [] = {1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128},
    zero [] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
    ones [] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
  QueryPerformanceCounter (&middle);
  __asm
  {
    movdqu xmm1,bits
    movdqu xmm2,zero
    movdqu xmm3,ones
    // Iteration count: 4 source bytes per pass. Must stay equal to size / 4.
    mov ecx,0x100000/4
    mov esi,source
    mov edi,dest2
l1:
    // Load 4 source bytes into EAX and copy them into two XMM registers.
    lodsd
    movd xmm0,eax
    movd xmm4,eax
    // Replicate each source byte 8 times: after the unpacks xmm0 holds
    // byte 0 x8 | byte 1 x8 and xmm4 holds byte 2 x8 | byte 3 x8.
    punpcklbw xmm0,xmm0
    punpcklbw xmm4,xmm4
    punpcklwd xmm0,xmm0
    punpcklwd xmm4,xmm4
    punpckldq xmm0,xmm0
    punpckhdq xmm4,xmm4
    // Keep only the bit that belongs to each lane.
    pand xmm0,xmm1
    pand xmm4,xmm1
    // Lanes whose bit is clear become 0xFF, lanes whose bit is set become 0x00.
    pcmpeqb xmm0,xmm2
    pcmpeqb xmm4,xmm2
    // Adding 1 wraps 0xFF to 0 and turns 0x00 into 1.
    paddb xmm0,xmm3
    paddb xmm4,xmm3
    // 32 output bytes per iteration.
    movdqu [edi],xmm0
    movdqu [edi+16],xmm4
    add edi,32
    dec ecx
    jnz l1
  }
  QueryPerformanceCounter (&end);
  cout << "Time taken = " << (middle.QuadPart - start.QuadPart) << endl;
  cout << "Time taken = " << (end.QuadPart - middle.QuadPart) << endl;
  // Compare only the bytes that were actually written; comparing more would
  // read uninitialised memory and make the result meaningless.
  cout << "memcmp = " << memcmp (dest1, dest2, decoded_size) << endl;
  delete [] dest2;
  delete [] dest1;
  delete [] source;
  return 0;
}
/* input byte in eax, address to store result in edx */
and eax, 0xff /* may not be needed */
mov ebx, eax
shl ebx, 7
or eax, ebx
mov ebx, eax
shl ebx, 14
or eax, ebx
mov ebx, eax
and eax, 0x01010101
mov [edx], eax
shr ebx, 4
and ebx, 0x01010101
mov [edx+4], ebx
; BtBld - inverse of the decoders: packs eight bytes into one byte.
;   Array  : address of 8 bytes; Array[k] supplies bit k of the result
;            (each byte is expected to be 0 or 1).
;   Pixels : address of the byte that receives the packed value.
; Only EAX, ECX and EDX are modified; the bytes are read one at a time, so
; nothing outside Array[0..7] is touched.
BtBld PROC Array:DWORD, Pixels:DWORD
        mov   edx, [Array]
        xor   eax, eax                      ; AL accumulates the packed byte
        mov   ecx, 8                        ; ECX = index + 1, counts 8 down to 1
rpt:    shl   al, 1                         ; make room for the next lower bit
        or    al, byte ptr [edx + ecx - 1]  ; merge Array[ECX - 1], highest index first
        dec   ecx
        jnz   rpt
        mov   edx, [Pixels]
        mov   [edx], al
        ret
BtBld ENDP
4a Instruction Sequence:
AND - MOV - SHR
4b Instruction Sequence:
AND - SHR - MOV
4a: AND (piped) MOV (piped) SHR
4b: AND (WAIT) SHR (piped) MOV
mov bl, al;
and bl, $01; // data dep (bl)
mov bl, al -> (A:) read al (B:) write bl => (2 clocks in i386)
and bl, 01 -> (C:) read 01 (D:) write bl => idem
[--A--][--B--]
[--C--]<wait>[---D--]
// Overlay that lets the 8-byte pixel array be read or written as one 64-bit
// integer: both variants of the record occupy the same storage.
type TPackedDecodedPixels = record
case integer of
0: (a: TDecodedPixels);
1: (v: Int64);
end;
// Branch-free decoder based on a single 64-bit multiplication.
// Writes one byte (0 or 1) per bit of EncPixels into DecPixels through the
// TPackedDecodedPixels overlay.
procedure DecodePixels(EncPixels: byte; var DecPixels: TDecodedPixels); inline;
const
// Bits 0, 9, 18, 27, 36, 45, 54 and 63 are set: multiplying by this constant
// adds up copies of the input byte shifted left by those amounts. The copies
// are 9 bits apart, so the 8-bit copies never overlap.
magic = $8040201008040201;
// Selects the top bit (bit 7) of every byte of the product.
mask = $8080808080808080;
begin
// After the multiply, bit 7 of byte k of the product equals bit (7 - k) of
// EncPixels; "and mask" keeps only those bits and "shr 7" moves each one to
// bit 0 of its byte. SwapEndian then reverses the byte order so the bits are
// no longer in reverse sequence.
// NOTE(review): bit 0 ending up in DecPixels[0] presumes a little-endian
// memory layout, and the multiply relies on wrap-around of the 64-bit
// product - confirm overflow checking is off for this code.
TPackedDecodedPixels(DecPixels).v := SwapEndian(((EncPixels*magic) and mask) shr 7);
end;
0000000a 0000000b 0000000c 0000000d 0000000e 0000000f 0000000g 0000000h (1)
| b7 || b6 || b5 || b4 || b3 || b2 || b1 || b0 |
abcdefgh (1-byte value)
x 1000000001000000001000000001000000001000000001000000001000000001
────────────────────────────────────────────────────────────────
= h0abcdefgh0abcdefgh0abcdefgh0abcdefgh0abcdefgh0abcdefgh0abcdefgh
| b7 || b6 || b5 || b4 || b3 || b2 || b1 || b0 |
h0abcdefgh0abcdefgh0abcdefgh0abcdefgh0abcdefgh0abcdefgh0abcdefgh
& 1000000010000000100000001000000010000000100000001000000010000000
────────────────────────────────────────────────────────────────
= h0000000g0000000f0000000e0000000d0000000c0000000b0000000a0000000 (8-byte array)
0abcdefg
x 0000000000000010000001000000100000010000001000000100000010000001
────────────────────────────────────────────────────────────────
= 00000000abcdefgabcdefgabcdefgabcdefgabcdefgabcdefgabcdefgabcdefg
& 0000000000000001000000010000000100000001000000010000000100000001
────────────────────────────────────────────────────────────────
= 000000000000000a0000000b0000000c0000000d0000000e0000000f0000000g
// BMI2 one-liner: PDEP (parallel bit deposit) scatters the low 8 bits of
// EncPixels to the bit positions selected by the mask, i.e. to bit 0 of each
// of the eight bytes. PEXT is the opposite operation (it gathers the masked
// bits into the low end) and would only ever pick up bit 0 of the input here.
TPackedDecodedPixels(DecPixels).v := _pdep_u64(EncPixels, $0101010101010101);
mov al, dil
mov rbx, rsi
movzx edi, al
movabs rax, 0x8040201008040201
imul rdi, rax
movabs rax, 0x8080808080808080
and rdi, rax
shr rdi, 0x7
call 4016a0 <SYSTEM_$$_SWAPENDIAN$INT64$$INT64>
mov QWORD PTR [rbx], rax