Intel x86 assembly optimization techniques for expanding 8 bits to 8 boolean bytes of 0 or 1
In general, I'd personally stay away from trying to optimize code by using tricks on assembler level, unless you really need that extra 2 or 3% of speed, and you're willing to pay the price of code that is harder to read, maintain and port.
To squeeze that last 1%, you might even have to maintain several versions optimized per processor, and if newer processors and an improved pascal compiler comes along, you're not going to benefit from it.
This Delphi code is faster than your fastest assembler code:
procedure DecodePixels5(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
DecPixels[0] := (EncPixels shr 0) and $01;
DecPixels[1] := (EncPixels shr 1) and $01;
DecPixels[2] := (EncPixels shr 2) and $01;
DecPixels[3] := (EncPixels shr 3) and $01;
DecPixels[4] := (EncPixels shr 4) and $01;
DecPixels[5] := (EncPixels shr 5) and $01;
DecPixels[6] := (EncPixels shr 6) and $01;
DecPixels[7] := (EncPixels shr 7) and $01;
end;
Results:
Time1 : 1,03096806151283 ms. <- Delphi loop.
Time2 : 0,740308641141395 ms. <- Delphi unrolled loop.
Time3 : 0,996602425688886 ms. <- BASM loop.
Time4a : 0,608267951561275 ms. <- BASM unrolled loop.
Time4b : 0,574162510648039 ms. <- BASM unrolled loop instruction switch.
Time5 : 0,499628206138524 ms. !!! <- Delphi unrolled loop 5.
It's fast because the operations can be done with registers only, instead of needing to store and fetch memory. Modern processors execute this partly in parallel (a new operation can be started before the previous finished), because the results of the consecutive instructions are independent of each other.
The machine code looks like this:
push ebx;
// DecPixels[0] := (EncPixels shr 0) and 1;
movzx ecx,al
mov ebx,ecx
// shr ebx,$00
and bl,$01
mov [edx],bl
// DecPixels[1] := (EncPixels shr 1) and 1;
mov ebx,ecx
shr ebx,1
and bl,$01
mov [edx+$01],bl
// DecPixels[2] := (EncPixels shr 2) and 1;
mov ebx,ecx
shr ebx,$02
and bl,$01
mov [edx+$02],bl
// DecPixels[3] := (EncPixels shr 3) and 1;
mov ebx,ecx
shr ebx,$03
and bl,$01
mov [edx+$03],bl
// DecPixels[4] := (EncPixels shr 4) and 1;
mov ebx,ecx
shr ebx,$04
and bl,$01
mov [edx+$04],bl
// DecPixels[5] := (EncPixels shr 5) and 1;
mov ebx,ecx
shr ebx,$05
and bl,$01
mov [edx+$05],bl
// DecPixels[6] := (EncPixels shr 6) and 1;
mov ebx,ecx
shr ebx,$06
and bl,$01
mov [edx+$06],bl
// DecPixels[7] := (EncPixels shr 7) and 1;
shr ecx,$07
and cl,$01
mov [edx+$07],cl
pop ebx;
Edit: As suggested, a table lookup is indeed faster.
var
PixelLookup:Array[byte] of TDecodedPixels;
// You could precalculate, but the performance gain would hardly be worth it because you call this once only.
for I := 0 to 255 do
DecodePixels5b(I, PixelLookup[I]);
procedure DecodePixels7(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
DecPixels := PixelLookup[EncPixels];
end;
Results:
Time1 : 1,03096806151283 ms. <- Delphi loop.
Time2 : 0,740308641141395 ms. <- Delphi unrolled loop.
Time3 : 0,996602425688886 ms. <- BASM loop.
Time4a : 0,608267951561275 ms. <- BASM unrolled loop.
Time4b : 0,574162510648039 ms. <- BASM unrolled loop instruction switch.
Time5 : 0,499628206138524 ms. !!! <- Delphi unrolled loop 5.
Time7 : 0,251533475182096 ms. <- simple table lookup
Your asm code is relativity slow because use stack end write 8 times to memory. Check this one...
procedure DecodePixels(EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
xor ecx, ecx
add al, al
rcl ecx, 8
add al, al
rcl ecx, 8
add al, al
rcl ecx, 8
add al, al
rcl ecx, 1
mov [DecPixels + 4], ecx
xor ecx, ecx
add al, al
rcl ecx, 8
add al, al
rcl ecx, 8
add al, al
rcl ecx, 8
add al, al
rcl ecx, 1
mov [DecPixels], ecx
end;
Maybe is even faster than code with lookup table!
Improved version:
procedure DecodePixelsI(EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
mov ecx, 0 //Faster than: xor ecx, ecx
add al, al
rcl ch, 1
add al, al
rcl cl, 1
ror ecx, 16
add al, al
rcl ch, 1
add al, al
rcl cl, 1
mov [DecPixels + 4], ecx
mov ecx, 0 //Faster than: xor ecx, ecx
add al, al
rcl ch, 1
add al, al
rcl cl, 1
ror ecx, 16
add al, al
rcl ch, 1
add al, al
rcl cl, 1
mov [DecPixels], ecx
end;
Version 3:
procedure DecodePixelsX(EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
add al, al
setc byte ptr[DecPixels + 7]
add al, al
setc byte ptr[DecPixels + 6]
add al, al
setc byte ptr[DecPixels + 5]
add al, al
setc byte ptr[DecPixels + 4]
add al, al
setc byte ptr[DecPixels + 3]
add al, al
setc byte ptr[DecPixels + 2]
add al, al
setc byte ptr[DecPixels + 1]
setnz byte ptr[DecPixels]
end;
Version 4:
const Uint32DecPix : array [0..15] of cardinal = (
$00000000, $00000001, $00000100, $00000101,
$00010000, $00010001, $00010100, $00010101,
$01000000, $01000001, $01000100, $01000101,
$01010000, $01010001, $01010100, $01010101
);
procedure DecodePixelsY(EncPixels: byte; var DecPixels: TDecodedPixels); inline;
begin
pcardinal(@DecPixels)^ := Uint32DecPix[EncPixels and $0F];
pcardinal(cardinal(@DecPixels) + 4)^ := Uint32DecPix[(EncPixels and $F0) shr 4];
end;
Expanding on Nick D's answer, I tried the following table-lookup based versions, all of which are faster than the implementations you give (and faster than Wouter van Nifterick's code).
Given the following packed array:
const Uint64DecPix : PACKED ARRAY [0..255] OF UINT64 =
( $0000000000000000, $0000000000000001, $0000000000000100, $0000000000000101, $0000000000010000, $0000000000010001, $0000000000010100, $0000000000010101, $0000000001000000, $0000000001000001, $0000000001000100, $0000000001000101, $0000000001010000, $0000000001010001, $0000000001010100, $0000000001010101,
$0000000100000000, $0000000100000001, $0000000100000100, $0000000100000101, $0000000100010000, $0000000100010001, $0000000100010100, $0000000100010101, $0000000101000000, $0000000101000001, $0000000101000100, $0000000101000101, $0000000101010000, $0000000101010001, $0000000101010100, $0000000101010101,
$0000010000000000, $0000010000000001, $0000010000000100, $0000010000000101, $0000010000010000, $0000010000010001, $0000010000010100, $0000010000010101, $0000010001000000, $0000010001000001, $0000010001000100, $0000010001000101, $0000010001010000, $0000010001010001, $0000010001010100, $0000010001010101,
$0000010100000000, $0000010100000001, $0000010100000100, $0000010100000101, $0000010100010000, $0000010100010001, $0000010100010100, $0000010100010101, $0000010101000000, $0000010101000001, $0000010101000100, $0000010101000101, $0000010101010000, $0000010101010001, $0000010101010100, $0000010101010101,
$0001000000000000, $0001000000000001, $0001000000000100, $0001000000000101, $0001000000010000, $0001000000010001, $0001000000010100, $0001000000010101, $0001000001000000, $0001000001000001, $0001000001000100, $0001000001000101, $0001000001010000, $0001000001010001, $0001000001010100, $0001000001010101,
$0001000100000000, $0001000100000001, $0001000100000100, $0001000100000101, $0001000100010000, $0001000100010001, $0001000100010100, $0001000100010101, $0001000101000000, $0001000101000001, $0001000101000100, $0001000101000101, $0001000101010000, $0001000101010001, $0001000101010100, $0001000101010101,
$0001010000000000, $0001010000000001, $0001010000000100, $0001010000000101, $0001010000010000, $0001010000010001, $0001010000010100, $0001010000010101, $0001010001000000, $0001010001000001, $0001010001000100, $0001010001000101, $0001010001010000, $0001010001010001, $0001010001010100, $0001010001010101,
$0001010100000000, $0001010100000001, $0001010100000100, $0001010100000101, $0001010100010000, $0001010100010001, $0001010100010100, $0001010100010101, $0001010101000000, $0001010101000001, $0001010101000100, $0001010101000101, $0001010101010000, $0001010101010001, $0001010101010100, $0001010101010101,
$0100000000000000, $0100000000000001, $0100000000000100, $0100000000000101, $0100000000010000, $0100000000010001, $0100000000010100, $0100000000010101, $0100000001000000, $0100000001000001, $0100000001000100, $0100000001000101, $0100000001010000, $0100000001010001, $0100000001010100, $0100000001010101,
$0100000100000000, $0100000100000001, $0100000100000100, $0100000100000101, $0100000100010000, $0100000100010001, $0100000100010100, $0100000100010101, $0100000101000000, $0100000101000001, $0100000101000100, $0100000101000101, $0100000101010000, $0100000101010001, $0100000101010100, $0100000101010101,
$0100010000000000, $0100010000000001, $0100010000000100, $0100010000000101, $0100010000010000, $0100010000010001, $0100010000010100, $0100010000010101, $0100010001000000, $0100010001000001, $0100010001000100, $0100010001000101, $0100010001010000, $0100010001010001, $0100010001010100, $0100010001010101,
$0100010100000000, $0100010100000001, $0100010100000100, $0100010100000101, $0100010100010000, $0100010100010001, $0100010100010100, $0100010100010101, $0100010101000000, $0100010101000001, $0100010101000100, $0100010101000101, $0100010101010000, $0100010101010001, $0100010101010100, $0100010101010101,
$0101000000000000, $0101000000000001, $0101000000000100, $0101000000000101, $0101000000010000, $0101000000010001, $0101000000010100, $0101000000010101, $0101000001000000, $0101000001000001, $0101000001000100, $0101000001000101, $0101000001010000, $0101000001010001, $0101000001010100, $0101000001010101,
$0101000100000000, $0101000100000001, $0101000100000100, $0101000100000101, $0101000100010000, $0101000100010001, $0101000100010100, $0101000100010101, $0101000101000000, $0101000101000001, $0101000101000100, $0101000101000101, $0101000101010000, $0101000101010001, $0101000101010100, $0101000101010101,
$0101010000000000, $0101010000000001, $0101010000000100, $0101010000000101, $0101010000010000, $0101010000010001, $0101010000010100, $0101010000010101, $0101010001000000, $0101010001000001, $0101010001000100, $0101010001000101, $0101010001010000, $0101010001010001, $0101010001010100, $0101010001010101,
$0101010100000000, $0101010100000001, $0101010100000100, $0101010100000101, $0101010100010000, $0101010100010001, $0101010100010100, $0101010100010101, $0101010101000000, $0101010101000001, $0101010101000100, $0101010101000101, $0101010101010000, $0101010101010001, $0101010101010100, $0101010101010101);
PUint64DecPix : pointer = @Uint64DecPix;
you can write the following:
procedure DecodePixelsPS1Pas (EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
DecPixels := TDecodedPixels(Uint64DecPix[EncPixels]);
end;
procedure DecodePixelsPS1PasInline (EncPixels: Byte; var DecPixels: TDecodedPixels);
inline;
begin
DecPixels := TDecodedPixels(Uint64DecPix[EncPixels]);
end;
procedure DecodePixelsPS1Asm (EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
lea ecx, Uint64DecPix //[<-Added in EDIT 3]
//mov ecx, dword ptr PUint64DecPix - alternative to the above line (slower for me)
movzx eax, al
movq xmm0, [8*eax+ecx] //Using XMM rather than MMX so we don't have to issue emms at the end
movq [edx], xmm0 //use MOVQ because it doesn't need mem alignment
end;
The standard PAS and ASM implementations are fairly similar speed-wise, but the PAS implementation marked with "INLINE" is the fastest because it gets rid of all the call/ret involved in calling the routine.
--EDIT--: I forgot to say: since you are implicitly assuming something about the memory layout of your TDecodedPixels structure, it would be better if you declare it as
PACKED ARRAY [0..7] of byte
--EDIT2--: Here are my results for comparison:
Time1 : 2.51638266874701 ms. <- Delphi loop.
Time2 : 2.11277620479698 ms. <- Delphi unrolled loop.
Time3 : 2.21972066282167 ms. <- BASM loop.
Time4a : 1.34093090043567 ms. <- BASM unrolled loop.
Time4b : 1.52222070123437 ms. <- BASM unrolled loop instruction switch.
Time5 : 1.17106364076999 ms. <- Wouter van Nifterick
TimePS1 : 0.633099318488802 ms. <- PS.Pas
TimePS2 : 0.551617593856202 ms. <- PS.Pas Inline
TimePS3 : 0.70921094720139 ms. <- PS.Asm (speed for version before 3rd EDIT)