How can I best optimize this code?
; Translates 1024*768 16-bit words in place: each word at fs:[edi] is used as
; an index into a table of words at fs:[esi] and is replaced by the entry found.
; edi and esi are not set in this snippet; they must already hold the offsets.
mov ecx,1024*768           ; ecx = number of words to process
cycle:
xor ebx,ebx                ; clear ebx so the 16-bit load below leaves a zero-extended index
mov bx,word ptr fs:[edi]   ; bx = current word
shl ebx,1                  ; index * 2 = byte offset into a table of words
mov ax,fs:[esi+ebx]        ; ax = table entry for that index
mov fs:[edi],ax            ; overwrite the word with its table entry
add edi,2                  ; advance to the next word
dec ecx
cmp ecx,0                  ; redundant: dec has already set ZF from its result
jnz cycle                  ; loop until ecx reaches 0
A few tweaks (not necessarily the best possible optimization):
; Same in-place table translation with less work per iteration.
; edi and esi are not set in this snippet; they must already hold the offsets.
xor eax,eax                ; cleared once; the loop only writes ax
xor ebx,ebx                ; cleared once; the loop only writes bx, so the upper half of ebx stays 0
mov ecx,1024*768           ; ecx = number of words to process
cycle:
mov bx,word ptr fs:[edi]   ; ebx = zero-extended index (upper half cleared above)
mov ax,fs:[esi+2*ebx]      ; scaled addressing replaces the separate shl
mov fs:[edi],ax            ; overwrite the word with its table entry
add edi,2                  ; advance to the next word
sub ecx,1                  ; sets ZF itself, so no cmp is needed
jnz cycle                  ; loop until ecx reaches 0
If you can avoid using the fs override, then I would - it is slow.
this?
; Variant that copies the fs selector into ds so the loop needs no fs: overrides.
; edi and esi are not set in this snippet; they must already hold the offsets.
push ds                    ; save ds; restored by the final pop
push fs
pop ds                     ; ds = fs
 xor eax,eax               ; cleared once; the loop only writes ax
 xor ebx,ebx               ; cleared once; the loop only writes bx, so the upper half of ebx stays 0
 mov ecx,1024*768          ; ecx = number of words to process
mov edx,2                  ; pointer step kept in a register
mov ebp,1                  ; counter step kept in a register; ebp is overwritten and not restored here
cycle:
 mov bx,[edi]              ; no override: ds (now equal to fs) is the default segment
 mov ax,[esi+2*ebx]        ; ax = table entry for that index
 mov [edi],ax              ; overwrite the word with its table entry
 add edi,edx               ; advance by 2 bytes
 sub ecx,ebp               ; count down by 1; ZF drives the branch
 jnz cycle
pop ds                     ; restore the caller's ds
you could probably use mmx/xmm to speed this up
Quote from: korte on November 13, 2007, 09:23:43 PM
this?
cycle:
 mov bx,[edi]              ; 16-bit write: the upper half of ebx keeps its previous value
 mov ax,[esi+2*ebx]        ; 32-bit read of ebx directly after the 16-bit write
 mov [edi],ax              ; overwrite the word with its table entry
 add edi,edx               ; edx = 2 in the quoted post's setup
 sub ecx,ebp               ; ebp = 1 in the quoted post's setup
 jnz cycle
This could probably be improved further:
cycle:
movzx ebx, word ptr [edi]        ; zero-extending load writes all 32 bits of ebx
movzx eax, word ptr [esi+2*ebx]  ; likewise for eax; only ax is stored below
mov [edi], ax                    ; overwrite the word with its table entry
add edi, edx                     ; presumably edx = 2 as in the quoted post's setup - confirm
sub ecx, ebp                     ; presumably ebp = 1 as in the quoted post's setup - confirm
jnz cycle
...which also allows the setup to be removed:
xor eax, eax    ; not needed once eax is loaded with movzx, which writes the whole register
xor ebx, ebx    ; not needed once ebx is loaded with movzx, for the same reason
The gist here is to avoid what are called "false dependencies". It doesn't look like there will be any, because the loop body doesn't write to eax or ebx more than once per iteration. However, a modern CPU can have several iterations of this loop in the pipeline at once (as if you had unrolled it), and register renaming is thwarted when you write to these registers in 16-bit form but read from them in 32-bit form.
The benefits of this sort of thing are CPU-dependent (the Core 2 probably benefits the most?), but it is unlikely to hurt performance on machines that cannot actually benefit, because movzx in isolation has the same performance characteristics as a regular mov.
Quote from: Kernel_Gaddafi on November 13, 2007, 11:43:11 PM
you could probably use mmx/xmm to speed this up
How would that work?
Remember that you cannot use an MMX or XMM register as a pointer; in 32-bit mode only the 8 "general purpose" registers (eax, ebx, ecx, edx, esi, edi, ebp, and esp) can be used as pointers.
This is what I see:
You have a 1024x768 input buffer of 16-bit color indexes, an input palette of 65536 16-bit (or 15-bit?) color values, and a 1024x768 output buffer (the same memory as the input buffer) of 16-bit color values.
The very simple thing you are doing is translating the paletted image into a colored image.
Here is my "best attempt", which has not been profiled:
; In-place palette lookup driven by one negative byte offset that counts up to zero,
; so the add that advances the offset also produces the loop condition.
; Overwrites eax, esi, edi and ebp; none of them is saved or restored here.
; NOTE(review): assumes `mov reg, symbol` loads the symbol's address; MASM needs `offset` - confirm for your assembler.
; NOTE(review): if the assembler encodes ebp as the base of [edi + ebp], the access defaults to ss instead of ds - confirm (no difference in a flat model).
; imagesize = buffer size in bytes: 1024*768 words of 2 bytes each
imagesize EQU (1024 * 768 * 2)
mov esi, palette                   ; esi = palette base
mov edi, pixelbuffer + imagesize   ; edi = one byte past the end of the buffer
mov ebp, -imagesize                ; edi + ebp = first word of the buffer
@@do:
movzx eax, word ptr [edi + ebp]    ; eax = zero-extended index
movzx eax, word ptr [esi + 2*eax]  ; eax = palette entry (2 bytes per entry)
mov [edi + ebp], ax                ; overwrite the index with its palette entry
add ebp, 2                         ; next word; SF is set while the offset is still negative
js @@do                            ; falls through when ebp reaches 0
and unrolled 4 times:
; In-place palette lookup, unrolled to handle 4 words (8 bytes) per iteration.
; One negative byte offset in ebp counts up to zero, so the add that advances
; the offset also produces the loop condition.
;
; Requires imagesize to be a multiple of 8; 1024*768*2 = 1572864 satisfies this.
; Overwrites eax, ebx, ecx, edx, esi, edi and ebp; none is saved or restored here.
; The buffer symbol is `pixelbuffer`, matching the one-word-per-iteration version.
; NOTE(review): assumes `mov reg, symbol` loads the symbol's address; MASM needs `offset` - confirm for your assembler.
; imagesize = buffer size in bytes: 1024*768 words of 2 bytes each
imagesize EQU (1024 * 768 * 2)
mov esi, palette                       ; esi = palette base
mov edi, pixelbuffer + imagesize       ; edi = one byte past the end of the buffer
mov ebp, -imagesize                    ; edi + ebp = first word of the buffer
@@do:
; load four independent indexes
movzx eax, word ptr [edi + ebp + 0]
movzx ebx, word ptr [edi + ebp + 2]
movzx ecx, word ptr [edi + ebp + 4]
movzx edx, word ptr [edi + ebp + 6]
; look each one up in the palette (2 bytes per entry)
movzx eax, word ptr [esi + 2*eax]
movzx ebx, word ptr [esi + 2*ebx]
movzx ecx, word ptr [esi + 2*ecx]
movzx edx, word ptr [esi + 2*edx]
; overwrite the four indexes with their palette entries
mov [edi + ebp + 0], ax
mov [edi + ebp + 2], bx
mov [edi + ebp + 4], cx
mov [edi + ebp + 6], dx
add ebp, 8                             ; next group of 4 words; SF is set while the offset is still negative
js @@do                                ; falls through when ebp reaches 0
I use a 1024*768 screen with 16-bit pixels.
The table defines a gray scale.
This code converts an RGB screen to a grayscale screen.