Hi all,
I wrote the following code, but the loop does not jump backward as I expected,
so it does not work. Why?
; Intended: compare 10 bytes at [esi] and [edi]; return CF = 1 at the first
; mismatch, CF = 0 if all bytes match. As written, only the first byte pair is
; compared (see the note at the LOOP line).
mov ecx, 10 ; ecx = number of bytes to compare
xor edx, edx ; edx = byte index, starts at 0
@@: movzx eax, byte ptr [esi + edx] ; al = current byte from [esi]
movzx ebx, byte ptr [edi + edx] ; bl = current byte from [edi]
inc edx
cmp al, bl
je @F ; bytes equal: skip the failure exit
stc ; mismatch: CF = 1
ret
;
; NOTE: @B means "the nearest @@ before this point", and the @@ on this very
; line is that label. LOOP therefore branches to itself until ECX reaches 0
; and never returns to the comparison above. A named label is needed there.
@@: loop short @B
clc ; CF = 0
ret
But this version works. What is wrong with the backward loop in the first one?
; Compare 10 bytes of [esi] against [edi].
;   Out:      CF = 0 if all 10 bytes match, CF = 1 at the first mismatch
;   Clobbers: eax, ebx, ecx, edx, flags
        mov     ecx, 10                         ; bytes left to compare
        xor     edx, edx                        ; byte index
cmp_next:
        movzx   eax, byte ptr [esi + edx]
        movzx   ebx, byte ptr [edi + edx]
        inc     edx
        cmp     al, bl
        jne     cmp_differ                      ; unlikely path jumps away
        loop    short cmp_next                  ; named target: really goes back to the loads
        clc                                     ; every byte matched
        ret
cmp_differ:
        stc                                     ; mismatch found
        ret
Rui
I have wasted hours on simple things like these. You intend to do something and it does not happen.
@@: loop short @B ; You are going nowhere but back to the same line.
:bg
You can also simplify your algo by
; Compare ECX bytes of [esi] and [edi], starting at index EDX.
;   In:       ecx = byte count (must be > 0), edx = start index (normally 0)
;   Out:      CF = 0 if every byte matched, CF = 1 if a mismatch was found
;   Clobbers: eax, ebx, ecx, edx, flags
@@:     movzx   eax, byte ptr [edx + esi]
        movzx   ebx, byte ptr [edx + edi]       ; must load EBX: CMP reads BL, and EDX is the index
        inc     edx
        cmp     al, bl
        loope   @B                              ; repeat while equal and ECX != 0 (LOOPNZ would quit on the first match)
        je      @F                              ; LOOPE leaves the flags alone: ZF is still from CMP
        stc                                     ; last pair differed
        ret
@@:     clc                                     ; ran out of bytes with everything equal
        ret
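Qualifying LOOP with a condition (LOOPE/LOOPZ, LOOPNE/LOOPNZ) makes it also test ZF as left by the previous comparison: the loop repeats only while that condition holds and ECX has not reached zero; otherwise execution falls out of the loop.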
Rui,
To MASM all of these codings look the same:
@@: loop short @B
@@:
loop short @B
@@:
loop short @B
The loop will run, but all it will do is execute the loop instruction ECX times, and then fall through to the next instruction.
Hi all,
Thank you for your answers.
MichaelW,
Yes, it is the same! I was expecting it to jump backward to the earlier label, not to the same line, but it does not.
Sometimes we waste time on questions like this, which is why I asked here.
Tight_Coder_Ex,
Well seen ! Thank you. Do you know better procs to copy strings other than
; Copy ECX bytes from [esi] to [edi], last byte first.
; NOTE(review): with ECX = 0 on entry the first DEC wraps to 0FFFFFFFFh, so
; the caller presumably guarantees a non-zero count - confirm.
@@: dec ecx
movzx eax, byte ptr [esi + ecx]
mov byte ptr [edi + ecx], al
jnz @B ; tests ZF from DEC: MOVZX and MOV leave the flags unchanged
Thanks
Rui
Quote from: RuiLoureiro on January 17, 2008, 08:56:32 PM
Do you know better procs to copy strings other than
@@: dec ecx
movzx eax, byte ptr [esi + ecx]
mov byte ptr [edi + ecx], al
jnz @B
Thanks
Rui
Yes, EDI & ESI are designed just for that purpose, maybe not exclusively.
; Copy BlockSize bytes from Source to Destination with the string-move instruction.
; NOTE(review): REP MOVSB only copies upward when the direction flag is clear;
; presumably DF = 0 here (no CLD is shown) - confirm.
lea esi, Source
lea edi, Destination
mov ecx, BlockSize
; Your code is replaced by this single statement
rep movsb
Quote from: RuiLoureiro on January 17, 2008, 08:56:32 PM
Do you know better procs to copy strings
if you are not afraid to use mmx, the syntax :
mov esi,OFFSET SourceString
mov edi,OFFSET DestinationString
call Mmx_StrCopy ; returns the length of the string in eax; best result if DestinationString is aligned...
the corresponding proc :
;-----------------------------------------------------------------------
; Mmx_StrCopy - copy a zero-terminated string and return its length.
;   In:    esi = source string, edi = destination buffer
;   Out:   eax = length in bytes, terminator not counted
;   Saved: ecx, edx (push/pop); esi and edi are not modified
;   Uses:  MM0-MM3, flags
;   NOTE:  the source is read 16 bytes (then 4 bytes) at a time, so up to
;          15 bytes past the terminator may be read. Nothing is written
;          past the terminator.
;   NOTE(review): PMOVMSKB on an MMX register is an SSE-era instruction;
;          presumably the file needs .XMM in addition to .MMX - confirm.
;   EMMS is executed before returning so the caller can use x87
;   instructions again (MMX registers alias the x87 register stack).
;-----------------------------------------------------------------------
Mmx_StrCopy PROC
        push    ecx
        push    edx
        mov     eax, -16                        ; first ADD at sc_scan16 makes the offset 0
        pxor    MM2, MM2                        ; MM2/MM3 = 0: compared against the data below
        pxor    MM3, MM3
        jmp     sc_scan16
        nop                                     ; padding kept from the original (loop placement)
        nop
        nop
        nop
        nop
; --- 16 bytes per iteration while the block holds no zero byte -----------
sc_store16:
        movq    QWORD PTR [edi+eax], MM0        ; block had no terminator: store it
        movq    QWORD PTR [edi+eax+8], MM1
sc_scan16:
        add     eax, 16
        movq    MM0, QWORD PTR [esi+eax]
        movq    MM1, QWORD PTR [esi+eax+8]
        pcmpeqb MM2, MM0                        ; MM2/MM3 are 0 on entry to every pass, so this
        pcmpeqb MM3, MM1                        ;   flags bytes equal to 0 (and leaves 0 if none)
        por     MM2, MM3
        pmovmskb edx, MM2                       ; edx != 0 <=> some byte of the block is 0
        test    edx, edx
        jz      sc_store16
        jmp     sc_scan4                        ; terminator is somewhere in this block
; --- 4 bytes per iteration inside the final block ------------------------
sc_store4:
        add     edx, 01010101h                  ; undo the subtraction: edx = loaded dword
        mov     DWORD PTR [edi+eax], edx
        add     eax, 4
sc_scan4:
        mov     ecx, DWORD PTR [esi+eax]
        lea     edx, [ecx-01010101h]
        xor     ecx, edx                        ; bits changed by the subtraction ...
        and     ecx, 80808080h                  ; ... keep only bit 7 of each byte
        jz      sc_store4
        and     ecx, edx                        ; bit 7 went 0 -> 1: the dword holds a zero byte
        jz      sc_store4
; --- final dword: store byte by byte up to and including the terminator ---
        add     edx, 01010101h                  ; edx = loaded dword again
        mov     [edi+eax], dl
        test    dl, dl
        jz      sc_done                         ; terminator is byte 0
        mov     [edi+eax+1], dh
        test    dh, dh
        jz      sc_nul_at1                      ; terminator is byte 1
        shr     edx, 16
        mov     [edi+eax+2], dl
        test    dl, dl
        jz      sc_nul_at2                      ; terminator is byte 2
        mov     [edi+eax+3], dh                 ; otherwise byte 3 must be the terminator
        inc     eax
sc_nul_at2:
        inc     eax
sc_nul_at1:
        inc     eax
sc_done:
        emms                                    ; leave MMX state; does not touch eax or flags
        pop     edx
        pop     ecx
        ret
Mmx_StrCopy ENDP
Hi NightWare,
Thank you !
Hi Tight_Coder_Ex,
; Your code is preplaced by this single statement
rep movsb
But rep movsb does not look like a good solution.
I used MichaelW's timing macros, and rep movsb takes 284-285 cycles against 97-101 cycles for the loop to copy 20 bytes.
Here is the code:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
include \masm32\include\masm32rt.inc
.686
include \masm32\macros\timers.asm
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.data
db 20
_Str1 db "Tudo igual ao litro "
db 20
_Str2 db "Tudo igual ao litro "
;+++++++++++++++++++++++++++++++++++++++++++++++
.code
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; RepCopy - copy Len bytes from Fnt to Dst with a single REP MOVSB.
;   Fnt = source address, Dst = destination address, Len = byte count
;   ESI and EDI are saved and restored (USES); ECX and the direction flag
;   are changed; EAX is not touched.
RepCopy proc uses esi edi Fnt:DWORD, Dst:DWORD, Len:DWORD
        mov     ecx, Len                        ; repeat count
        mov     edi, Dst
        mov     esi, Fnt
        cld                                     ; copy towards higher addresses
        rep     movsb
        ret
RepCopy endp
;......................................................
; LopCopy - copy Len bytes from Fnt to Dst one byte at a time, last byte first.
;   Fnt = source address, Dst = destination address, Len = byte count
;   ESI and EDI are preserved; EAX, ECX and flags are changed.
;   Len = 0 copies nothing: without the guard DEC would wrap ECX to
;   0FFFFFFFFh and the loop would run far outside both buffers.
LopCopy proc Fnt:DWORD, Dst:DWORD, Len:DWORD
        push    esi
        push    edi
        ;
        mov     esi, Fnt
        mov     edi, Dst
        mov     ecx, Len
        test    ecx, ecx
        jz      lc_done                         ; nothing to copy
        ;
lc_next:
        dec     ecx                             ; ecx = index of the byte to move
        movzx   eax, byte ptr [esi + ecx]
        mov     byte ptr [edi + ecx], al
        jnz     lc_next                         ; ZF is still from DEC (moves do not change flags)
        ;
lc_done:
        pop     edi
        pop     esi
        ret
LopCopy endp
;......................................................
; LopCopy2 - same copy as LopCopy, but counted with the LOOP instruction.
;   Fnt = source address, Dst = destination address, Len = byte count
;   ESI and EDI are preserved; EAX, ECX and flags are changed.
;   Len = 0 copies nothing: without the guard the first access would be at
;   [esi-1] and LOOP would wrap ECX to 0FFFFFFFFh.
LopCopy2 proc Fnt:DWORD, Dst:DWORD, Len:DWORD
        push    esi
        push    edi
        ;
        mov     esi, Fnt
        mov     edi, Dst
        mov     ecx, Len
        test    ecx, ecx
        jz      lc2_done                        ; nothing to copy
        ;
lc2_next:
        movzx   eax, byte ptr [esi + ecx-1]     ; ecx counts from Len down to 1, so index = ecx-1
        mov     byte ptr [edi + ecx-1], al
        loop    lc2_next                        ; LOOP kept on purpose: this proc exists to time it
        ;
lc2_done:
        pop     edi
        pop     esi
        ret
LopCopy2 endp
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; Program entry: times each copy routine on the 20-byte string _Str1 -> _Str2
; and prints one "<n> cycles" line per measurement.
; counter_begin / counter_end come from the included timers.asm; presumably
; they repeat the enclosed code the given number of times and leave the
; cycle count in EAX (it is what gets printed) - confirm against timers.asm.
; The cycle figures in the trailing comments are the author's own measurements.
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
invoke Sleep, 4000
; ##############################################################################################
; --- length passed as the constant 20 ---
counter_begin 1000000, HIGH_PRIORITY_CLASS
invoke RepCopy, offset _Str1, offset _Str2, 20 ;285 - 284 cycles
counter_end
print ustr$(eax)," cycles",13,10
;------------------------------------------------------------------
counter_begin 1000000, HIGH_PRIORITY_CLASS
invoke LopCopy, offset _Str1, offset _Str2, 20 ; 97 - 101 cycles
counter_end
print ustr$(eax)," cycles",13,10
;++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
; --- length read from the byte stored just before the string (the "db 20") ---
counter_begin 1000000, HIGH_PRIORITY_CLASS
mov esi, offset _Str1
movzx eax, byte ptr [esi - 1] ; eax = length byte preceding _Str1
invoke RepCopy, esi, offset _Str2, eax ; 283 - 284 cycles
counter_end
print ustr$(eax)," cycles",13,10
;------------------------------------------------------------------
counter_begin 1000000, HIGH_PRIORITY_CLASS
mov esi, offset _Str1
movzx eax, byte ptr [esi - 1] ; eax = length byte preceding _Str1
invoke LopCopy, esi, offset _Str2, eax ; 98 - 101 cycles
counter_end
print ustr$(eax)," cycles",13,10
;------------------------------------------------------------------
counter_begin 1000000, HIGH_PRIORITY_CLASS
mov esi, offset _Str1
movzx eax, byte ptr [esi - 1] ; eax = length byte preceding _Str1
invoke LopCopy2, esi, offset _Str2, eax ; 140 - 141 cycles
counter_end
print ustr$(eax)," cycles",13,10
inkey "Press any key to exit..."
exit
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start
Rui
NightWare:
- If our input and output buffers are big enough (+16 bytes), we can omit
the slower part of strlen in your code.
- I am wondering why you preserve the ecx and edx registers rather than esi and edi... :wink
; MMX string copy, 16 bytes per iteration (bare procedure body).
;   In:    ecx = address of source buffer, eax = address of target buffer
;   Out:   eax = string length, not counting the terminating 0
;   Saved: ebx, edi (push/pop); ecx is not modified
;   Uses:  edx, MM0-MM2, flags
;   NOTE:  whole 16-byte blocks are stored, and the next block is loaded
;          before the current one is tested, so both buffers need slack
;          after the terminator.
;   EMMS is executed on both exits so the caller can use x87 code again.
        push    edi
        xor     edx, edx                        ; edx = running offset
        push    ebx
        pxor    MM1, MM1                        ; MM1 = 0, the value searched for
        movq    MM0, [ecx]
        movq    MM2, [ecx+8]
@@:
        movq    [eax+edx], MM0                  ; store the current 16 bytes
        movq    [eax+edx+8], MM2
        add     edx, 16
        pcmpeqb MM0, MM1                        ; 0FFh in every byte position that held 0
        pcmpeqb MM2, MM1
        packsswb MM0, MM0                       ; squeeze 8 result bytes into 4 (one per byte pair)
        packsswb MM2, MM2
        movd    ebx, MM0                        ; ebx != 0: terminator in the low 8 bytes
        movd    edi, MM2                        ; edi != 0: terminator in the high 8 bytes
        movq    MM0, [ecx+edx]                  ; pre-load the next block
        movq    MM2, [ecx+edx+8]
        test    ebx, ebx
        jne     @F
        test    edi, edi
        je      @B
        ; terminator in the high 8 bytes of the block just stored
        bsf     edi, edi
        pop     ebx
        shr     edi, 2                          ; bit index / 4 = byte index within the 8 bytes
        lea     eax, [edx+edi-8]                ; eax = strlen without the 0
        pop     edi
        emms
        ret
@@:
        ; terminator in the low 8 bytes of the block just stored
        bsf     ebx, ebx
        shr     ebx, 2                          ; bit index / 4 = byte index within the 8 bytes
        lea     eax, [edx+ebx-16]               ; eax = strlen without the 0
        pop     ebx
        pop     edi
        emms
        ret
Rui-
The results of your code on an AMD--
RepCopy 53 cycles
LopCopy 63 cycles
RepCopy 54 cycles
LopCopy 61 cycles
LopCopy2 102 cycles
Press any key to exit...
Hi Jimg,
That is curious! I am running it on a 3010 MHz Pentium with XP SP2!
Thank you
Rui
Rui,
I'm sure you can have better results with RtlMoveMemory (rep movsd / rep movsb) :wink
counter_begin 1000000, HIGH_PRIORITY_CLASS
invoke RtlMoveMemory, addr _Str2, addr _Str1, 20
counter_end
print ustr$(eax)," cycles",13,10
Quote from: jdoe on January 18, 2008, 06:47:23 PM
I'm sure you can have better results with RtlMoveMemory (rep movsd / rep movsb) :wink
Hi jdoe,
Yes, It gives 80 - 88 cycles. But whats RtlMoveMemory ?
Rui
Quote from: lingo on January 18, 2008, 02:16:07 PM
NightWare:
- If our input and output buffers are (big enough+16 bytes) we can omit
the slower part of strlen in your code
- I wondering why you preserve ecx and edx registers rather than esi and edi... :wink
lingo,
yep, but you need to check the length in this case...
and concerning push ecx/edx, it's my own calling convention (all registers have to be preserved). Besides, I don't see why I should preserve ebx, esi or edi: if Microsoft says those registers are preserved by the system, it sounds to me as if I only have to take care of the others (unless someone has heard otherwise... :wink).
for me, the only reason to preserve ebx, esi and edi is when you code a library, for OS compatibility...
and that's not my case :toothy
EDIT :
lingo,
fast algo... :U
rui,
it's one of the functions from ntdll.dll (this library is the base of the win32 os... )
here is my MMX version of memmove, syntax :
mov eax,Size ; eax = number of bytes to move
mov esi,OFFSET Source ; esi = source address
mov edi,OFFSET Destination ; edi = destination address
call Mmx_MemMove
the corresponding proc :
;-----------------------------------------------------------------------
; Mmx_MemMove - copy a block of memory; source and destination may overlap.
;   In:    eax = size in bytes, esi = source, edi = destination
;   Out:   eax, esi, edi unchanged
;   Saved: ecx, edx (push/pop)
;   Uses:  MM0-MM7 (only when size >= 16), flags
; The size is split by its bits into 64-byte chunks, then 16-, 4- and
; 1-byte pieces. When the destination starts inside the source range the
; same pieces are copied starting from the end of the block instead.
; EMMS is executed before returning whenever MMX registers were used, so
; the caller can run x87 code afterwards; copies shorter than 16 bytes
; never touch MMX state and skip it.
;-----------------------------------------------------------------------
ALIGN 16
Mmx_MemMove PROC
        push    ecx
        push    edx
        cmp     esi, edi
        jae     mm_forward                      ; source at/above destination: ascending copy is safe
        mov     ecx, edi
        sub     ecx, esi                        ; ecx = distance from source up to destination
        cmp     eax, ecx
        ja      mm_backward                     ; size > distance: destination lies inside the source
; ======================= ascending copy ================================
mm_forward:
        mov     ecx, eax
        and     ecx, 0FFFFFFC0h                 ; bytes covered by whole 64-byte chunks
        jz      mm_fwd16_setup
        add     esi, ecx                        ; point past the area, then index with ecx = -bytes .. 0
        add     edi, ecx
        neg     ecx
        nop                                     ; padding kept from the original (loop placement)
        nop
        nop
        nop
mm_fwd64:
        movq    MM0, QWORD PTR [esi+ecx]
        movq    MM1, QWORD PTR [esi+ecx+8]
        movq    MM2, QWORD PTR [esi+ecx+16]
        movq    MM3, QWORD PTR [esi+ecx+24]
        movq    MM4, QWORD PTR [esi+ecx+32]
        movq    MM5, QWORD PTR [esi+ecx+40]
        movq    MM6, QWORD PTR [esi+ecx+48]
        movq    MM7, QWORD PTR [esi+ecx+56]
        movq    QWORD PTR [edi+ecx], MM0
        movq    QWORD PTR [edi+ecx+8], MM1
        movq    QWORD PTR [edi+ecx+16], MM2
        movq    QWORD PTR [edi+ecx+24], MM3
        movq    QWORD PTR [edi+ecx+32], MM4
        movq    QWORD PTR [edi+ecx+40], MM5
        movq    QWORD PTR [edi+ecx+48], MM6
        movq    QWORD PTR [edi+ecx+56], MM7
        add     ecx, 64
        jnz     mm_fwd64
mm_fwd16_setup:
        mov     ecx, eax
        and     ecx, 30h                        ; remaining 16-byte pieces (0..3 of them)
        jz      mm_fwd4_setup
        add     esi, ecx
        add     edi, ecx
        neg     ecx
mm_fwd16:
        movq    MM0, QWORD PTR [esi+ecx]
        movq    MM1, QWORD PTR [esi+ecx+8]
        movq    QWORD PTR [edi+ecx], MM0
        movq    QWORD PTR [edi+ecx+8], MM1
        add     ecx, 16
        jnz     mm_fwd16
mm_fwd4_setup:
        mov     ecx, eax
        and     ecx, 0Ch                        ; remaining dwords (0..3 of them)
        jz      mm_fwd1_setup
        add     esi, ecx
        add     edi, ecx
        neg     ecx
mm_fwd4:
        mov     edx, DWORD PTR [esi+ecx]
        mov     DWORD PTR [edi+ecx], edx
        add     ecx, 4
        jnz     mm_fwd4
mm_fwd1_setup:
        mov     ecx, eax
        and     ecx, 3                          ; remaining bytes (0..3)
        jz      mm_fwd_done
        add     esi, ecx
        add     edi, ecx
        neg     ecx
mm_fwd1:
        mov     dl, BYTE PTR [esi+ecx]
        mov     BYTE PTR [edi+ecx], dl
        inc     ecx
        jnz     mm_fwd1
mm_fwd_done:
        sub     esi, eax                        ; pointers advanced by exactly eax: restore them
        sub     edi, eax
        test    eax, 0FFFFFFF0h                 ; size >= 16 <=> an MMX loop ran
        jz      mm_fwd_exit
        emms
mm_fwd_exit:
        pop     edx
        pop     ecx
        ret
; ======================= descending copy ===============================
ALIGN 16
mm_backward:
        add     esi, eax                        ; start from the end of both blocks
        add     edi, eax
        mov     ecx, eax
        and     ecx, 3                          ; trailing bytes first
        jz      mm_bwd4_setup
        sub     esi, ecx
        sub     edi, ecx
mm_bwd1:
        dec     ecx
        mov     dl, BYTE PTR [esi+ecx]
        mov     BYTE PTR [edi+ecx], dl
        jnz     mm_bwd1                         ; ZF is still from DEC (moves do not change flags)
mm_bwd4_setup:
        mov     ecx, eax
        and     ecx, 0Ch
        jz      mm_bwd16_setup
        sub     esi, ecx
        sub     edi, ecx
mm_bwd4:
        sub     ecx, 4
        mov     edx, DWORD PTR [esi+ecx]
        mov     DWORD PTR [edi+ecx], edx
        jnz     mm_bwd4                         ; ZF is still from SUB
mm_bwd16_setup:
        mov     ecx, eax
        and     ecx, 30h
        jz      mm_bwd64_setup
        sub     esi, ecx
        sub     edi, ecx
mm_bwd16:
        sub     ecx, 16
        movq    MM0, QWORD PTR [esi+ecx+8]      ; load the whole piece before storing any of it
        movq    MM1, QWORD PTR [esi+ecx]
        movq    QWORD PTR [edi+ecx+8], MM0
        movq    QWORD PTR [edi+ecx], MM1
        jnz     mm_bwd16                        ; ZF is still from SUB (MOVQ does not change flags)
mm_bwd64_setup:
        mov     ecx, eax
        and     ecx, 0FFFFFFC0h
        jz      mm_bwd_done
        sub     esi, ecx
        sub     edi, ecx
mm_bwd64:
        sub     ecx, 64
        movq    MM0, QWORD PTR [esi+ecx+56]
        movq    MM1, QWORD PTR [esi+ecx+48]
        movq    MM2, QWORD PTR [esi+ecx+40]
        movq    MM3, QWORD PTR [esi+ecx+32]
        movq    MM4, QWORD PTR [esi+ecx+24]
        movq    MM5, QWORD PTR [esi+ecx+16]
        movq    MM6, QWORD PTR [esi+ecx+8]
        movq    MM7, QWORD PTR [esi+ecx]
        movq    QWORD PTR [edi+ecx+56], MM0
        movq    QWORD PTR [edi+ecx+48], MM1
        movq    QWORD PTR [edi+ecx+40], MM2
        movq    QWORD PTR [edi+ecx+32], MM3
        movq    QWORD PTR [edi+ecx+24], MM4
        movq    QWORD PTR [edi+ecx+16], MM5
        movq    QWORD PTR [edi+ecx+8], MM6
        movq    QWORD PTR [edi+ecx], MM7
        jnz     mm_bwd64                        ; ZF is still from SUB
mm_bwd_done:
        test    eax, 0FFFFFFF0h                 ; esi/edi are back at their entry values here
        jz      mm_bwd_exit
        emms
mm_bwd_exit:
        pop     edx
        pop     ecx
        ret
Mmx_MemMove ENDP
NightWare,
I tried to compile it and this was the result:
Microsoft (R) Macro Assembler Version 6.14.8444
Copyright (C) Microsoft Corp 1981-1997. All rights reserved.
Assembling: C:\MASM32\RuiTestes\Test323.asm
C:\MASM32\RuiTestes\Test323.asm(41) : error A2085: instruction or register not ac
cepted in current CPU mode
C:\MASM32\RuiTestes\Test323.asm(42) : error A2085: instruction or register not ac
cepted in current CPU mode
........................................
C:\MASM32\RuiTestes\Test323.asm(58) : error A2006: undefined symbol : Label01
C:\MASM32\RuiTestes\Test323.asm(71) : error A2006: undefined symbol : Label03
_
Assembly Error
Prima qualquer tecla para continuar . . .
Whats wrong ?
Rui
Rui,
Before using MMX instructions, the assembler must know that you will.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
include \masm32\include\masm32rt.inc
.686
.MMX
include \masm32\macros\timers.asm
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
Thanks jdoe, it works
It takes only 23 cycles! Good.
;------------------------------------------------------------------
; Time Mmx_MemMove copying the 20 bytes of _Str1 to _Str2 (register arguments).
counter_begin 1000000, HIGH_PRIORITY_CLASS
mov eax, 20 ; eax = byte count
mov esi, offset _Str1 ; esi = source
mov edi, offset _Str2 ; edi = destination
call Mmx_MemMove ; 23 cycles
counter_end
print ustr$(eax)," cycles",13,10
Rui
rui,
if you want to run a speed test, you must compare equivalent code (here there is no alignment)... add "ALIGN 16" before each proc, and you will obtain more representative results :P
Quote from: NightWare on January 20, 2008, 12:47:24 AM
rui,
if you want to make speed test, you must compare equivalent code (here there is no alignment)... add "ALIGN 16" before each procs, and you will obtain more representative results :P
Moreover, adding a few nops before a loop can help to gain a few CPU clocks. Some CPUs are more sensitive to alignment than others but, as NightWare said, a procedure should always start with "align 16".
:U
Thanks NightWare and jdoe.
I will put align 16 before each proc in my whole set of procs.
Now I have a problem with menus: while the program is running I cannot access the menu, and the program stops running. How can I solve this problem? Does anyone know?
Thanks
Rui
Quote from: RuiLoureiro on January 21, 2008, 01:34:52 PM
Now i have one problem with menus. When a program is running i cannot access the menu. The program stops to run. how to solve this problem ? Anyone know ?
Rui,
What do you mean by "when the program is running"? Are only the menus frozen, or all the controls?
I could be wrong about your problem, but when you start a task that takes time to execute, it is normal for a form to freeze if you don't use a new thread.
Quote from: jdoe on January 21, 2008, 03:20:49 PM
... when you start a task that takes times to execute, it is normal that a form freeze if you don't use a new thread.
Hi jdoe, thank you
Yes, when it is executing a task. So the solution is to use a new thread, right?
Rui
Rui,
You have a DialogProc/WindowProc for your dialog/window ?
Suppose BTN1_ID is the OK button and BTN2_ID the Cancel button
ThreadProc PROTO :DWORD
; DialogProc - dialog procedure that hands the long-running work to a worker
; thread so the dialog keeps processing messages.
;   p_hWndDlg = dialog handle, p_uMsg = message, p_wParam / p_lParam = message data
;   Returns TRUE in eax for the WM_COMMAND / WM_CLOSE cases handled here,
;   FALSE for everything else (including WM_INITDIALOG, as before).
;   BTN1_ID starts ThreadProc with the dialog handle as its parameter;
;   BTN2_ID and WM_CLOSE end the dialog.
DialogProc PROC p_hWndDlg:DWORD, p_uMsg:DWORD, p_wParam:DWORD, p_lParam:DWORD

        mov     eax, p_uMsg

        .if (eax == WM_INITDIALOG)
            ; nothing to initialise; falls through to the FALSE return below

        .elseif (eax == WM_COMMAND)
            movzx   eax, word ptr p_wParam      ; LOWORD(wParam) = control ID; HIWORD is the notification code
            .if (eax == BTN1_ID)
                invoke  CreateThread, NULL, 0, addr ThreadProc, p_hWndDlg, 0, addr dwDummy
                .if (eax != NULL)               ; only close a handle that was actually created
                    invoke  CloseHandle, eax    ; the thread keeps running; only our handle is released
                .endif
                mov     eax, TRUE
                ret
            .elseif (eax == BTN2_ID)
                invoke  EndDialog, p_hWndDlg, NULL
                mov     eax, TRUE
                ret
            .endif

        .elseif (eax == WM_CLOSE)
            invoke  EndDialog, p_hWndDlg, NULL
            mov     eax, TRUE
            ret

        .endif

        xor     eax, eax                        ; FALSE: message not handled here
        ret

DialogProc ENDP
; ThreadProc - worker thread entry point (placeholder body).
;   lpParameter = the value given as lpParameter to CreateThread
;                 (DialogProc passes the dialog handle)
;   Returns the thread exit code in eax.
ThreadProc PROC lpParameter:DWORD
        ; the long-running work goes here
        xor     eax, eax                        ; exit code 0 rather than whatever was left in eax
        ret
ThreadProc ENDP
Now, the thread proc can take the time it needs, the dialog won't freeze.
Thank you, jdoe
i am going to try.
Rui