News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Loop Backward

Started by RuiLoureiro, January 17, 2008, 02:41:40 PM

Previous topic - Next topic

RuiLoureiro

Hi all,
        I wrote the following proc, but the loop doesn't jump backward,
        so it doesn't work. Why?

                        ; Intended: compare 10 bytes of [esi] and [edi]; return CF=1 at the first mismatch, CF=0 if all match.
                        mov     ecx, 10
                        xor      edx, edx
@@:                   movzx   eax, byte ptr [esi + edx]
                        movzx   ebx, byte ptr [edi + edx]
                        inc       edx
                        cmp      al, bl
                        je       @F                        ; bytes match: continue at the LOOP line below
                        stc
                        ret
                        ;
@@:                  loop    short @B                      ; @B is the nearest preceding @@, i.e. this very line: LOOP spins here until ECX = 0 and never returns to the MOVZX above
                         clc
                         ret


But this works. What's wrong with the backward loop?

                        ; Compare 10 bytes of [esi] and [edi]; return CF=1 at the first mismatch, CF=0 if all match.
                        mov     ecx, 10
                        xor      edx, edx
_next:               movzx   eax, byte ptr [esi + edx]
                        movzx   ebx, byte ptr [edi + edx]
                        inc       edx
                        cmp      al, bl
                        je       @F                        ; bytes match: continue at the LOOP line below
                        ;
                        stc
                        ret
                        ;
@@:                  loop    short _next                   ; a named label is not shadowed by the @@ on this line, so LOOP really goes back to the top
                         clc
                         ret

Rui


asmfan

Quote from: RuiLoureiro on January 17, 2008, 02:41:40 PM

@@:                  loop    short @B

It is self-explanatory - it works as it should.
Russia is a weird place

Sarel

I have wasted hours on simple things like these. You intend to do something and it does not happen.

@@:                  loop    short @B ; You are going nowhere but back to the same line.


:bg

Tight_Coder_Ex

You can also simplify your algo by

; Compare the bytes at [esi] and [edi].
; In:  ECX = byte count (must be non-zero), EDX = 0, as set up by the original routine.
; Out: CF = 0 if every byte matched, CF = 1 at the first mismatch.
@@:     movzx eax, byte ptr [edx + esi]
        movzx ebx, byte ptr [edx + edi] ; load into EBX: loading into EDX would destroy the index and leave BL stale
        inc edx
        cmp al, bl
        loope @B                        ; LOOPE, not LOOPNZ: repeat while the bytes are equal and ECX != 0

        je @F                           ; LOOPE does not modify flags, so ZF still comes from the last CMP
        stc                             ; left the loop on a mismatch
        ret
@@:     clc                             ; all ECX bytes matched
        ret


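Qualifying LOOP with a condition makes it also test ZF from the preceding comparison (LOOPE/LOOPZ keeps looping while equal, LOOPNE/LOOPNZ while not equal); when that condition fails, or when ECX is exhausted, execution falls out of the loop.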

MichaelW

Rui,

To MASM all of these codings look the same:

@@: loop short @B

@@:
    loop short @B

@@:

    loop short @B


The loop will run, but all it will do is execute the loop instruction ECX times, and then fall through to the next instruction.
eschew obfuscation

RuiLoureiro

Hi all,
         Thank you for your answers.

MichaelW,
               Yes, it is the same! I was expecting it to jump backward to the earlier label, not to the same line. But it doesn't.
               Sometimes we waste time on questions like this, and for that reason I asked the question here.

Tight_Coder_Ex,
                        Well spotted! Thank you. Do you know of better procs for copying strings than the following?

; Copies ECX bytes from [esi] to [edi], last byte first. ECX must be non-zero on entry.
@@:           dec           ecx                                ; sets ZF when the index reaches 0
                  movzx        eax, byte ptr [esi + ecx]
                  mov           byte ptr [edi + ecx], al
                  jnz             @B                             ; still tests ZF from DEC: MOVZX and MOV do not modify flags

Thanks
Rui

Tight_Coder_Ex

Quote from: RuiLoureiro on January 17, 2008, 08:56:32 PM
Do you know better procs to copy strings other than

@@:           dec           ecx   
                  movzx        eax, byte ptr [esi + ecx]
                  mov           byte ptr [edi + ecx], al
                  jnz             @B

Thanks
Rui


Yes, EDI & ESI are designed just for that purpose, maybe not exclusively.

        lea esi, Source
        lea edi, Destination
        mov ecx, BlockSize

; Your code is replaced by this single statement:
; REP MOVSB copies ECX bytes from [esi] to [edi], ascending when DF = 0.
; NOTE(review): there is no CLD here, so this assumes the direction flag is already clear.

        rep movsb



NightWare

Quote from: RuiLoureiro on January 17, 2008, 08:56:32 PM
Do you know better procs to copy strings
if you are not afraid to use mmx, the syntax :
mov esi,OFFSET SourceString
mov edi,OFFSET DestinationString
call Mmx_StrCopy ; returns the length of the string in eax; best results if DestinationString is aligned...

the corresponding proc :
; ---------------------------------------------------------------------------
; Mmx_StrCopy - copy a zero-terminated string with MMX registers
;               (PMOVMSKB on MM registers needs the SSE / MMX-extension set)
; In:    esi = source string, edi = destination buffer
; Out:   eax = length of the string, terminator not counted
; Keeps: ecx, edx (saved and restored); esi and edi are never modified
; Clobb: flags, MM0-MM3 (the MMX state is released with EMMS before returning)
; Notes: the source is read 16 bytes at a time, so up to 15 bytes beyond the
;        terminator may be read. The destination receives exactly the string
;        plus its terminator.
; ---------------------------------------------------------------------------
Mmx_StrCopy PROC
        push    ecx
        push    edx

        mov     eax,-16                         ; biased so the first "add eax,16" yields offset 0
        pxor    MM2,MM2                         ; MM2/MM3 = 0: they stay zero while no zero byte is seen,
        pxor    MM3,MM3                         ; so they also serve as the comparison constant
        jmp     Label2
        nop                                     ; padding, never executed (presumably aligns Label1)
        nop
        nop
        nop
        nop

; ---- main loop: 16 bytes per iteration until a chunk contains a zero byte ----
Label1: movq    QWORD PTR [edi+eax],MM0         ; previous chunk had no terminator: store it
        movq    QWORD PTR [edi+eax+8],MM1
Label2: add     eax,16
        movq    MM0,QWORD PTR [esi+eax]
        movq    MM1,QWORD PTR [esi+eax+8]
        pcmpeqb MM2,MM0                         ; bytes equal to 0 become 0FFh
        pcmpeqb MM3,MM1
        por     MM2,MM3
        pmovmskb edx,MM2                        ; one mask bit per byte
        test    edx,edx
        jz      Label1                          ; no zero byte in these 16 bytes
        jmp     Label4

; ---- the terminator is in this chunk: continue dword by dword ----
Label3: add     edx,01010101h                   ; undo the subtraction: edx = original dword
        mov     DWORD PTR [edi+eax],edx
        add     eax,4
Label4: mov     ecx,DWORD PTR [esi+eax]
        lea     edx,[ecx-01010101h]             ; zero-byte test: (x - 01010101h) AND NOT x AND 80808080h
        xor     ecx,edx
        and     ecx,80808080h
        jz      Label3
        and     ecx,edx
        jz      Label3                          ; no zero byte in this dword

; ---- final dword: store byte by byte up to and including the terminator ----
        add     edx,01010101h                   ; edx = original dword
        mov     [edi+eax],dl
        test    dl,dl
        jz      Label7                          ; terminator is byte 0: length = eax
        mov     [edi+eax+1],dh
        test    dh,dh
        jz      Label6                          ; byte 1: length = eax + 1
        shr     edx,16
        mov     [edi+eax+2],dl
        test    dl,dl
        jz      Label5                          ; byte 2: length = eax + 2
        mov     [edi+eax+3],dh                  ; byte 3 must be the terminator: length = eax + 3
        inc     eax
Label5: inc     eax
Label6: inc     eax
Label7:
        emms                                    ; leave MMX state so later x87 code works
        pop     edx
        pop     ecx
        ret
Mmx_StrCopy ENDP

RuiLoureiro

Hi NightWare,
                   Thank you !

Hi Tight_Coder_Ex,


; Your code is preplaced by this single statement
        rep movsb


But rep movsb doesn't look like a good solution.
I used MichaelW's timing macros, and rep movsb takes 284-285 cycles versus 97-101 cycles for my loop to copy 20 bytes.
Here is the code:

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
    include \masm32\include\masm32rt.inc
    .686
    include \masm32\macros\timers.asm
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
    .data
           db 20                                ; length byte stored directly in front of the text (the benchmarks read it via [esi - 1])
_Str1      db "Tudo igual ao litro "            ; 20 characters, no zero terminator

           db 20                                ; length byte for _Str2
_Str2      db "Tudo igual ao litro "            ; destination buffer of every copy benchmark
                 
;+++++++++++++++++++++++++++++++++++++++++++++++     
    .code
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; RepCopy - copy Len bytes from Fnt to Dst with an ascending REP MOVSB.
;   Fnt = source address, Dst = destination address, Len = byte count.
;   ESI and EDI are saved and restored through USES; ECX is consumed.
RepCopy     proc    uses esi edi Fnt:DWORD, Dst:DWORD, Len:DWORD
            mov     ecx, Len                    ; byte count for REP
            mov     edi, Dst
            mov     esi, Fnt
            cld                                 ; ascending addresses
            rep     movsb
            ret
RepCopy     endp
;......................................................
; LopCopy - copy Len bytes from Fnt to Dst, last byte first, with DEC/JNZ.
;   Fnt = source address, Dst = destination address, Len = byte count.
;   Len = 0 copies nothing. EAX and ECX are clobbered; ESI/EDI are preserved.
LopCopy     proc    Fnt:DWORD, Dst:DWORD, Len:DWORD
            push    esi
            push    edi
            ;
            mov     esi, Fnt
            mov     edi, Dst
            mov     ecx, Len
            test    ecx, ecx
            jz      LopCopy_done                ; guard: DEC of 0 would wrap to 0FFFFFFFFh and copy ~4 GB
            ;
@@:         dec     ecx                         ; ECX = index of the byte to copy; ZF set on the last one
            movzx   eax, byte ptr [esi + ecx]
            mov     byte ptr [edi + ecx], al
            jnz     @B                          ; ZF still from DEC (MOVZX/MOV leave flags alone)
            ;
LopCopy_done:
            pop     edi
            pop     esi
            ret
LopCopy     endp
;......................................................
; LopCopy2 - copy Len bytes from Fnt to Dst, last byte first, with LOOP.
;   Fnt = source address, Dst = destination address, Len = byte count.
;   Len = 0 copies nothing. EAX and ECX are clobbered; ESI/EDI are preserved.
LopCopy2    proc    Fnt:DWORD, Dst:DWORD, Len:DWORD
            push    esi
            push    edi
            ;
            mov     esi, Fnt
            mov     edi, Dst
            mov     ecx, Len
            jecxz   LopCopy2_done               ; guard: with ECX = 0, LOOP would wrap and run 2^32 times
            ;
@@:         movzx   eax, byte ptr [esi + ecx-1] ; ECX counts from Len down to 1, so the index is ECX - 1
            mov     byte ptr [edi + ecx-1], al
            loop    @B
            ;
LopCopy2_done:
            pop     edi
            pop     esi
            ret
LopCopy2    endp
; ««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; Benchmark driver: times RepCopy, LopCopy and LopCopy2 copying _Str1 to _Str2
; and prints one cycle count per test. The trailing "nnn cycles" comments are
; the figures the author measured on his machine.
; NOTE(review): everything between counter_begin and counter_end is timed, so
; each figure includes the invoke overhead (and the length load in tests 3-5).
    invoke Sleep, 4000                          ; pause 4000 ms before the timing starts
; ##############################################################################################
; Test 1: RepCopy with an immediate length of 20.
; counter_begin/counter_end come from timers.asm; presumably 1000000 is the
; iteration count - confirm against the macro file.
    counter_begin 1000000, HIGH_PRIORITY_CLASS

      invoke    RepCopy, offset _Str1, offset _Str2, 20             ;285 - 284 cycles

    counter_end
    print ustr$(eax)," cycles",13,10
;------------------------------------------------------------------
; Test 2: LopCopy with an immediate length of 20.
    counter_begin 1000000, HIGH_PRIORITY_CLASS

      invoke    LopCopy, offset _Str1, offset _Str2, 20             ; 97 - 101 cycles

    counter_end
    print ustr$(eax)," cycles",13,10
;++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
; Test 3: RepCopy, length taken from the byte stored in front of _Str1.
    counter_begin 1000000, HIGH_PRIORITY_CLASS

      mov       esi, offset _Str1
      movzx     eax, byte ptr [esi - 1]
      invoke    RepCopy, esi, offset _Str2, eax                     ; 283 - 284 cycles

    counter_end
    print ustr$(eax)," cycles",13,10
;------------------------------------------------------------------
; Test 4: LopCopy, length taken from the byte stored in front of _Str1.
    counter_begin 1000000, HIGH_PRIORITY_CLASS

      mov       esi, offset _Str1
      movzx     eax, byte ptr [esi - 1]
      invoke    LopCopy, esi, offset _Str2, eax                     ; 98 - 101 cycles

    counter_end
    print ustr$(eax)," cycles",13,10
;------------------------------------------------------------------
; Test 5: LopCopy2 (LOOP-based), length taken from the byte in front of _Str1.
    counter_begin 1000000, HIGH_PRIORITY_CLASS

      mov       esi, offset _Str1
      movzx     eax, byte ptr [esi - 1]
      invoke    LopCopy2, esi, offset _Str2, eax                    ; 140 - 141 cycles

    counter_end
    print ustr$(eax)," cycles",13,10

    inkey "Press any key to exit..."
    exit
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start


Rui

lingo

NightWare:
- If our input and output buffers are big enough (with 16 extra bytes), we can omit
the slower strlen part of your code
- I'm wondering why you preserve the ecx and edx registers rather than esi and edi... :wink
        push edi ; On entry: ecx -> address of source buffer
xor edx, edx ;           eax -> address of target buffer; edx = running byte offset
push ebx ;
pxor MM1, MM1 ; MM1 = 0, constant for the zero-byte compare
movq MM0, [ecx] ; preload the first 16 source bytes
movq MM2, [ecx+8] ;
@@: ; main loop: 16 bytes per iteration
movq [eax+edx], MM0 ; store the chunk BEFORE testing it, so up to 15 bytes past the terminator are written
movq [eax+edx+8], MM2 ;
add edx, 16 ; edx = offset of the next chunk
pcmpeqb MM0, MM1 ; zero bytes -> 0FFh, others -> 00h
pcmpeqb MM2, MM1 ;
packsswb MM0, MM0         ; signed-saturate each word to a byte: 00FFh -> 7Fh, FF00h -> 80h, FFFFh -> FFh
packsswb MM2, MM2           ; so every packed byte describes two source bytes
movd ebx, MM0 ; ebx = packed flags for bytes 0..7 of the chunk
movd edi, MM2 ; edi = packed flags for bytes 8..15 of the chunk
movq MM0, [ecx+edx] ; preload the next 16 source bytes before testing (reads beyond the terminator)
movq MM2, [ecx+edx+8] ;
test ebx, ebx ;
jne @f ; terminator is in the first 8 bytes
test edi, edi ;
je @b ; no terminator in this chunk: keep copying
bsf edi, edi ; lowest set bit: 8k for an even source byte 2k, 8k+7 for an odd byte 2k+1
pop ebx ;
shr edi, 2 ; bit index / 4 = byte index within the 8-byte half
lea eax, [edx+edi-8] ; eax = string length, terminator not counted (edx already points 16 past the chunk)
pop edi ;
ret                                 ; NOTE(review): returns without EMMS, so the MMX state stays active
@@: ; terminator found in the first 8 bytes of the chunk
bsf ebx, ebx ;
shr ebx, 2 ; bit index / 4 = byte index within the chunk
lea eax, [edx+ebx-16] ; eax = string length, terminator not counted
pop ebx ;
pop edi ;
ret ; NOTE(review): returns without EMMS, so the MMX state stays active


Jimg

Rui-
The results of your code on an AMD--
RepCopy    53 cycles
LopCopy    63 cycles
RepCopy    54 cycles
LopCopy    61 cycles
LopCopy2   102 cycles
Press any key to exit...

RuiLoureiro

Hi Jimg,
           That's curious! I am running it on a 3010 MHz Pentium with XP SP2!

Thank you
Rui


jdoe

Rui,

I'm sure you can have better results with RtlMoveMemory (rep movsd / rep movsb)   :wink



    ; Same timing frame as the other tests, with the system routine doing the copy.
    counter_begin 1000000, HIGH_PRIORITY_CLASS

      ; RtlMoveMemory takes (Destination, Source, Length): copies 20 bytes from _Str1 to _Str2
      invoke RtlMoveMemory, addr _Str2, addr _Str1, 20

    counter_end
    print ustr$(eax)," cycles",13,10


RuiLoureiro

Quote from: jdoe on January 18, 2008, 06:47:23 PM
I'm sure you can have better results with RtlMoveMemory (rep movsd / rep movsb)   :wink

Hi  jdoe,
            Yes, it gives 80 - 88 cycles. But what is RtlMoveMemory?
Rui

NightWare

#14
Quote from: lingo on January 18, 2008, 02:16:07 PM
NightWare:
- If our input and output buffers are (big enough+16 bytes) we can omit
the slower part of strlen in your code
- I wondering why you preserve ecx and edx registers rather than esi and edi... :wink
lingo,
yep, but you need to check the length in this case...

As for push ecx/edx: it's my own calling convention (all registers have to be preserved). Besides, I don't see why I should preserve ebx, esi or edi. If Microsoft says those registers are preserved by the system, it sounds to me as if I only have to take care of the others (unless someone has heard otherwise...  :wink).
For me, the only reason to preserve ebx, esi and edi is when you are writing a library, for OS compatibility...
and that's not my case  :toothy

EDIT :
lingo,
fast algo...  :U

rui,
it's one of the functions from ntdll.dll (this library is the base of win32 os... )

Here is my MMX version of memmove (it uses the 64-bit MM registers, not XMM). Calling syntax:
mov eax,Size ; eax = number of bytes to move
mov esi,OFFSET Source ; esi = source address
mov edi,OFFSET Destination ; edi = destination address
call Mmx_MemMove


the corresponding proc :
ALIGN 16
; ---------------------------------------------------------------------------
; Mmx_MemMove - overlap-safe block move using MMX registers
; In:    eax = byte count, esi = source address, edi = destination address
; Out:   eax, ecx, edx, esi, edi unchanged (ecx/edx are saved and restored;
;        esi/edi are stepped during the copy and end at their entry values)
; Clobb: flags, MM0-MM7 (the MMX state is released with EMMS before returning)
; Notes: copies ascending unless the destination starts inside the source
;        block, in which case it copies descending. The byte count is split
;        by its bits into 64-byte, 16-byte, 4-byte and 1-byte passes.
;        A count of 0 copies nothing.
; ---------------------------------------------------------------------------
Mmx_MemMove PROC
        push    ecx
        push    edx

        cmp     esi,edi
        jae     Label00                         ; source at or above destination: ascending is safe
        mov     ecx,edi
        sub     ecx,esi                         ; ecx = distance from source up to destination
        cmp     eax,ecx
        ja      Label09                         ; count > distance: blocks overlap, copy descending

; ---- ascending copy -------------------------------------------------------
Label00: mov    ecx,eax
        and     ecx,0FFFFFFC0h                  ; bytes covered by whole 64-byte blocks
        jz      Label02
        add     esi,ecx                         ; point past the pass and index with a negative
        add     edi,ecx                         ; offset that counts up to zero
        neg     ecx
        nop                                     ; padding in front of the hot loop
        nop
        nop
        nop
Label01: movq   MM0,QWORD PTR [esi+ecx]         ; load all 64 bytes before storing any
        movq    MM1,QWORD PTR [esi+ecx+8]
        movq    MM2,QWORD PTR [esi+ecx+16]
        movq    MM3,QWORD PTR [esi+ecx+24]
        movq    MM4,QWORD PTR [esi+ecx+32]
        movq    MM5,QWORD PTR [esi+ecx+40]
        movq    MM6,QWORD PTR [esi+ecx+48]
        movq    MM7,QWORD PTR [esi+ecx+56]
        movq    QWORD PTR [edi+ecx],MM0
        movq    QWORD PTR [edi+ecx+8],MM1
        movq    QWORD PTR [edi+ecx+16],MM2
        movq    QWORD PTR [edi+ecx+24],MM3
        movq    QWORD PTR [edi+ecx+32],MM4
        movq    QWORD PTR [edi+ecx+40],MM5
        movq    QWORD PTR [edi+ecx+48],MM6
        movq    QWORD PTR [edi+ecx+56],MM7
        add     ecx,64
        jnz     Label01

Label02: mov    ecx,eax
        and     ecx,00000030h                   ; 0-3 remaining 16-byte blocks
        jz      Label04
        add     esi,ecx
        add     edi,ecx
        neg     ecx
Label03: movq   MM0,QWORD PTR [esi+ecx]
        movq    MM1,QWORD PTR [esi+ecx+8]
        movq    QWORD PTR [edi+ecx],MM0
        movq    QWORD PTR [edi+ecx+8],MM1
        add     ecx,16
        jnz     Label03

Label04: mov    ecx,eax
        and     ecx,0000000Ch                   ; 0-3 remaining dwords
        jz      Label06
        add     esi,ecx
        add     edi,ecx
        neg     ecx
Label05: mov    edx,DWORD PTR [esi+ecx]
        mov     DWORD PTR [edi+ecx],edx
        add     ecx,4
        jnz     Label05

Label06: mov    ecx,eax
        and     ecx,00000003h                   ; 0-3 remaining bytes
        jz      Label08
        add     esi,ecx
        add     edi,ecx
        neg     ecx
Label07: mov    dl,BYTE PTR [esi+ecx]
        mov     BYTE PTR [edi+ecx],dl
        inc     ecx
        jnz     Label07
Label08: sub    esi,eax                         ; the passes advanced esi/edi by eax in total:
        sub     edi,eax                         ; restore the caller's pointers

        emms                                    ; leave MMX state so later x87 code works
        pop     edx
        pop     ecx
        ret

; ---- descending copy (destination overlaps the end of the source) ---------
ALIGN 16
Label09: add    esi,eax                         ; start just past the end of both blocks
        add     edi,eax
        mov     ecx,eax
        and     ecx,00000003h                   ; trailing bytes first
        jz      Label11
        sub     esi,ecx
        sub     edi,ecx
Label10: dec    ecx                             ; ZF from DEC/SUB survives the MOVs below
        mov     dl,BYTE PTR [esi+ecx]
        mov     BYTE PTR [edi+ecx],dl
        jnz     Label10

Label11: mov    ecx,eax
        and     ecx,0000000Ch                   ; then 0-3 dwords
        jz      Label13
        sub     esi,ecx
        sub     edi,ecx
Label12: sub    ecx,4
        mov     edx,DWORD PTR [esi+ecx]
        mov     DWORD PTR [edi+ecx],edx
        jnz     Label12

Label13: mov    ecx,eax
        and     ecx,00000030h                   ; then 0-3 blocks of 16 bytes
        jz      Label15
        sub     esi,ecx
        sub     edi,ecx
Label14: sub    ecx,16
        movq    MM0,QWORD PTR [esi+ecx+8]
        movq    MM1,QWORD PTR [esi+ecx]
        movq    QWORD PTR [edi+ecx+8],MM0
        movq    QWORD PTR [edi+ecx],MM1
        jnz     Label14

Label15: mov    ecx,eax
        and     ecx,0FFFFFFC0h                  ; finally the whole 64-byte blocks
        jz      Label17
        sub     esi,ecx                         ; esi/edi are back at their entry values from here on
        sub     edi,ecx

Label16: sub    ecx,64
        movq    MM0,QWORD PTR [esi+ecx+56]      ; load all 64 bytes before storing any
        movq    MM1,QWORD PTR [esi+ecx+48]
        movq    MM2,QWORD PTR [esi+ecx+40]
        movq    MM3,QWORD PTR [esi+ecx+32]
        movq    MM4,QWORD PTR [esi+ecx+24]
        movq    MM5,QWORD PTR [esi+ecx+16]
        movq    MM6,QWORD PTR [esi+ecx+8]
        movq    MM7,QWORD PTR [esi+ecx]
        movq    QWORD PTR [edi+ecx+56],MM0
        movq    QWORD PTR [edi+ecx+48],MM1
        movq    QWORD PTR [edi+ecx+40],MM2
        movq    QWORD PTR [edi+ecx+32],MM3
        movq    QWORD PTR [edi+ecx+24],MM4
        movq    QWORD PTR [edi+ecx+16],MM5
        movq    QWORD PTR [edi+ecx+8],MM6
        movq    QWORD PTR [edi+ecx],MM7
        jnz     Label16                         ; ZF from SUB above (MOVQ does not modify flags)
Label17:
        emms                                    ; leave MMX state so later x87 code works
        pop     edx
        pop     ecx
        ret
Mmx_MemMove ENDP