This algo is a DWORD compare that drops to a byte compare to test which byte is different. It exits with one of three return values to tell you whether the two bytes are equal, or whether one is greater than or less than the other. It appears to run OK and returns the correct results so far, but it could probably do with some optimisation to get a bit more grunt out of it.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; ---------------------------------------------------------------------
; cmpmema proc buf1:DWORD, buf2:DWORD, bcnt:DWORD
;
; Compare bcnt bytes of two memory buffers. Whole DWORDs are compared
; first; on the first unequal DWORD, or for the 1..3 bytes left over
; after the last whole DWORD, a byte compare decides the ordering.
; Written for 4 byte aligned buffers.
;
; Convention : arguments on the stack, removed by the callee (RET 12).
;              No assembler-generated frame (PROLOGUE/EPILOGUE:NONE),
;              so arguments are addressed relative to ESP.
; In   buf1  : address of the first buffer
;      buf2  : address of the second buffer
;      bcnt  : number of bytes to compare; 0 returns a match
; Out  EAX   :  0  all bcnt bytes are equal
;               1  first differing byte in buf1 is greater
;              -1  first differing byte in buf1 is less
; Uses       : EAX, ECX, EDX, flags. ESI and EDI are preserved.
;
; The byte ordering is SIGNED (JG/JL), so bytes 80h..FFh sort below
; 00h..7Fh.
; NOTE(review): a memcmp-style unsigned ordering would use JA/JB
; instead - confirm which ordering callers expect before changing it.
; ---------------------------------------------------------------------
align 4
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
cmpmema proc buf1:DWORD,buf2:DWORD,bcnt:DWORD

    push esi
    push edi

  ; stack now: [esp]=edi [esp+4]=esi [esp+8]=return address
    mov esi, [esp+12]               ; buf1
    mov edi, [esp+16]               ; buf2
    mov edx, [esp+20]               ; bcnt
    xor ecx, ecx                    ; ECX = byte offset into both buffers
    shr edx, 2                      ; EDX = number of whole DWORDs, sets ZF
    jz tailchk                      ; bcnt < 4 : nothing for the DWORD loop,
                                    ; entering it would read past bcnt and
                                    ; wrap the counter below zero

  align 4
  dwloop:
    mov eax, [esi+ecx]              ; DWORD compare
    cmp eax, [edi+ecx]
    jne differ
    add ecx, 4
    sub edx, 1
    jnz dwloop

  tailchk:
    mov edx, [esp+20]               ; bcnt, calculate any remainder
    and edx, 3
    jz match                        ; no tail bytes, buffers are equal
    jmp byteloop                    ; EDX = 1..3 tail bytes at offset ECX

  differ:
    mov edx, 4                      ; the unequal byte is within this DWORD

  byteloop:
    movzx eax, BYTE PTR [esi+ecx]   ; full register write, no partial AL merge
    cmp al, [edi+ecx]
    jg greater                      ; signed byte ordering, see header
    jl lessthan
    add ecx, 1
    sub edx, 1
    jnz byteloop

  match:
    xor eax, eax                    ; return ZERO on match

  quit:
    pop edi
    pop esi
    ret 12                          ; callee removes the 3 DWORD arguments

  greater:
    mov eax, 1
    jmp quit

  lessthan:
    mov eax, -1
    jmp quit

; -------------------------------------------------
cmpmema endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
Thanks Hutch, that got me started; now let's see if I can beat you.
9 opcodes = 16 byte compares, each giving equal, less than or greater than.
Could we go even faster?
.data
align 16
; 16-byte constants for the SSE2 compare below. They are declared without
; a trailing colon so MASM treats them as byte data, and are accessed with
; an OWORD PTR override because SSE instructions take a 16-byte operand.
twos    db 16 dup (2)
minus1  db 16 dup (-1)
allones db 16 dup (0FFh)            ; was "mask" : MASK is a MASM reserved word
.code
; ---------------------------------------------------------------------
; Loop body fragment: compares 16 bytes at [esi+ecx] (source1) with the
; 16 bytes at [edi+ecx] (source2) and leaves one signed byte per
; position in XMM1 (also stored to "result"):
;      1  source1 byte is greater (signed compare)
;      0  bytes are equal
;     -1  source1 byte is less
; MOVDQA and the memory operands used here require both addresses to be
; 16 byte aligned. XMM0 and XMM1 are overwritten.
; NOTE(review): the loop control, ESI/EDI/ECX setup and the definition
; of "result" are not part of this fragment - confirm against the
; surrounding code. SSE2 mnemonics presumably need the .XMM directive.
; ---------------------------------------------------------------------
; top of loop
    movdqa xmm0, [esi+ecx]              ; source1
    movdqa xmm1, xmm0
  ; greater than or less than
    pcmpgtb xmm1, [edi+ecx]             ; FF where source1 > source2, else 0
    pand xmm1, OWORD PTR twos           ; 2 where greater, else 0
    paddb xmm1, OWORD PTR minus1        ; 2 + -1 = 1  or  0 + -1 = -1
  ; check for equal
    pcmpeqb xmm0, [edi+ecx]             ; FF = equal, 0 = not equal
    pxor xmm0, OWORD PTR allones        ; FF becomes 0 (not)
    pand xmm1, xmm0                     ; everything that is equal becomes zero
    movdqa OWORD PTR result, xmm1
; endwhile
I managed to reduce the execution time by 25%, using 486-compatible code, and I think without disturbing the functionality, but bcnt must be a multiple of 16.
[attachment deleted by admin]