Hey all,
I've been working on some projects requiring fuzzy matching, mainly in C#, but for interest sake I wanted to see the performance difference of some of the core functions if written in asm.
Here is the first attempt at the Levenshtein Distance function, only restriction is that the input strings are < 256 chars long. Which is all I need, so I pre-create a standard max size matrix.
Also the strings must be padded with 4 zeros, just to use the faster string length macro.
Anyone have any optimizations/ideas/thoughts?
strlen MACRO
mov ecx,-1
align 16
@@:
inc ecx
mov al,[esi+ecx]
test al,al
BTK
jnz short @B
ENDM
strlens MACRO
LOCAL foundLength
mov ebx,-4
align 16
@@:
add ebx,4
mov eax,[esi+ebx]
test eax,0ffh
BNT
jz short foundLength
test eax,0ff00h
BTK
jnz short @B
add ebx,1
jmp short foundLength
test eax,0ff0000h
BTK
jnz short @B
add ebx,2
jmp short foundLength
test eax,0ff000000h
BTK
jnz short @B
add ebx,3
jmp short foundLength
jmp short @B
foundLength:
ENDM
BNT MACRO
db 2eh
ENDM
BTK MACRO
db 3eh
ENDM
levenshtein_distance PROTO string1Addr:DWORD, string2Addr:DWORD
.data?
align 16
levenshtein_matrix db (256*256) DUP(?)
string1len dd ?
string2len dd ?
tempebp dd ?
.code
align 16
levenshtein_distance PROC string1Addr:DWORD, string2Addr:DWORD
push ecx
mov tempebp,ebp
; Length of String 1.
mov esi,string1Addr
strlens
mov string1len,ebx
; Length of String 2.
mov esi,string2Addr
strlens
shl ebx,8
mov string2len,ebx
; Pre-populate matrix first column.
mov esi,offset levenshtein_matrix
mov ecx,(255*256)
add esi,ecx
neg ecx
xor eax,eax
align 16
@@:
mov byte ptr [esi+ecx],al
add eax,1
add ecx,256
jnz short @B
; Main loop and clear cells.
mov esi,string1Addr
mov edi,string2Addr
mov ecx,1
align 16
outer:
mov ebx,256
mov ebp,1
mov byte ptr levenshtein_matrix[ecx],cl
align 16
inner:
mov edx,1
mov al,[esi+ecx-1]
sub al,[edi+ebp-1]
jnz short @F
xor edx,edx
@@:
add dl,levenshtein_matrix[ebx+ecx-1-256]
mov al,levenshtein_matrix[ebx+ecx-1]
inc al
mov ah,levenshtein_matrix[ebx+ecx-256]
inc ah
cmp dl,al
jle short @F
mov dl,al
@@:
cmp dl,ah
jle short @F
mov dl,ah
@@:
mov byte ptr levenshtein_matrix[ebx+ecx],dl
add ebp,1
add ebx,256
cmp ebx,string2len
BTK
jle short inner
add ecx,1
cmp ecx,string1len
BTK
jle short outer
xor eax,eax
mov al,levenshtein_matrix[ebx+ecx-1-256]
mov ebp,tempebp
pop ecx
ret
levenshtein_distance ENDP
Just for interest sake the results I've gotten sofar:
1,000,000 iterations timed in C# and ASM for the two strings as follows:
string1 db 'kempton park',0,0,0,0
string2 db 'kempston park',0,0,0,0
C# optimized + release mode build: 6 seconds
ASM: 900ms
Thats on my core2duo 2.5ghz dell laptop.
Better than 6 to 1 sounds fine to me, see if you can get it faster.
Other than filling in the 0 column with values past the length of the string,
1 to 255 always filled in, I didn't see anything else.
For example if the two strings were 'abcd' and 'wxyz'
01234
11234
22234
33334
44444 <- the answer
5
6
7
...
255
You managed to fill in the 0 row only as far as needed,
and it is incorporated in the routine to fill in the matrix.
By fixing it to a 256 x 256 byte matrix you don't have
to size it to fit different length strings.
Above left is -257, above -256, left -1, bytes earlier.
Your code is somewhat like this, past the string length code.
ss := 'abcd';
m := length(ss);
for i := 1 to length(ss) do s[i] := ss[i];
ss := 'wxyz';
n := length(ss);
for j := 1 to length(ss) do t[j] := ss[j];
for j := 0 to 255 do
d[0,j] := j;
for i := 0 to m do begin
d[i,0] := i;
if i > 0 then
for j := 1 to n do begin
if s[i] = t[j] then cost := 0
else cost := 1;
d[i,j] := minimum(d[i-1,j]+1,
d[i,j-1]+1,
d[i-1,j-1] + cost);
end;
end;
ans := d[m,n];
Are there any other algorithms ?
Did you try to replace
inc al
mov ah,levenshtein_matrix[ebx+ecx-256]
inc ah
with this one?
mov ah,levenshtein_matrix[ebx+ecx-256]
add ax, 1+256
Hey,
I tried the add ax replacement, slowed it down quite a bit in fact, from 900 to 1200ms.
I don't think there are any other algorithms for doing a levenshtein, without actually doing a completely different type of string similarity comparison.
Sofar I'm reasonably happy with this performance, but I will continue to look for optimizations.
What about these two variations ?
;===========================================================
align 16
inner:
mov edx,257 ; 1 + (1 shl 8)
mov al,[esi+ecx-1]
sub al,[edi+ebp-1]
jnz short @F
mov edx,256 ; 0 + (1 shl 8)
@@:
mov al,levenshtein_matrix[ebx+ecx-1]
add dx,word ptr levenshtein_matrix[ebx+ecx-1-256]
inc al
; given dl, dh, al put lowest in dl
cmp dl, dh
jle short @F
mov dl, dh
@@:
cmp dl, al
jle short @F
mov dl, al
@@:
mov byte ptr levenshtein_matrix[ebx+ecx],dl
;===========================================================
;===========================================================
align 16
inner:
mov edx,257 ; 1 + (1 shl 8)
mov al,[esi+ecx-1]
sub al,[edi+ebp-1]
mov eax,edx ; mov al,1 is another option
jnz short @F
mov edx,256 ; 0 + (1 shl 8)
@@:
add al,levenshtein_matrix[ebx+ecx-1]
add dx,word ptr levenshtein_matrix[ebx+ecx-1-256]
; given dl, dh, al put lowest in dl
cmp dl, dh
jle short @F
mov dl, dh
@@:
cmp dl, al
jle short @F
mov dl, al
@@:
mov byte ptr levenshtein_matrix[ebx+ecx],dl
;===========================================================
johnsa, if you are sure about padded strings, I timed the following to be a bit faster than strlens(only a few cycles, but I guess it counts :P):
_strlen macro txt
align 4
mov esi,txt ;esi <- buffer
xor ecx,ecx ;ecx <- counter
@@:
mov eax,[esi+ecx]
add ecx,4
sub eax,01010101h
BTK
jnc @B
inc ecx
inc al
jz @F
inc ecx
inc ah
jz @F
mov eax,[esi+ecx-4]
inc ecx
test al,al
jz @F
inc ecx
@@:
sub ecx,5
endm
Tried all the above suggestions (big thanks!) sofar though they've all come out about 10-15% slower.. I think i'll just stick to the original version at 980ms or so per million. It'll do :)
o.O
I've re-written the code based on your algorithm, it seems to speed things up a bit :U (note that magicval equals 256[i used it to sync with the data declaration]):ld proc s1,s2
;m = length(s1)
_strlen s1
push ecx
;initialize row 0
@@:
mov table[ecx],cl
dec cl
BTK
jns @B
;n = length(s2)
_strlen s2
push ecx
;initialize column 0
imul eax,ecx,magicval
@@:
mov table[eax],cl
sub eax,magicval
dec ecx
BTK
jnz @B
;loop setup
mov esi,s1
mov edi,s2
;xor eax,eax ;(already 0)
;xor ecx,ecx ;i (already 0)
rp1:
xor edx,edx ;j
xor ebx,ebx ;table offset
rp2:
;------------------------------------------------------------------------
mov al,byte ptr [esi+ecx]
sub al,[edi+edx]
setnz al ;al <- cost
;========================================================================
add al,table[ebx+ecx] ;al <- cost + table[i-1][j-1]
mov ah,table[ecx+1+ebx]
.if (al > ah) ;al <- minimum(al,table[i][j-1])
mov al,ah
inc al
.endif
mov ah,table[ecx+ebx+magicval]
.if (al > ah) ;al <- minimum(al,table[i-1][j])
mov al,ah
inc al
.endif
mov table[ecx+1+ebx+magicval],al ;table[i][j] <- al
;========================================================================
add ebx,magicval ;increase the offset
inc edx ;j++
cmp edx,[esp] ;j < n
BTK ;------------------------------------------------------------------------
jnz rp2
inc ecx ;i++
cmp ecx,[esp+4] ;i < m
BTK ;------------------------------------------------------------------------
jnz rp1
xor ah,ah ;al already contains the last entry
add esp,8 ;restore the stack frame
pop ebp
retn 8
ld endp
EDIT: ...any comments?