Levenshtein Edit Distance in ASM

johnsa · September 26, 2008, 09:28:50 AM

Hey all,

I've been working on some projects requiring fuzzy matching, mainly in C#, but for interest sake I wanted to see the performance difference of some of the core functions if written in asm.

Here is the first attempt at the Levenshtein Distance function, only restriction is that the input strings are < 256 chars long. Which is all I need, so I pre-create a standard max size matrix.

Also the strings must be padded with 4 zeros, just to use the faster string length macro.

Anyone have any optimizations/ideas/thoughts?

Code Select




strlen MACRO
	mov ecx,-1
align 16
@@:	
	inc ecx
	mov al,[esi+ecx]
	test al,al
	BTK
	jnz short @B
ENDM

strlens MACRO
	LOCAL foundLength
	mov ebx,-4
align 16
@@:
	add ebx,4
	mov eax,[esi+ebx]
	test eax,0ffh
	BNT
	jz short foundLength
	test eax,0ff00h
	BTK
	jnz short @B
	add ebx,1
	jmp short foundLength
	test eax,0ff0000h
	BTK
	jnz short @B
	add ebx,2
	jmp short foundLength
	test eax,0ff000000h
	BTK
	jnz short @B
	add ebx,3
	jmp short foundLength	
	jmp short @B
	
foundLength:
		
ENDM

BNT MACRO
	db 2eh	
ENDM

BTK MACRO
	db 3eh	
ENDM

levenshtein_distance PROTO string1Addr:DWORD, string2Addr:DWORD

.data?

align 16
levenshtein_matrix db (256*256) DUP(?)
string1len dd ?
string2len dd ?
tempebp dd ?

.code

align 16
levenshtein_distance PROC string1Addr:DWORD, string2Addr:DWORD

	push ecx
	mov tempebp,ebp
	
	; Length of String 1.
	mov esi,string1Addr
	strlens
	mov string1len,ebx
	
	; Length of String 2.
	mov esi,string2Addr
	strlens
	shl ebx,8
	mov string2len,ebx
	
	; Pre-populate matrix first column.
	mov esi,offset levenshtein_matrix
	mov ecx,(255*256)
	add esi,ecx
	neg ecx
	xor eax,eax
align 16
@@:
	mov byte ptr [esi+ecx],al
	add eax,1
	add ecx,256
	jnz short @B
	
	; Main loop and clear cells.
	mov esi,string1Addr
	mov edi,string2Addr
	mov ecx,1
align 16
outer:
	mov ebx,256
	mov ebp,1
	mov byte ptr levenshtein_matrix[ecx],cl
align 16
inner:
	mov edx,1
	mov al,[esi+ecx-1]
	sub al,[edi+ebp-1]
	jnz short @F
	xor edx,edx
@@:
	add dl,levenshtein_matrix[ebx+ecx-1-256]
	mov al,levenshtein_matrix[ebx+ecx-1]
	inc al
	mov ah,levenshtein_matrix[ebx+ecx-256]
	inc ah

	cmp dl,al
	jle short @F
	mov dl,al
@@:	
	cmp dl,ah
	jle short @F
	mov dl,ah
@@:
	mov byte ptr levenshtein_matrix[ebx+ecx],dl
	
	add ebp,1
	add ebx,256
	cmp ebx,string2len
	BTK
	jle short inner
	
	add ecx,1
	cmp ecx,string1len
	BTK
	jle short outer

	xor eax,eax
	mov al,levenshtein_matrix[ebx+ecx-1-256]
	
	mov ebp,tempebp
	pop ecx	
	ret
	
levenshtein_distance ENDP

johnsa · September 26, 2008, 09:33:46 AM

Just for interest sake the results I've gotten sofar:

1,000,000 iterations timed in C# and ASM for the two strings as follows:

string1 db 'kempton park',0,0,0,0
string2 db 'kempston park',0,0,0,0

C# optimized + release mode build: 6 seconds
ASM: 900ms

Thats on my core2duo 2.5ghz dell laptop.

hutch-- · September 26, 2008, 11:48:39 AM

Better than 6 to 1 sounds fine to me, see if you can get it faster.

dsouza123 · September 29, 2008, 04:16:28 AM

Other than filling in the 0 column with values past the length of the string,
1 to 255 always filled in, I didn't see anything else.

For example if the two strings were 'abcd' and 'wxyz'

01234
11234
22234
33334
44444 <- the answer
5
6
7
...
255

You managed to fill in the 0 row only as far as needed,
and it is incorporated in the routine to fill in the matrix.

By fixing it to a 256 x 256 byte matrix you don't have
to size it to fit different length strings.
Above left is -257, above -256, left -1, bytes earlier.

Your code is somewhat like this, past the string length code.

Code Select


  ss := 'abcd';
  m  := length(ss);
  for i := 1 to length(ss) do s[i] := ss[i];

  ss := 'wxyz';
  n  := length(ss);
  for j := 1 to length(ss) do t[j] := ss[j];

  for j := 0 to 255 do
    d[0,j] := j;

  for i := 0 to m do begin
    d[i,0] := i;
    if i > 0 then
    for j := 1 to n do begin
      if s[i] = t[j] then cost := 0
                     else cost := 1;
      d[i,j] := minimum(d[i-1,j]+1,
                        d[i,j-1]+1,
                        d[i-1,j-1] + cost);
    end;
  end;
  ans := d[m,n];

Are there any other algorithms ?

jj2007 · September 29, 2008, 06:53:54 AM

Did you try to replace

Code Select

inc al
mov ah,levenshtein_matrix[ebx+ecx-256]
inc ah

with this one?

Code Select

mov ah,levenshtein_matrix[ebx+ecx-256]
add ax, 1+256

johnsa · September 29, 2008, 07:39:31 AM

Hey,

I tried the add ax replacement, slowed it down quite a bit in fact, from 900 to 1200ms.

I don't think there are any other algorithms for doing a levenshtein, without actually doing a completely different type of string similarity comparison.
Sofar I'm reasonably happy with this performance, but I will continue to look for optimizations.

dsouza123 · September 30, 2008, 02:21:38 AM

What about these two variations ?

Code Select


;===========================================================
align 16
inner:
    mov edx,257        ; 1 + (1 shl 8)
    mov al,[esi+ecx-1]
    sub al,[edi+ebp-1]
    jnz short @F
    mov edx,256        ; 0 + (1 shl 8)
@@:
    mov al,levenshtein_matrix[ebx+ecx-1]
    add dx,word ptr levenshtein_matrix[ebx+ecx-1-256]
    inc al

    ; given dl, dh, al  put lowest in  dl 

    cmp dl, dh
    jle short @F
    mov dl, dh
@@:
    cmp dl, al
    jle short @F
    mov dl, al
@@:    
    mov byte ptr levenshtein_matrix[ebx+ecx],dl
;===========================================================

Code Select


;===========================================================
align 16
inner:
    mov edx,257        ; 1 + (1 shl 8)
    mov al,[esi+ecx-1]
    sub al,[edi+ebp-1]
    mov eax,edx        ; mov al,1   is another option
    jnz short @F
    mov edx,256        ; 0 + (1 shl 8)
@@:
    add al,levenshtein_matrix[ebx+ecx-1]
    add dx,word ptr levenshtein_matrix[ebx+ecx-1-256]

    ; given dl, dh, al  put lowest in  dl 

    cmp dl, dh
    jle short @F
    mov dl, dh
@@:
    cmp dl, al
    jle short @F
    mov dl, al
@@:    
    mov byte ptr levenshtein_matrix[ebx+ecx],dl
;===========================================================

DoomyD · September 30, 2008, 08:45:54 PM

johnsa, if you are sure about padded strings, I timed the following to be a bit faster than strlens(only a few cycles, but I guess it counts :P):

Code Select

_strlen   macro txt
   align 4
   mov      esi,txt      ;esi <- buffer
   xor      ecx,ecx      ;ecx <- counter
   @@:
      mov      eax,[esi+ecx]
      add      ecx,4
      sub      eax,01010101h
      BTK
      jnc      @B
   inc      ecx
   inc      al
   jz      @F
   inc      ecx
   inc      ah
   jz      @F
   mov      eax,[esi+ecx-4]
   inc      ecx
   test   al,al
   jz      @F
   inc      ecx
   @@:
   sub      ecx,5
endm

johnsa · October 01, 2008, 11:14:05 AM

Tried all the above suggestions (big thanks!) sofar though they've all come out about 10-15% slower.. I think i'll just stick to the original version at 980ms or so per million. It'll do :)

DoomyD · October 01, 2008, 07:51:50 PM

o.O

I've re-written the code based on your algorithm, it seems to speed things up a bit :U (note that magicval equals 256[i used it to sync with the data declaration]):

Code Select

ld   proc   s1,s2
      ;m = length(s1)
      _strlen s1
      push   ecx
      
      ;initialize   row 0
      @@:
         mov      table[ecx],cl
         dec      cl
         BTK
         jns      @B
      
      ;n = length(s2)
      _strlen s2
      push   ecx
      
      ;initialize column 0
      imul   eax,ecx,magicval
      @@:
         mov      table[eax],cl
         sub      eax,magicval
         dec      ecx
         BTK
         jnz      @B
      
      ;loop setup
      mov      esi,s1
      mov      edi,s2
      ;xor   eax,eax      ;(already 0)
      ;xor   ecx,ecx      ;i (already 0)
      rp1:               
         xor      edx,edx      ;j
         xor      ebx,ebx      ;table offset
         rp2:
            ;------------------------------------------------------------------------
            mov      al,byte ptr [esi+ecx]
            sub      al,[edi+edx]
            setnz   al                        ;al <- cost
            ;========================================================================
            add      al,table[ebx+ecx]            ;al <- cost + table[i-1][j-1]
            mov      ah,table[ecx+1+ebx]
            .if (al > ah)                     ;al <- minimum(al,table[i][j-1])
               mov      al,ah
               inc      al
            .endif
            mov      ah,table[ecx+ebx+magicval]
            .if (al > ah)                     ;al <- minimum(al,table[i-1][j])
               mov      al,ah
               inc      al
            .endif
            mov      table[ecx+1+ebx+magicval],al   ;table[i][j] <- al
            ;========================================================================
            add      ebx,magicval               ;increase the offset
            inc      edx                        ;j++
            cmp      edx,[esp]                  ;j < n
         BTK   ;------------------------------------------------------------------------
         jnz      rp2
         inc      ecx                           ;i++
         cmp      ecx,[esp+4]                     ;i < m
      BTK      ;------------------------------------------------------------------------
      jnz      rp1
      
      xor      ah,ah                     ;al already contains the last entry
      add      esp,8                     ;restore the stack frame
   pop      ebp
   retn   8
ld endp

EDIT: ...any comments?

News:

Levenshtein Edit Distance in ASM

johnsa

johnsa

hutch--

dsouza123

jj2007

johnsa

dsouza123

DoomyD

johnsa

DoomyD