News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Levenshtein Edit Distance in ASM

Started by johnsa, September 26, 2008, 09:28:50 AM

Previous topic - Next topic

johnsa

Hey all,

I've been working on some projects requiring fuzzy matching, mainly in C#, but for interest sake I wanted to see the performance difference of some of the core functions if written in asm.

Here is the first attempt at the Levenshtein Distance function, only restriction is that the input strings are < 256 chars long. Which is all I need, so I pre-create a standard max size matrix.

Also the strings must be padded with 4 zeros, just to use the faster string length macro.

Anyone have any optimizations/ideas/thoughts?




strlen MACRO
mov ecx,-1
align 16
@@:
inc ecx
mov al,[esi+ecx]
test al,al
BTK
jnz short @B
ENDM

strlens MACRO
LOCAL foundLength
mov ebx,-4
align 16
@@:
add ebx,4
mov eax,[esi+ebx]
test eax,0ffh
BNT
jz short foundLength
test eax,0ff00h
BTK
jnz short @B
add ebx,1
jmp short foundLength
test eax,0ff0000h
BTK
jnz short @B
add ebx,2
jmp short foundLength
test eax,0ff000000h
BTK
jnz short @B
add ebx,3
jmp short foundLength
jmp short @B

foundLength:

ENDM

BNT MACRO
db 2eh
ENDM

BTK MACRO
db 3eh
ENDM

levenshtein_distance PROTO string1Addr:DWORD, string2Addr:DWORD

.data?

align 16
levenshtein_matrix db (256*256) DUP(?)
string1len dd ?
string2len dd ?
tempebp dd ?

.code

align 16
levenshtein_distance PROC string1Addr:DWORD, string2Addr:DWORD

push ecx
mov tempebp,ebp

; Length of String 1.
mov esi,string1Addr
strlens
mov string1len,ebx

; Length of String 2.
mov esi,string2Addr
strlens
shl ebx,8
mov string2len,ebx

; Pre-populate matrix first column.
mov esi,offset levenshtein_matrix
mov ecx,(255*256)
add esi,ecx
neg ecx
xor eax,eax
align 16
@@:
mov byte ptr [esi+ecx],al
add eax,1
add ecx,256
jnz short @B

; Main loop and clear cells.
mov esi,string1Addr
mov edi,string2Addr
mov ecx,1
align 16
outer:
mov ebx,256
mov ebp,1
mov byte ptr levenshtein_matrix[ecx],cl
align 16
inner:
mov edx,1
mov al,[esi+ecx-1]
sub al,[edi+ebp-1]
jnz short @F
xor edx,edx
@@:
add dl,levenshtein_matrix[ebx+ecx-1-256]
mov al,levenshtein_matrix[ebx+ecx-1]
inc al
mov ah,levenshtein_matrix[ebx+ecx-256]
inc ah

cmp dl,al
jle short @F
mov dl,al
@@:
cmp dl,ah
jle short @F
mov dl,ah
@@:
mov byte ptr levenshtein_matrix[ebx+ecx],dl

add ebp,1
add ebx,256
cmp ebx,string2len
BTK
jle short inner

add ecx,1
cmp ecx,string1len
BTK
jle short outer

xor eax,eax
mov al,levenshtein_matrix[ebx+ecx-1-256]

mov ebp,tempebp
pop ecx
ret

levenshtein_distance ENDP


johnsa

Just for interest sake the results I've gotten sofar:

1,000,000 iterations timed in C# and ASM for the two strings as follows:

string1 db 'kempton park',0,0,0,0
string2 db 'kempston park',0,0,0,0

C# optimized + release mode build: 6 seconds
ASM: 900ms

Thats on my core2duo 2.5ghz dell laptop.

hutch--

Better than 6 to 1 sounds fine to me, see if you can get it faster.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

dsouza123

Other than filling in the 0 column with values past the length of the string,
1 to 255 always filled in, I didn't see anything else.

For example if the two strings were 'abcd' and 'wxyz'

01234
11234
22234
33334
44444 <- the answer
5
6
7
...
255

You managed to fill in the 0 row only as far as needed,
and it is incorporated in the routine to fill in the matrix.

By fixing it to a 256 x 256 byte matrix you don't have
to size it to fit different length strings.
Above left is -257, above -256, left -1, bytes earlier.

Your code is somewhat like this, past the string length code.

  ss := 'abcd';
  m  := length(ss);
  for i := 1 to length(ss) do s[i] := ss[i];

  ss := 'wxyz';
  n  := length(ss);
  for j := 1 to length(ss) do t[j] := ss[j];

  for j := 0 to 255 do
    d[0,j] := j;

  for i := 0 to m do begin
    d[i,0] := i;
    if i > 0 then
    for j := 1 to n do begin
      if s[i] = t[j] then cost := 0
                     else cost := 1;
      d[i,j] := minimum(d[i-1,j]+1,
                        d[i,j-1]+1,
                        d[i-1,j-1] + cost);
    end;
  end;
  ans := d[m,n];


Are there any other algorithms ?

jj2007

Did you try to replace
inc al
mov ah,levenshtein_matrix[ebx+ecx-256]
inc ah


with this one?
mov ah,levenshtein_matrix[ebx+ecx-256]
add ax, 1+256

johnsa

Hey,

I tried the add ax replacement, slowed it down quite a bit in fact, from 900 to 1200ms.

I don't think there are any other algorithms for doing a levenshtein, without actually doing a completely different type of string similarity comparison.
Sofar I'm reasonably happy with this performance, but I will continue to look for optimizations.

dsouza123

What about these two variations ?


;===========================================================
align 16
inner:
    mov edx,257        ; 1 + (1 shl 8)
    mov al,[esi+ecx-1]
    sub al,[edi+ebp-1]
    jnz short @F
    mov edx,256        ; 0 + (1 shl 8)
@@:
    mov al,levenshtein_matrix[ebx+ecx-1]
    add dx,word ptr levenshtein_matrix[ebx+ecx-1-256]
    inc al

    ; given dl, dh, al  put lowest in  dl

    cmp dl, dh
    jle short @F
    mov dl, dh
@@:
    cmp dl, al
    jle short @F
    mov dl, al
@@:   
    mov byte ptr levenshtein_matrix[ebx+ecx],dl
;===========================================================



;===========================================================
align 16
inner:
    mov edx,257        ; 1 + (1 shl 8)
    mov al,[esi+ecx-1]
    sub al,[edi+ebp-1]
    mov eax,edx        ; mov al,1   is another option
    jnz short @F
    mov edx,256        ; 0 + (1 shl 8)
@@:
    add al,levenshtein_matrix[ebx+ecx-1]
    add dx,word ptr levenshtein_matrix[ebx+ecx-1-256]

    ; given dl, dh, al  put lowest in  dl

    cmp dl, dh
    jle short @F
    mov dl, dh
@@:
    cmp dl, al
    jle short @F
    mov dl, al
@@:   
    mov byte ptr levenshtein_matrix[ebx+ecx],dl
;===========================================================

DoomyD

#7
johnsa, if you are sure about padded strings, I timed the following to be a bit faster than strlens(only a few cycles, but I guess it counts :P):
_strlen   macro txt
   align 4
   mov      esi,txt      ;esi <- buffer
   xor      ecx,ecx      ;ecx <- counter
   @@:
      mov      eax,[esi+ecx]
      add      ecx,4
      sub      eax,01010101h
      BTK
      jnc      @B
   inc      ecx
   inc      al
   jz      @F
   inc      ecx
   inc      ah
   jz      @F
   mov      eax,[esi+ecx-4]
   inc      ecx
   test   al,al
   jz      @F
   inc      ecx
   @@:
   sub      ecx,5
endm

johnsa

Tried all the above suggestions (big thanks!) sofar though they've all come out about 10-15% slower.. I think i'll just stick to the original version at 980ms or so per million. It'll do :)

DoomyD

#9
o.O

I've re-written the code based on your algorithm, it seems to speed things up a bit  :U (note that magicval equals 256[i used it to sync with the data declaration]):ld   proc   s1,s2
      ;m = length(s1)
      _strlen s1
      push   ecx
     
      ;initialize   row 0
      @@:
         mov      table[ecx],cl
         dec      cl
         BTK
         jns      @B
     
      ;n = length(s2)
      _strlen s2
      push   ecx
     
      ;initialize column 0
      imul   eax,ecx,magicval
      @@:
         mov      table[eax],cl
         sub      eax,magicval
         dec      ecx
         BTK
         jnz      @B
     
      ;loop setup
      mov      esi,s1
      mov      edi,s2
      ;xor   eax,eax      ;(already 0)
      ;xor   ecx,ecx      ;i (already 0)
      rp1:               
         xor      edx,edx      ;j
         xor      ebx,ebx      ;table offset
         rp2:
            ;------------------------------------------------------------------------
            mov      al,byte ptr [esi+ecx]
            sub      al,[edi+edx]
            setnz   al                        ;al <- cost
            ;========================================================================
            add      al,table[ebx+ecx]            ;al <- cost + table[i-1][j-1]
            mov      ah,table[ecx+1+ebx]
            .if (al > ah)                     ;al <- minimum(al,table[i][j-1])
               mov      al,ah
               inc      al
            .endif
            mov      ah,table[ecx+ebx+magicval]
            .if (al > ah)                     ;al <- minimum(al,table[i-1][j])
               mov      al,ah
               inc      al
            .endif
            mov      table[ecx+1+ebx+magicval],al   ;table[i][j] <- al
            ;========================================================================
            add      ebx,magicval               ;increase the offset
            inc      edx                        ;j++
            cmp      edx,[esp]                  ;j < n
         BTK   ;------------------------------------------------------------------------
         jnz      rp2
         inc      ecx                           ;i++
         cmp      ecx,[esp+4]                     ;i < m
      BTK      ;------------------------------------------------------------------------
      jnz      rp1
     
      xor      ah,ah                     ;al already contains the last entry
      add      esp,8                     ;restore the stack frame
   pop      ebp
   retn   8
ld endp


EDIT: ...any comments?