News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

SWAP$

Started by Ficko, November 30, 2009, 09:40:52 PM

Previous topic - Next topic

Ficko

Hi!

Before you say wtf**k is that I better expalin what my criteria was. :bg

1.) Swapping 2 strings on the spot.
2.) The exact lenghts have to be moved no extra bytes allowed.
3.) Max SIMD to be used is MMX (PII)
4.) Fast as possible.

I don't know the fourth one is achieved  :P but I think I have the first 3 done.


.686p
.model flat
.mmx
; ===========================================================================
.code
; SWAP$(String1:STRING,String2:STRING)
; =============== S U B R O U T I N E =======================================
align 16
SWAP$ proc near public
      pushREV  esi,edi,ebx
      mov   esi, [esp+20]     ;ESI = Source
      mov   edi, [esp+16]   ;EDI = Destination
      pxor   mm0, mm0    ;zero for comparison mm0
      pxor  mm1, mm1    ;zero for comparison mm1
C5:   movq   mm2, [esi]    ;move 8 bytes into mm2
      movq  mm3, [edi]    ;move 8 bytes into mm3
      pcmpeqb  mm0, mm2    ;set bytes in xmm0 to FF if nullbytes found in xmm2
      pcmpeqb  mm1, mm3    ;set bytes in xmm1 to FF if nullbytes found in xmm3
      xor   ebx, ebx
      movd  ecx, mm0
      test  ecx, ecx
      jnz   @F
      psrlq  mm0, 32
      movd  ecx, mm0
      mov   bl, 4
@@:   movd  edx, mm1
      test  edx, edx
      jnz   @F
      psrlq  mm1, 32
      movd  edx, mm1
      mov   bh, 4   
@@:   bsf   ecx, ecx
      setz  al
      shr   ecx, 3
      add   cl, bl
      bsf   edx, edx
      setz  ah
      shr   edx, 3
      add   dl, bh
      test  al, ah
      jz   @F
      movq  [edi], mm2
      movq  [esi], mm3
      lea   esi, [esi+8]
      lea   edi, [edi+8]
      jmp   C5     
@@:   cmp   al, ah
      jb   SourceShort
      ja   DestShort
C0:   movd  eax, mm2
      mov   cl, 4
@@:   test  al, al
      stosb
      jz   L0
      shr   eax, 8
      loop  @B
      psrlq  mm2, 32
      jmp   C0
L0:   xchg  edi, esi
C1:   movd  eax, mm3
      mov   cl, 4
@@:   test  al, al
      stosb
      jz   L1
      shr   eax, 8
      loop  @B
      psrlq  mm3, 32
      jmp   C1   
L1:   popREV  ebx,edi,esi
      emms
      retn   8
DestShort: sub   edx, 8
      xchg  edi, esi
      not   edx
C3:   movd  eax, mm3
      mov   cl, 4
@@:   test  al, al
      stosb
      jz   L3
      shr   eax, 8
      loop  @B 
      psrlq  mm3, 32
      jmp   C3
L3:   movq  [esi], mm2
      jmp   L4   
SourceShort:mov   edx, ecx
      sub   edx, 8
      not   edx
C2:   movd  eax, mm2
      mov   cl, 4
@@:   test  al, al
      stosb
      jz   L2
      shr   eax, 8
      loop  @B
      psrlq  mm2, 32
      jmp   C2
L2:   movq  [esi], mm3
L4:   lea   esi, [esi+8]
      add   edi, edx
      xchg  edi, esi   
      mov   ebx, edi
      mov   dl, 3
@@:   test   esi, edx
      jz    L5
      lodsb   
      stosb   
      test   al, al
      jnz   @B
      jmp   L1
L5:   inc   edx
C4:   mov   ecx, edx
      mov   eax, [esi]
@@:   stosb   
      test   al, al
      jz    L1
      shr   eax, 8
      dec   ecx
      jnz   @B
      add   esi, edx
      jmp   C4   
SWAP$ endp
end

dedndave

without disecting all of your code...
you could use simd to move data that is properly aligned for that
on 4-aligned addresses, you could use dwords
then, use a few bytes at the beginning and end to take care of the unaligned data
this assumes that the destination buffer is aligned the same as the source - lol
if they are not, things could get messy
i did a similar thing to scan for unused bytes in bignum integers (also to load the first few) in LLKF9_1.zip...
http://www.masm32.com/board/index.php?topic=12363.msg94779#msg94779

jj2007

Excuse my ignorant question: Why don't you just swap the pointers...?

dedndave

lol JJ - spoil sport
you really know how to take all the fun out of being wrong - lol

jj2007

 :bg

But seriously: Is there a real life application of shuffling two strings around, instead of swapping their pointers?
Just curious...

MichaelW

Swapping pointers is too easy.

Your code does not correctly handle different length strings when the second is longer than the first, and the meaning of criteria 2 is not clear to me, so I assumed for my simple byte scanner that both strings will always be the same length.

;==========================================================================
    include \masm32\include\masm32rt.inc
    .686
    .mmx
    include \masm32\macros\timers.asm
;==========================================================================
    .data
        strx db "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",0
        stry db "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyy",0
    .code
;==========================================================================
; SWAP$(String1:STRING,String2:STRING)
;==========================================================================
align 16

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

SWAP$ proc psz1:DWORD, psz2:DWORD

      push ebx
      push edi
      push esi
      mov   esi, [esp+20]     ;ESI = Source
      mov   edi, [esp+16]   ;EDI = Destination
      pxor   mm0, mm0    ;zero for comparison mm0
      pxor  mm1, mm1    ;zero for comparison mm1
C5:   movq   mm2, [esi]    ;move 8 bytes into mm2
      movq  mm3, [edi]    ;move 8 bytes into mm3
      pcmpeqb  mm0, mm2    ;set bytes in xmm0 to FF if nullbytes found in xmm2
      pcmpeqb  mm1, mm3    ;set bytes in xmm1 to FF if nullbytes found in xmm3
      xor   ebx, ebx
      movd  ecx, mm0
      test  ecx, ecx
      jnz   @F
      psrlq  mm0, 32
      movd  ecx, mm0
      mov   bl, 4
@@:   movd  edx, mm1
      test  edx, edx
      jnz   @F
      psrlq  mm1, 32
      movd  edx, mm1
      mov   bh, 4
@@:   bsf   ecx, ecx
      setz  al
      shr   ecx, 3
      add   cl, bl
      bsf   edx, edx
      setz  ah
      shr   edx, 3
      add   dl, bh
      test  al, ah
      jz   @F
      movq  [edi], mm2
      movq  [esi], mm3
      lea   esi, [esi+8]
      lea   edi, [edi+8]
      jmp   C5
@@:   cmp   al, ah
      jb   SourceShort
      ja   DestShort
C0:   movd  eax, mm2
      mov   cl, 4
@@:   test  al, al
      stosb
      jz   L0
      shr   eax, 8
      loop  @B
      psrlq  mm2, 32
      jmp   C0
L0:   xchg  edi, esi
C1:   movd  eax, mm3
      mov   cl, 4
@@:   test  al, al
      stosb
      jz   L1
      shr   eax, 8
      loop  @B
      psrlq  mm3, 32
      jmp   C1
L1:   pop esi
      pop edi
      pop ebx
      emms
      ret   8
DestShort: sub   edx, 8
      xchg  edi, esi
      not   edx
C3:   movd  eax, mm3
      mov   cl, 4
@@:   test  al, al
      stosb
      jz   L3
      shr   eax, 8
      loop  @B
      psrlq  mm3, 32
      jmp   C3
L3:   movq  [esi], mm2
      jmp   L4
SourceShort:mov   edx, ecx
      sub   edx, 8
      not   edx
C2:   movd  eax, mm2
      mov   cl, 4
@@:   test  al, al
      stosb
      jz   L2
      shr   eax, 8
      loop  @B
      psrlq  mm2, 32
      jmp   C2
L2:   movq  [esi], mm3
L4:   lea   esi, [esi+8]
      add   edi, edx
      xchg  edi, esi
      mov   ebx, edi
      mov   dl, 3
@@:   test   esi, edx
      jz    L5
      lodsb
      stosb
      test   al, al
      jnz   @B
      jmp   L1
L5:   inc   edx
C4:   mov   ecx, edx
      mov   eax, [esi]
@@:   stosb
      test   al, al
      jz    L1
      shr   eax, 8
      dec   ecx
      jnz   @B
      add   esi, edx
      jmp   C4
SWAP$ endp

OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

;==========================================================================

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

SwapBS proc psz1:DWORD, psz2:DWORD

    push esi
    push edi
    mov esi, [esp+12]
    mov edi, [esp+16]
    xor ecx, ecx
  @@:
    movzx eax, BYTE PTR [esi+ecx]
    movzx edx, BYTE PTR [edi+ecx]
    mov [edi+ecx], al
    mov [esi+ecx], dl
    inc ecx
    test eax, eax
    jnz  @B
    pop edi
    pop esi
    ret 8

SwapBS endp

OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

;==========================================================================
start:
;==========================================================================
    print ADDR strx,13,10
    print ADDR stry,13,10
    invoke SWAP$, ADDR strx, ADDR stry
    print ADDR strx,13,10
    print ADDR stry,13,10
    invoke SwapBS, ADDR strx, ADDR stry
    print ADDR strx,13,10
    print ADDR stry,13,10,13,10

    invoke Sleep, 4000

    counter_begin 1000, HIGH_PRIORITY_CLASS
        invoke SWAP$, ADDR strx, ADDR stry
    counter_end
    print str$(eax)," cycles, SWAP$",13,10

    counter_begin 1000, HIGH_PRIORITY_CLASS
        invoke SwapBS, ADDR strx, ADDR stry
    counter_end
    print str$(eax)," cycles, SwapBS",13,10,13,10

    inkey "Press any key to exit..."
    exit
;==========================================================================
end start


Running on a P3:

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyy

289 cycles, SWAP$
139 cycles, SwapBS


Eliminating the emms reduced the cycle count to 279.
eschew obfuscation

jj2007

Celeron M:

268     bytes for SWAP$
36      bytes for SwapBS
31      bytes for SwapJ1$
28      bytes for SwapJ2$
24      bytes for SwapJ3$

208 cycles, SWAP$
96 cycles, SwapBS
96 cycles, SwapJ1$
146 cycles, SwapJ2$
218 cycles, SwapJ3$

hutch--

You can do an in place swap string which handles variable length strings with 3 replace passes.

text1 : string with SHORT text and VERY_LONG_STRING text to be swapped over

Output

text2 : string with VERY_LONG_STRING text and SHORT text to be swapped over

The logic is simple. Replace 1st text with marker(*.*.*.*.*.*.*.*), replace second text with first, replace marker with second.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

Ficko

Quote
But seriously: Is there a real life application of shuffling two strings around, instead of swapping their pointers?
Just curious...

You wanna sort a string array with "Quick sort" for example. :bg

jj2007

Quote from: Ficko on December 01, 2009, 09:53:46 AM
Quote
But seriously: Is there a real life application of shuffling two strings around, instead of swapping their pointers?
Just curious...

You wanna sort a string array with "Quick sort" for example. :bg

Wouldn't call that technique "Quick sort". Maybe "Snail sort" ... :wink

Ficko

@ MichaelW
Quote
Your code does not correctly handle different length strings when the second is longer than the first, and the meaning of criteria 2 is not clear to me.....

Hmm ??? :red

Why you think that?

I did this "Endurance" test - excuse me I did it from Basic and not from JJ MASM-Basic but I am not that fit on that jet but making progress :P -
and no discrepancy came up. ::)


DEF S1,S2 :STRING
DEF A,B,Error :UINT
MakeConsole()
S1 = STRING$(255,"=") 'Fill up the strings space with bogus data to see that no extra moving is happen.
S2 = S1
FOR B = 0 TO 30
FOR A = 0 TO 30
S1 = STRING$(B,"X")
S2 = STRING$(A,"*")
SWAP$(S1,S2)
PRINT "A=",USING_("'%03u'",A),"=",USING_("'%03u'",LEN(S1))," ",S1
PRINT "B=",USING_("'%03u'",B),"=",USING_("'%03u'",LEN(S2))," ",S2
IF (A <> LEN(S1)) OR B <> LEN(S2) THEN Error = TRUE
NEXT A
IF Error = TRUE THEN
PRINT "ERROR !!!"
BREAKFOR
ENDIF
NEXT B

jj2007

Ficko,
The open question is how to handle different string lengths in assembler. If you have a string A with 50 characters, and a string B with 20, then when moving A into B's memory space you get a buffer overflow. So you cannot simply swap the contents. What you could do is allocate a new string B with 50 chars, and re-alloc the new A to 20 chars; but these operations are very slow...
Or have we completely misunderstood something?

Ficko

Quote
...If you have a string A with 50 characters, and a string B with 20, then when moving A into B's memory space you get a buffer overflow...
Ok I got the worry now. :wink

Yes the space of course have to be at least so large to accommodate the longest string.
Like in a string array.
The element size of the array would be the longest string size + the ending zero.

The algo turned out that complicated because I want to make it to average the highest speed of any possible variation.
Like one string can be empty and the other 2000 chars long it would be waste of time to switch 2x2000 bytes.

@Dave
Quote
you could use simd to move data that is properly aligned for that
on 4-aligned addresses, you could use dwords...

Yes I thought about that but I wasn't able to come up with a code that would not extremely slow down short strings.
So I just align on dwords if one of the string is much longer than the other.

hutch--

 :bg

Does the 3 pass replace start to sound attractive yet ? Just make the target buffer bigger by enough, slap bamm thankya maam and you get it word perfect in 3 passes, the first will be slower than the next 2. The techinque has the advantage that it will do any string pair sizes and any count of swaps.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

dedndave

QuoteSo I just align on dwords if one of the string is much longer than the other.
that was my solution for replacing scasb, as well - scasb isn't that slow, but STD is a dog - lol

QuoteDoes the 3 pass replace start to sound attractive yet ?
i think JJ has the right idea - swap pointers, instead