Hi!
Before you say wtf**k is that I better expalin what my criteria was. :bg
1.) Swapping 2 strings on the spot.
2.) The exact lenghts have to be moved no extra bytes allowed.
3.) Max SIMD to be used is MMX (PII)
4.) Fast as possible.
I don't know the fourth one is achieved :P but I think I have the first 3 done.
.686p
.model flat
.mmx
; ===========================================================================
.code
; SWAP$(String1:STRING,String2:STRING)
; =============== S U B R O U T I N E =======================================
align 16
SWAP$ proc near public
pushREV esi,edi,ebx
mov esi, [esp+20] ;ESI = Source
mov edi, [esp+16] ;EDI = Destination
pxor mm0, mm0 ;zero for comparison mm0
pxor mm1, mm1 ;zero for comparison mm1
C5: movq mm2, [esi] ;move 8 bytes into mm2
movq mm3, [edi] ;move 8 bytes into mm3
pcmpeqb mm0, mm2 ;set bytes in xmm0 to FF if nullbytes found in xmm2
pcmpeqb mm1, mm3 ;set bytes in xmm1 to FF if nullbytes found in xmm3
xor ebx, ebx
movd ecx, mm0
test ecx, ecx
jnz @F
psrlq mm0, 32
movd ecx, mm0
mov bl, 4
@@: movd edx, mm1
test edx, edx
jnz @F
psrlq mm1, 32
movd edx, mm1
mov bh, 4
@@: bsf ecx, ecx
setz al
shr ecx, 3
add cl, bl
bsf edx, edx
setz ah
shr edx, 3
add dl, bh
test al, ah
jz @F
movq [edi], mm2
movq [esi], mm3
lea esi, [esi+8]
lea edi, [edi+8]
jmp C5
@@: cmp al, ah
jb SourceShort
ja DestShort
C0: movd eax, mm2
mov cl, 4
@@: test al, al
stosb
jz L0
shr eax, 8
loop @B
psrlq mm2, 32
jmp C0
L0: xchg edi, esi
C1: movd eax, mm3
mov cl, 4
@@: test al, al
stosb
jz L1
shr eax, 8
loop @B
psrlq mm3, 32
jmp C1
L1: popREV ebx,edi,esi
emms
retn 8
DestShort: sub edx, 8
xchg edi, esi
not edx
C3: movd eax, mm3
mov cl, 4
@@: test al, al
stosb
jz L3
shr eax, 8
loop @B
psrlq mm3, 32
jmp C3
L3: movq [esi], mm2
jmp L4
SourceShort:mov edx, ecx
sub edx, 8
not edx
C2: movd eax, mm2
mov cl, 4
@@: test al, al
stosb
jz L2
shr eax, 8
loop @B
psrlq mm2, 32
jmp C2
L2: movq [esi], mm3
L4: lea esi, [esi+8]
add edi, edx
xchg edi, esi
mov ebx, edi
mov dl, 3
@@: test esi, edx
jz L5
lodsb
stosb
test al, al
jnz @B
jmp L1
L5: inc edx
C4: mov ecx, edx
mov eax, [esi]
@@: stosb
test al, al
jz L1
shr eax, 8
dec ecx
jnz @B
add esi, edx
jmp C4
SWAP$ endp
end
without disecting all of your code...
you could use simd to move data that is properly aligned for that
on 4-aligned addresses, you could use dwords
then, use a few bytes at the beginning and end to take care of the unaligned data
this assumes that the destination buffer is aligned the same as the source - lol
if they are not, things could get messy
i did a similar thing to scan for unused bytes in bignum integers (also to load the first few) in LLKF9_1.zip...
http://www.masm32.com/board/index.php?topic=12363.msg94779#msg94779
Excuse my ignorant question: Why don't you just swap the pointers...?
lol JJ - spoil sport
you really know how to take all the fun out of being wrong - lol
:bg
But seriously: Is there a real life application of shuffling two strings around, instead of swapping their pointers?
Just curious...
Swapping pointers is too easy.
Your code does not correctly handle different length strings when the second is longer than the first, and the meaning of criteria 2 is not clear to me, so I assumed for my simple byte scanner that both strings will always be the same length.
;==========================================================================
include \masm32\include\masm32rt.inc
.686
.mmx
include \masm32\macros\timers.asm
;==========================================================================
.data
strx db "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",0
stry db "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyy",0
.code
;==========================================================================
; SWAP$(String1:STRING,String2:STRING)
;==========================================================================
align 16
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
SWAP$ proc psz1:DWORD, psz2:DWORD
push ebx
push edi
push esi
mov esi, [esp+20] ;ESI = Source
mov edi, [esp+16] ;EDI = Destination
pxor mm0, mm0 ;zero for comparison mm0
pxor mm1, mm1 ;zero for comparison mm1
C5: movq mm2, [esi] ;move 8 bytes into mm2
movq mm3, [edi] ;move 8 bytes into mm3
pcmpeqb mm0, mm2 ;set bytes in xmm0 to FF if nullbytes found in xmm2
pcmpeqb mm1, mm3 ;set bytes in xmm1 to FF if nullbytes found in xmm3
xor ebx, ebx
movd ecx, mm0
test ecx, ecx
jnz @F
psrlq mm0, 32
movd ecx, mm0
mov bl, 4
@@: movd edx, mm1
test edx, edx
jnz @F
psrlq mm1, 32
movd edx, mm1
mov bh, 4
@@: bsf ecx, ecx
setz al
shr ecx, 3
add cl, bl
bsf edx, edx
setz ah
shr edx, 3
add dl, bh
test al, ah
jz @F
movq [edi], mm2
movq [esi], mm3
lea esi, [esi+8]
lea edi, [edi+8]
jmp C5
@@: cmp al, ah
jb SourceShort
ja DestShort
C0: movd eax, mm2
mov cl, 4
@@: test al, al
stosb
jz L0
shr eax, 8
loop @B
psrlq mm2, 32
jmp C0
L0: xchg edi, esi
C1: movd eax, mm3
mov cl, 4
@@: test al, al
stosb
jz L1
shr eax, 8
loop @B
psrlq mm3, 32
jmp C1
L1: pop esi
pop edi
pop ebx
emms
ret 8
DestShort: sub edx, 8
xchg edi, esi
not edx
C3: movd eax, mm3
mov cl, 4
@@: test al, al
stosb
jz L3
shr eax, 8
loop @B
psrlq mm3, 32
jmp C3
L3: movq [esi], mm2
jmp L4
SourceShort:mov edx, ecx
sub edx, 8
not edx
C2: movd eax, mm2
mov cl, 4
@@: test al, al
stosb
jz L2
shr eax, 8
loop @B
psrlq mm2, 32
jmp C2
L2: movq [esi], mm3
L4: lea esi, [esi+8]
add edi, edx
xchg edi, esi
mov ebx, edi
mov dl, 3
@@: test esi, edx
jz L5
lodsb
stosb
test al, al
jnz @B
jmp L1
L5: inc edx
C4: mov ecx, edx
mov eax, [esi]
@@: stosb
test al, al
jz L1
shr eax, 8
dec ecx
jnz @B
add esi, edx
jmp C4
SWAP$ endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
;==========================================================================
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
SwapBS proc psz1:DWORD, psz2:DWORD
push esi
push edi
mov esi, [esp+12]
mov edi, [esp+16]
xor ecx, ecx
@@:
movzx eax, BYTE PTR [esi+ecx]
movzx edx, BYTE PTR [edi+ecx]
mov [edi+ecx], al
mov [esi+ecx], dl
inc ecx
test eax, eax
jnz @B
pop edi
pop esi
ret 8
SwapBS endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
;==========================================================================
start:
;==========================================================================
print ADDR strx,13,10
print ADDR stry,13,10
invoke SWAP$, ADDR strx, ADDR stry
print ADDR strx,13,10
print ADDR stry,13,10
invoke SwapBS, ADDR strx, ADDR stry
print ADDR strx,13,10
print ADDR stry,13,10,13,10
invoke Sleep, 4000
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke SWAP$, ADDR strx, ADDR stry
counter_end
print str$(eax)," cycles, SWAP$",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke SwapBS, ADDR strx, ADDR stry
counter_end
print str$(eax)," cycles, SwapBS",13,10,13,10
inkey "Press any key to exit..."
exit
;==========================================================================
end start
Running on a P3:
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
yyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
289 cycles, SWAP$
139 cycles, SwapBS
Eliminating the emms reduced the cycle count to 279.
Celeron M:
268 bytes for SWAP$
36 bytes for SwapBS
31 bytes for SwapJ1$
28 bytes for SwapJ2$
24 bytes for SwapJ3$
208 cycles, SWAP$
96 cycles, SwapBS
96 cycles, SwapJ1$
146 cycles, SwapJ2$
218 cycles, SwapJ3$
You can do an in place swap string which handles variable length strings with 3 replace passes.
text1 : string with SHORT text and VERY_LONG_STRING text to be swapped over
Output
text2 : string with VERY_LONG_STRING text and SHORT text to be swapped over
The logic is simple. Replace 1st text with marker(*.*.*.*.*.*.*.*), replace second text with first, replace marker with second.
Quote
But seriously: Is there a real life application of shuffling two strings around, instead of swapping their pointers?
Just curious...
You wanna sort a string array with "Quick sort" for example. :bg
Quote from: Ficko on December 01, 2009, 09:53:46 AM
Quote
But seriously: Is there a real life application of shuffling two strings around, instead of swapping their pointers?
Just curious...
You wanna sort a string array with "Quick sort" for example. :bg
Wouldn't call that technique "Quick sort". Maybe "Snail sort" ... :wink
@ MichaelW
Quote
Your code does not correctly handle different length strings when the second is longer than the first, and the meaning of criteria 2 is not clear to me.....
Hmm ??? :red
Why you think that?
I did this "Endurance" test - excuse me I did it from Basic and not from JJ MASM-Basic but I am not that fit on that jet but making progress :P -
and no discrepancy came up. ::)
DEF S1,S2 :STRING
DEF A,B,Error :UINT
MakeConsole()
S1 = STRING$(255,"=") 'Fill up the strings space with bogus data to see that no extra moving is happen.
S2 = S1
FOR B = 0 TO 30
FOR A = 0 TO 30
S1 = STRING$(B,"X")
S2 = STRING$(A,"*")
SWAP$(S1,S2)
PRINT "A=",USING_("'%03u'",A),"=",USING_("'%03u'",LEN(S1))," ",S1
PRINT "B=",USING_("'%03u'",B),"=",USING_("'%03u'",LEN(S2))," ",S2
IF (A <> LEN(S1)) OR B <> LEN(S2) THEN Error = TRUE
NEXT A
IF Error = TRUE THEN
PRINT "ERROR !!!"
BREAKFOR
ENDIF
NEXT B
Ficko,
The open question is how to handle different string lengths in assembler. If you have a string A with 50 characters, and a string B with 20, then when moving A into B's memory space you get a buffer overflow. So you cannot simply swap the contents. What you could do is allocate a new string B with 50 chars, and re-alloc the new A to 20 chars; but these operations are very slow...
Or have we completely misunderstood something?
Quote
...If you have a string A with 50 characters, and a string B with 20, then when moving A into B's memory space you get a buffer overflow...
Ok I got the worry now. :wink
Yes the space of course have to be at least so large to accommodate the longest string.
Like in a string array.
The element size of the array would be the longest string size + the ending zero.
The algo turned out that complicated because I want to make it to average the highest speed of any possible variation.
Like one string can be empty and the other 2000 chars long it would be waste of time to switch 2x2000 bytes.
@Dave
Quote
you could use simd to move data that is properly aligned for that
on 4-aligned addresses, you could use dwords...
Yes I thought about that but I wasn't able to come up with a code that would not extremely slow down short strings.
So I just align on dwords if one of the string is much longer than the other.
:bg
Does the 3 pass replace start to sound attractive yet ? Just make the target buffer bigger by enough, slap bamm thankya maam and you get it word perfect in 3 passes, the first will be slower than the next 2. The techinque has the advantage that it will do any string pair sizes and any count of swaps.
QuoteSo I just align on dwords if one of the string is much longer than the other.
that was my solution for replacing scasb, as well - scasb isn't that slow, but STD is a dog - lol
QuoteDoes the 3 pass replace start to sound attractive yet ?
i think JJ has the right idea - swap pointers, instead
> i think JJ has the right idea - swap pointers, instead
Thats fine if they are in an array of pointers, what about in a text file ?
Even better, use a 1 pass hash replace with only the swap words loaded in the hash table. :bg
Quote from: jj... but these operations are very slow...
Geez jj, why are you so obsessed with speed? So, if those operations are not super fast, we can't use them? Bologna. I think allocating new strings like that would be the best way to do it. And they would be fine for general purpose procedures.