Thinking about string reversion, I came up with this chunk of code:
mov edx, p_str ; Pointer to first 4 bytes string - the one to reverse
mov ebx, p_str1 ; Pointer to second 4 bytes string - the one that holds the result
mov eax, [edx]
bswap eax
mov [ebx], eax
This is a special case, a 4 bytes string, but I was thinking about how many ways
I could do this string reversion using stack, single bytes move, and so on.
Does anybody knows any other way to do it? Or a better/faster way?
Thanks
Frank
Back with the 8086 I would have suggested XCHG against memory, but now that is hideously slow.
mov edx, p_str ; Pointer to first 8 bytes string - the one to reverse
mov ebx, p_str1 ; Pointer to second 8 bytes string - the one that holds the result
mov eax, [edx]
mov ecx, [edx+4]
bswap eax
bswap ecx
mov [ebx], ecx
mov [ebx+4], eax
A lot of optimizations would be workable if the string length was divisible by 4, watching for odd/even multiples. Others for 5 or 6 character strings.
Also do you want it to use separate in/out buffers, or swap in place?
Quote from: clive on May 20, 2010, 09:06:31 PM
Back with the 8086 I would have suggested XCHG against memory, but now that is hideously slow.
mov edx, p_str ; Pointer to first 8 bytes string - the one to reverse
mov ebx, p_str1 ; Pointer to second 8 bytes string - the one that holds the result
mov eax, [edx]
mov ecx, [edx+4]
bswap eax
bswap ecx
mov [ebx], ecx
mov [ebx+4], eax
A lot of optimizations would be workable if the string length was divisible by 4, watching for odd/even multiples. Others for 5 or 6 character strings.
Also do you want it to use separate in/out buffers, or swap in place?
That's correct indeed! :P
If the string lenght is divisible by 4 the speed of reversing should be quite
high. Well it doesn't matter if the string is the same or a buffer-string to do the conversion,
it can be easily modified.
I used the following code for testing 1000 conversion, this is why I used 2 variables,
just to retain the original value [what a silly reason :lol ]
! mov ecx, 1000
! mov edx, p_str
! mov ebx, p_str1
count_rev:
! mov eax, [edx]
! bswap eax
! mov [ebx], eax
! dec ecx
! jnz count_rev
By the way, what kind of approach would you use for 5-7 bytes, or not multiple of 4?
Using the stack would be slow? How would you use it in case?
Thanks Clive
Frank
Without much thought, I have this (ie not tested, could be broken)
mov edx, p_str ; Pointer to first 5 bytes string - the one to reverse
mov ebx, p_str1 ; Pointer to second 5 bytes string - the one that holds the result
mov eax, [edx]
mov cl, [edx+4]
bswap eax
mov [ebx], cl
mov [ebx+1], eax
mov edx, p_str ; Pointer to first 6 bytes string - the one to reverse
mov ebx, p_str1 ; Pointer to second 6 bytes string - the one that holds the result
mov eax, [edx]
mov cx, [edx+4]
xchg cl, ch
bswap eax
mov [ebx], cx
mov [ebx+2], eax
mov edx, p_str ; Pointer to first 7 bytes string - the one to reverse
mov ebx, p_str1 ; Pointer to second 7 bytes string - the one that holds the result
mov eax, [edx]
mov ecx, [edx+4]
bswap eax
bswap ecx
ror ecx,8
mov [ebx], ecx
mov [ebx+3], eax
For larger strings, I would assume I know the length, and then step backward 8 bytes at a time, and then handle the remainder.
Swapping in place, I would again handle as may 8 byte groups as possible, perhaps 4 bytes from each end until I met in the middle, and then code for the 1,2 or 3 byte remainder.
As far as I can understand these routines looks fine. :U
Well, time to sleep. :dazzled:
If somebody is awake and has any idea on how to exploit the full potential
of the stack for string reversing, he is welcome.
Frank
Not sure the stack helps, you'd have to copy the data back off.
Here I process 8 bytes at a time
.486
.MODEL FLAT, C
.CODE
align 16
SwapString PROC USES esi edi Src:PTR BYTE, Dst:PTR BYTE, Len:DWORD
mov esi,Src
mov edi,Dst
mov ecx,Len
mov byte ptr [edi+ecx],0 ; guarantee a NUL at the end of the string
sub ecx,8
jb swap_final
align 16
swap_loop:
mov eax,[esi]
mov edx,[esi+4]
bswap eax
bswap edx
add esi,8
mov [edi+ecx],edx
mov [edi+ecx+4],eax
sub ecx,8
jnb swap_loop
swap_final:
add ecx,8
jmp swap_tbl[ecx * 4]
align 4
swap_tbl:
dd offset swap_done
dd offset swap_1
dd offset swap_2
dd offset swap_3
dd offset swap_4
dd offset swap_5
dd offset swap_6
dd offset swap_7
align 16
swap_1:
mov al,[esi]
mov [edi+ecx-1],al
swap_done:
ret
align 16
swap_2:
mov ax,[esi]
xchg al,ah
mov [edi+ecx-2],ax
ret
align 16
swap_3:
mov eax,[esi]
bswap eax
shr eax,8
mov [edi+ecx-3],al
shr eax,8
mov [edi+ecx-2],ax
ret
align 16
swap_4:
mov eax,[esi]
bswap eax
mov [edi+ecx-4],eax
ret
align 16
swap_5:
mov eax,[esi]
mov dl,[esi+4]
bswap eax
mov [edi+ecx-5],dl
mov [edi+ecx-4],eax
ret
align 16
swap_6:
mov eax,[esi]
mov dx,[esi+4]
bswap eax
xchg dl,dh
mov [edi+ecx-6],dx
mov [edi+ecx-4],eax
ret
align 16
swap_7:
mov eax,[esi]
mov edx,[esi+4]
bswap eax
bswap edx
shr edx,8
mov [edi+ecx-7],edx
mov [edi+ecx-4],eax
ret
SwapString ENDP
END
Before going to dissect your last proposed routine, I'd like to
draw your attention to this section of your previous code:
Quote
mov edx, p_str ; Pointer to first 7 bytes string - the one to reverse
mov ebx, p_str1 ; Pointer to second 7 bytes string - the one that holds the result
mov eax, [edx]
mov ecx, [edx+4]
bswap eax
bswap ecx
ror ecx,8
mov [ebx], ecx
mov [ebx+3], eax
When you
Quote
mov ecx, [edx+4]
you get the last four bytes, 3 pertaining to the string and an extra one,
and with
Quote
bswap ecx
the extra byte becomes the leftmost byte inside the ecx register,
after you
Quote
ror ecx,8
and the extra byte moves 8 bit on the right,
but I was sure you would have done a
Quote
rol ecx,8
in order to get rid of the extra byte when you copy on the rightmost
byte the content of eax
Quote
mov [ebx+3], eax
unless I am misunderstanding something about how bytes move
inside and out memory/registers.
Can you please explain me what I've missed?
Thanks
Frank
pstr db 'A','B','C','D','E','F','G',0
mov edx, p_str ; Pointer to first 7 bytes string - the one to reverse
mov ebx, p_str1 ; Pointer to second 7 bytes string - the one that holds the result
mov eax, [edx]
; eax = D/C/B/A
mov ecx, [edx+4]
; ecx = 0/G/F/E
bswap eax
; eax = A/B/C/D
bswap ecx
; ecx = E/F/G/0
ror ecx,8
; ecx = 0/E/F/G
mov [ebx], ecx
; str1+0 = 'G'
; str1+1 = 'F'
; str1+2 = 'E'
; str1+3 = 0 ; overwriten next
mov [ebx+3], eax
; str1+3 = 'D'
; str1+4 = 'C'
; str1+5 = 'B'
; str1+6 = 'A'
So the bytes are written in this order inside the registers:
mov edx, p_str ; Pointer to first 7 bytes string - the one to reverse
mov ebx, p_str1 ; Pointer to second 7 bytes string - the one that holds the result
mov eax, [edx]
; eax = D/C/B/A
mov ecx, [edx+4]
; ecx = 0/G/F/E
I had the silly idea that it was the reverse for strings and this was valid only for
numbers :lol
When I'll be back from the week-end, I'll give it a try. :P
Thanks Clive for your patience :U
Frank
Something short and crispy?
include \masm32\include\masm32rt.inc
.data
TheString db "zyxwvutsrqponmlkjihgfedcba", 0
.code
start: push offset TheString
call RevString
print "abcdefghijklmnopqrstuvwxyz", 13, 10 ; for comparison
print offset TheString
getkey
exit
RevString proc
push esi
push edi
mov esi, [esp+4+8]
mov edi, esi
add edi, len(edi) ; you must know the end of the string
shr eax, 3 ; len div 8 - we use dwords and meet in the middle
je @F
.Repeat
sub edi, 4
mov ecx, [esi]
mov edx, [edi]
bswap ecx
bswap edx
mov [edi], ecx
mov [esi], edx
add esi, 4
dec eax
.Until Zero?
@@: dec edi
cmp edi, esi
jbe @F
movzx eax, byte ptr [esi]
movzx edx, byte ptr [edi]
mov [esi], dl
mov [edi], al
inc esi
jmp @B
@@:
pop edi
pop esi
ret 4
RevString endp
end start
Frank,
Have a look at the masm32 library source file szRev.asm. It swaps pairs from opposite ends until it reaches the middle. It seems to run OK.
Celeron M:
217 cycles for RevString
.sogla gnitset sa hcus ,sesoprup fo yteirav a rof sevres taht ,gnol sretcarahc 001 ,gnirts a si sihT
377 cycles for szRev
This is a string, 100 characters long, that serves for a variety of purposes, such as testing algos.
:thumbu
a little modification on Jochen's code :P
i don't know if it'll make a big difference - lol
what is slowing us down is mis-aligned dword accesses - there's probably no good way around that
;-----------------------------------------------------------------------------------------------
RevStr proc
push edi
push esi
mov edi,[esp+4+8]
mov esi,edi
add edi,len(edi) ; you must know the end of the string
shr eax,3 ; len div 8 - we use dwords and meet in the middle
jz RevSt2
RevSt0: sub edi,4
mov ecx,[esi]
mov edx,[edi]
bswap ecx
bswap edx
mov [edi],ecx
mov [esi],edx
dec eax
lea esi,[esi+4]
jnz RevSt0
jmp short RevSt2
RevSt1: movzx edx,byte ptr [edi]
movzx eax,byte ptr [esi]
mov [esi],dl
mov [edi],al
inc esi
RevSt2: dec edi
cmp edi,esi
ja RevSt1
pop esi
pop edi
ret 4
RevStr endp
;-----------------------------------------------------------------------------------------------
oops
Quote from: dedndave on May 21, 2010, 06:09:45 PM
oops
? ::)
223 cycles for RevString
..sogla ym gnitset sa hcus ,s
207 cycles for RevString2
This is a string, 100 charact
224 cycles for RevStr
..sogla ym gnitset sa hcus ,s
380 cycles for szRev
This is a string, 100 charact
Sizes:
64 RevString
64 RevString2
65 RevStr
72 szRev
I guess
RevString2 will make it into the MasmBasic library :bg
By the way: The Masm32 lib szRev triggers an exception for string lens below 2, so you better check for the len before using it.
Atom
516 cycles for RevString
427 cycles for RevString2
419 cycles for RevStr
1663 cycles for szRev
Sizes:
64 RevString
64 RevString2
65 RevStr
72 szRev
A factor 4... that's quite a big difference. I wonder what the Atom doesn't like in szRev's inner loop:
@@:
mov cl, [esi+eax] ; load end pairs
mov dl, [edi]
mov [esi+eax], dl ; swap end pairs
mov [edi], cl
sub edi, 1
add eax, 1
jnz @B ; exit on half length
RevStr runs fastest on the prescott dual core, as well
hard to know why only the celeron M runs the new one faster :red
337 cycles for RevString
..sogla ym gnitset sa hcus ,sesoprup fo yteirav a sevres taht ,gnol sretcarahc 0
01 ,gnirts a si sihT
474 cycles for RevString2
This is a string, 100 characters long, that serves a variety of purposes, such a
s testing my algos..
328 cycles for RevStr
..sogla ym gnitset sa hcus ,sesoprup fo yteirav a sevres taht ,gnol sretcarahc 0
01 ,gnirts a si sihT
600 cycles for szRev
This is a string, 100 characters long, that serves a variety of purposes, such a
s testing my algos..
at any rate, the biggest change i made was a simple improvement in loop structure
if your loop ends with a JMP instruction, it is usually a sign that something can be done better
in this example, we check the loop control (count) prior to executing the body of the loop
this leaves us with the ugly tell-tale sign of a JMP instruction at the end
;loop fall-through entry begins here
Loop01: cmp ecx,0 ;used for example only
jz Loop03
Loop02:
;code representing the body of the loop goes here
jmp Loop01
;process continues after completion
Loop03:
by placing the control test at the end, and using JMP (usually SHORT) at entry,
we can avoid executing that JMP instruction with each pass of the loop
;loop fall-through entry begins here
jmp short Loop02
Loop01:
;code representing the body of the loop goes here
Loop02: cmp ecx,0 ;used for example only
jnz Loop01
;process continues after completion
Loop03:
notice, there is no difference in total size
we have simply moved the JMP from inside the loop to outside the loop
Dave,
What are your timings? What happens if you put if 0 in line 109? It could be the lodsd...
it's a little better...
339 cycles for RevString
..sogla ym gnitset sa hcus ,sesoprup fo yteirav a sevres taht ,gnol sretcarahc 0
01 ,gnirts a si sihT
396 cycles for RevString2
This is a string, 100 characters long, that serves a variety of purposes, such a
s testing my algos..
333 cycles for RevStr
..sogla ym gnitset sa hcus ,sesoprup fo yteirav a sevres taht ,gnol sretcarahc 0
01 ,gnirts a si sihT
599 cycles for szRev
This is a string, 100 characters long, that serves a variety of purposes, such a
s testing my algos..
i liked the first code - simple and easy to understand - lol
(except for the individual-byte loop structure, as mentioned above)
Here is a quick twiddle of the test piece. I removed some of the stuff from szRev so it was doing that same amount of work and the others and it dropped in timings about 40%. Misaligned the string by 3 bytes and all of the DWORD versions fell over timings wise like normal.
I wrote it as byte reads and writes because its alignment insensitive.
291 cycles for RevString
..sogla ym gnitset sa hcus ,sesoprup fo yteirav a sevres taht ,gnol sretcarahc 0
01 ,gnirts a si sihT
283 cycles for RevString2
This is a string, 100 characters long, that serves a variety of purposes, such a
s testing my algos..
286 cycles for RevStr
..sogla ym gnitset sa hcus ,sesoprup fo yteirav a sevres taht ,gnol sretcarahc 0
01 ,gnirts a si sihT
225 cycles for szRev
his is a string, 100 characters long, that serves a variety of purposes, such as
testing my algos..T
Sizes:
64 RevString
64 RevString2
65 RevStr
62 szRev
--- ok ---
i like the idea, Hutch, but i think there is a little bug in there
what processor were those times on ?
prescott dual core 3 GHz
442 cycles for RevString
..sogla ym gnitset sa hcus ,sesoprup fo yteirav a sevres taht ,gnol sretcarahc 0
01 ,gnirts a si sihT
579 cycles for RevString2
This is a string, 100 characters long, that serves a variety of purposes, such a
s testing my algos..
442 cycles for RevStr
..sogla ym gnitset sa hcus ,sesoprup fo yteirav a sevres taht ,gnol sretcarahc 0
01 ,gnirts a si sihT
418 cycles for szRev
his is a string, 100 characters long, that serves a variety of purposes, such as
testing my algos..T
notice the string at the end :red
i think you have to increment the length or something
Bug corrected. Celeron M:
314 cycles for RevString
..sogla ym gnitset sa hcus ,
249 cycles for RevString2
This is a string, 100 charac
313 cycles for RevStr
..sogla ym gnitset sa hcus ,
263 cycles for szRev
This is a string, 100 charac
Sizes:
64 RevString
64 RevString2
65 RevStr
59 szRev
RevString2 is a bit faster now because I shamelessly copied a line from Hutch.... :bg
What is still missing is
invoke StrLen, esi
cmp eax, 1
jbe TooShort
Otherwise szRev chokes on one-byte or null strings.
prescott
430 cycles for RevString
..sogla ym gnitset sa hcus ,sesoprup fo yteirav a sevres taht ,gnol sretcarahc 0
01 ,gnirts a si sihT
439 cycles for RevString2
This is a string, 100 characters long, that serves a variety of purposes, such a
s testing my algos..
428 cycles for RevStr
..sogla ym gnitset sa hcus ,sesoprup fo yteirav a sevres taht ,gnol sretcarahc 0
01 ,gnirts a si sihT
404 cycles for szRev
This is a string, 100 characters long, that serves a variety of purposes, such a
s testing my algos..
If it actually mattered which I am not convinced about, using the Agner Fog style StrLen unrolled by 4 locally would see some gains in speed but it would need to be corrected for the leading alignment first.
I didn't fix the 1 byte error as I was testing the idea, it would need both the counter corrected and the test for 1 or 0 length.
Quote from: dedndave on May 22, 2010, 04:53:17 PM
prescott
Where is lingo when I need him? A little rant on legacy CPUs??
Wait for my pshufb version. It may take a while though until I have enough bucks for a brand new SSE3 puter :bg
oh - we are calling this a legacy CPU, already ?
:lol
that was quick - seems like i just bought the damn thing
Quote from: hutch-- on May 22, 2010, 04:57:09 PM
using the Agner Fog style StrLen unrolled by 4 locally would see some gains in speed
That was actually the line I copied from you:
if newversion_shamelessly_copied_from_Hutch
invoke StrLen, esi
lea edi, [esi+eax-4] ; you must know the end of the string
else
lea edi, [esi+len(esi)-4] ; szLen is horribly slow
endif
I use a faster SSE2 strlen in MasmBasic, but it would be a bit of an overkill here.
i would think a quick SCASB to find the null should work as fast as anything
with a little adjustment, that could leave EDI pointing at the end of the string, too
what do i know - lol
Quote from: dedndave on May 22, 2010, 05:24:16 PM
i would think a quick SCASB to find the null should work as fast as anything
with a little adjustment, that could leave EDI pointing at the end of the string, too
what do i know - lol
repne scasb is horribly slow, I am afraid :(
i was looking at Hutch's proc
i noticed he had the capability to specify a seperate output buffer
but, combining the 2 capabilities isn't as simple as one might think :P
probably best to just do in-place reversal
if they want a copy - let them make it before the call (or call it again - lol)
This is what it looks like without the extra code for the alternate buffer. It is faster but it needs to have to Strlen algo unrolled directly into it to try aned pick up a bit more pace. Either aligning the label or unrollinng it makes it slower. I fixed the 1st character being chopped off, (removed 1 SUB) but it still does not test for a character length of 1 or null.
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
szRev1 proc src:DWORD
push esi
push edi
mov esi, [esp+4][8] ; src
mov edi, esi
invoke StrLen,esi
sub eax, 1
add edi, eax
shr eax, 1 ; int divide length by 2
neg eax ; invert sign
sub esi, eax ; sub half len from ESI
@@:
movzx ecx, BYTE PTR [esi+eax] ; load end pairs
movzx edx, BYTE PTR [edi]
mov [esi+eax], dl ; swap end pairs
mov [edi], cl
sub edi, 1
add eax, 1
jnz @B ; exit on half length
pop edi
pop esi
ret 4
szRev1 endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
i don't understand why it is better to use register-index addressing rather than using eax for the count
@@:
movzx ecx, BYTE PTR [esi] ; load end pairs
movzx edx, BYTE PTR [edi]
mov [edi], cl ; swap end pairs
mov [esi], dl
sub edi, 1
add esi, 1
sub eax, 1
jnz @B ; exit on half length
sinsi:
push [esp+4]
call StrLen
mov ecx,[esp+4]
lea edx,[ecx+eax]
@@: cmp edx,ecx
jbe @f
mov al,[ecx]
mov ah,[edx-1]
mov [ecx],ah
mov [edx-1],al
add ecx,1
sub edx,1
jmp @b
@@: ret 4
Any length, even 0, but chokes on a 1000 character string.
Same time as hutch's latest algo on my q6600.
QuoteAny length, even 0, but chokes on a 1000 character string.
what do you mean by "chokes" ?????
try not to be so technical :P i don't get it
chokes -> is a bit slower than the others
ahhhh - ok
that is strange behaviour - but at least it keeps ticking
i was just looking at the StrLen procedure
trying to make sense of it
i don't know where Hutch came up with some of this stuff :bg
i think, in the real world, this is more likely to be used for shorter strings
Dave,
the calculations and negating EAX before the loop was designed to remove the extra ADD that you had to put in the loop. Somewhere around the PIII era older hardware did not like more complex addressing but the PIV generation and later don't seem to have a problem with it.
The real problem with an opposite end swapper is cache distance, over a certain source length you break the cache which would impinge on its speed very badly. If you had to reverse strings where the ends were larger than the cache, you would do it as buffer copy so you at least got linear read then linear write.
You cam blame Agner Fog for the StrLen algo, its a classic that still has the legs over most others and its 15 years old.
well - that's good to know
i have tried to avoid such complex addressing, unless the situation requires it - like multidimensional arrays or something
as for StrLen - i am not saying anything bad about it - it is obviously fast
i am just trying to make it work in my head - lol
Quote from: dedndave on May 23, 2010, 05:17:47 AM
as for StrLen - i am not saying anything bad about it - it is obviously fast
It is pretty fast for a non-SSE2 algo, although I vaguely remember we had a faster one already.
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
Testing string len for a 100 character string, aligned
132 cycles for Masm32 len
27 cycles for MasmBasic Len
102 cycles for StrLen
Testing string len for a 100 character string, unaligned
133 cycles for Masm32 len
27 cycles for MasmBasic Len
97 cycles for StrLen
JJ,
Can't buld it, missing file.
Quote from: hutch-- on May 23, 2010, 07:59:45 AM
JJ,
Can't buld it, missing file.
Hutch,
Either disable the MasmBasic stuff (line 2, 32, 60) or install the library from http://www.masm32.com/board/index.php?topic=12460
Anyway, StrLen is pretty fast, even faster than crt_strlen. Is there a good reason why you still use szLen for the len() macro? There is one risk in exchanging them: len() aka szLen does not alter ecx and edx, StrLen does. That might break some code, but 2 push/pop pairs cost about 2 cycles only.
JJ,
With szLen() its a point of diminishing returns for where it is normally used as a macro. StrLen is the aligned "go fast" version, szLen is indifferent to alignment and easily fast enough in common usages. The type of usage is so common if I can't find the lib in a hurry I write a simple loop to do the job. If you have a task that is (1) aligned correctly and (2) big enough a genuinely fast sse version makes sense but the application for it would be very rare.
I will test the last algo with the lines removed, I cannot interfere with the setup adding extra stuff into it.
PS: It still won't build with the lines commented out. Too much of the example depends on code not present.
Quote from: hutch-- on May 23, 2010, 10:28:48 AM
If you have a task that is (1) aligned correctly and (2) big enough a genuinely fast sse version makes sense
Hutch,
SSE2 also works for small unaligned strings, see below. Typical string len for an asm source is around 60 bytes. The only valid argument not to use it is pre-SS2 hardware.
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz (SSE3)
Testing string len for a 100 character string, aligned
138 cycles for Masm32 len
119 cycles for crt_strlen
32 cycles for MasmBasic Len
108 cycles for StrLen
Testing string len for a 100 character string, unaligned
136 cycles for Masm32 len
120 cycles for crt_strlen
27 cycles for MasmBasic Len
94 cycles for StrLen
Testing string len for a 30 character string, aligned
52 cycles for Masm32 len
41 cycles for crt_strlen
16 cycles for MasmBasic Len
35 cycles for StrLen
Testing string len for a 30 character string, unaligned
45 cycles for Masm32 len
41 cycles for crt_strlen
14 cycles for MasmBasic Len
31 cycles for StrLen
> PS: It still won't build with the lines commented out. Too much of the example depends on code not present.
See line 2: useMB = 0
I used the pgm JJ posted and in my core duo I get:
Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz (SSE4)
Testing string len for a 100 character string, aligned
104 cycles for Masm32 len
80 cycles for crt_strlen
16 cycles for MasmBasic Len
82 cycles for StrLen
Testing string len for a 100 character string, unaligned
105 cycles for Masm32 len
80 cycles for crt_strlen
17 cycles for MasmBasic Len
67 cycles for StrLen
Testing string len for a 30 character string, aligned
34 cycles for Masm32 len
26 cycles for crt_strlen
9 cycles for MasmBasic Len
23 cycles for StrLen
Testing string len for a 30 character string, unaligned
34 cycles for Masm32 len
25 cycles for crt_strlen
10 cycles for MasmBasic Len
26 cycles for StrLen
I am just back from the week-end, no time to see anything so far,
but a lot of experimentation has been done, I guess. Very well done friends!!! :P
Let's see if tomorrow I have time to print and see what has happened.
Almost sure I'll understand a small portion of the whole. :lol
Thanks everybody for entering the game. :U
Frank
JJ,
This is why I don't care all that much in this context. The following example runs at about 2.2 gigabyte a second on my Core2 Quad. Yes there are far faster algos to get string lengths but a tiny one does the job in almost every usable instance.
Timing
1048576000
1 gigabyte in 469 milliseconds
Press any key to continue ...
Example
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
include \masm32\include\masm32rt.inc
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
comment * -----------------------------------------------------
Build this template with
"CONSOLE ASSEMBLE AND LINK"
----------------------------------------------------- *
slen PROTO :DWORD
.code
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
call main
inkey
exit
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
main proc
LOCAL pMem :DWORD
mov pMem, alloc(1024*1024*1001) ; 1001 meg
invoke memfill,pMem,1024*1024*1000,"xxxx"
invoke GetTickCount
push eax
print str$(rv(slen,pMem)),13,10,"1 gigabyte in "
invoke GetTickCount
pop ecx
sub eax, ecx
print str$(eax)," milliseconds",13,10
free pMem
ret
main endp
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
slen proc pstr:DWORD
mov ecx, [esp+4]
mov eax, ecx
sub eax, 1
@@:
add eax, 1
cmp BYTE PTR [eax], 0
jne @B
sub eax, ecx
ret 4
slen endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start
Old stuff again (http://www.masm32.com/board/index.php?topic=2869.15).. :wink
64 cycles for RevLingo
..sogla ym gnitset sa hcus ,sesoprup fo yteirav a sevres taht ,gnol sretcarahc 0
01 ,gnirts a si sihT
161 cycles for RevString
This is a string, 100 characters long, that serves a variety of purposes, such a
s testing my algos..
174 cycles for RevString2
..sogla ym gnitset sa hcus ,sesoprup fo yteirav a sevres taht ,gnol sretcarahc 0
01 ,gnirts a si sihT
162 cycles for RevStr
This is a string, 100 characters long, that serves a variety of purposes, such a
s testing my algos..
216 cycles for szRev
.sogla ym gnitset sa hcus ,sesoprup fo yteirav a sevres taht ,gnol sretcarahc 00
1 ,gnirts a si sihT.
Sizes:
64 RevString
64 RevString2
65 RevStr
62 szRev
121 RevLingo
--- ok ---
:lol :lol :lol :lol :lol :lol :lol :lol :lol :lol :lol :lol :lol :lol
I have to confess that...
I'd like some Algos explanation. Just reading the CODE could take
half of my holydays to figure it out, and not sure about that.
After all I'm just an old slow assembly beginner student :P
What is the logic behind the routines? Why should they be faster or slower?
What kind of Assembly gems are these? What are their strong points?
:lol :lol :lol :lol :lol :lol :lol :lol :lol :lol :lol :lol :lol :lol
I'll try to understand them anyway, but a little help is always welcome :U
Frank
Frank,
I have addresses two very simple algo designs, the reverse string algo scans a string to get its length first, then it swaps opposite end pairs until it gets to the middle.
123456789 ; original
923456781 ; 1st swap
983456721 ; 2nd swap
987456321 ; 3rd pair
987654321 ; reversed string
Quote from: hutch-- on May 24, 2010, 12:12:09 AM
Frank,
I have addresses two very simple algo designs, the reverse string algo scans a string to get its length first, then it swaps opposite end pairs until it gets to the middle.
123456789 ; original
923456781 ; 1st swap
983456721 ; 2nd swap
987456321 ; 3rd pair
987654321 ; reversed string
Thanks Hutch, this is the kind of help I need to direct my eyes
into the routines :U