Here's my attempt at a quick string reversal.
str1 db 'abcdefghijklmnopqrstuvwxyzabc',0
str2 db 64 dup(0)
start:
lea esi,str1
lea edi,str2
mov ecx,[sizeof str1-1]
mov ebx,ecx
cmp ecx,4
jb l1
and ebx,3 ; test for alignment
jz l2
cmp ebx,3
jnz @f
sub ecx,1
mov al,[esi+ecx]
sub ebx,1
mov [edi],al
add edi,1
@@: cmp ebx,2
jnz @f
sub ecx,1
mov al,[esi+ecx]
sub ebx,1
mov [edi],al
add edi,1
@@: cmp ebx,1
jnz l1
sub ecx,1
mov al,[esi+ecx]
mov [edi],al
add edi,1
jmp l2
l1: sub ecx,1 ; string is less than 4 characters
mov al,[esi+ecx]
mov [edi],al
add edi,1
or ecx,ecx
jz l3
jmp l1
l2: sub ecx,4
jnz @f
mov eax,[esi+ecx]
bswap eax
mov [edi],eax
jmp l3
@@: mov eax,[esi+ecx] ; 4 byte aligned string
bswap eax
mov [edi],eax
add edi,4
sub ecx,4
jnz @b
mov eax,[esi]
bswap eax
mov [edi],eax
l3: ; result in str2
This looks like a job fer BSWAP. Something like
.data
inputstring db "abcdef"
inputstring_ equ $-offset teststring ;or, say, inputstring_ dd $-inputstring
outputbuff db 10h dup (?)
.code
reverse_string:
mov esi,offset inputstring
mov ecx,inputstring_
mov edi,offset outputbuff
lea edi,[edi+ecx-1]
and ecx,3
jz zero_mod_4
@@: lodsb
std
stosb
cld
loop @B
zero_mod_4:
sub edi,3
mov ecx,inputstring_
shr ecx,2
jz no_whole_dwords
@@: lodsd
bswap_eax
std
stosd
cld
loop @B
no_whole_dwords: ;done
here the one i use :
ALIGN 4
ReverseString PROC StrAddr:DWORD
push edx
push esi
push edi
mov esi,StrAddr
mov edi,esi
sub edi,4
Label1: add edi,4
mov eax,DWORD PTR [edi]
lea edx,[eax-01010101h]
xor eax,edx
and eax,80808080h
jz Label1
and eax,edx
jz Label1
bsf edx,eax
sub edx,4
shr edx,3
add edi,edx
mov eax,edi
sub eax,esi
Label3: sub edi,1
mov dh,BYTE PTR [edi]
mov dl,BYTE PTR [esi]
mov BYTE PTR [esi],dh
mov BYTE PTR [edi],dl
add esi,1
cmp esi,edi
jb Label3
Label4: pop edi
pop esi
pop edx
ret
ReverseString ENDP
or this one, it's faster but the string have to be 4 bytes long mini :red :
ALIGN 4
FastReverseString PROC StrAddr:DWORD
push edx
push esi
push edi
mov esi,StrAddr
mov edi,esi
sub edi,4
Label1: add edi,4
mov eax,DWORD PTR [edi]
lea edx,[eax-01010101h]
xor eax,edx
and eax,80808080h
jz Label1
and eax,edx
jz Label1
bsf edx,eax
sub edx,4
shr edx,3
add edi,edx
mov eax,edi
sub eax,esi
Label3: sub edi,4
mov edx,DWORD PTR [edi]
mov ecx,DWORD PTR [esi]
bswap ecx
bswap edx
mov DWORD PTR [esi],edx
mov DWORD PTR [edi],ecx
add esi,4
cmp esi,edi
jb Label3
Label4: pop edi
pop esi
pop edx
ret
FastReverseString ENDP
To the Ineffable All,
Here is my contribution to a reverse byte string program. Eight bytes get transferred with every iteration through the loop. Works for all string lengths too. It does not try to figure out the string length as there are many programs to do that. Why duplicate? By the way, the FastReverseString routine gives false results on string 'ABCDEFGHI' . Ratch
;*******************************************************************************
; REVBSTR: Reverses the bytes of an ASCII string. *
; *
; Called by: PUSH <address of string> *
; PUSH <string length> *
; CALL REVBSTR *
; *
; Returns: Nothing *
; *
; Notes: This subroutine conforms with WIN32 conventions regarding *
; register preservation and stack balancing. *
; *
; Coder: Ratch *
;*******************************************************************************
REVBS$ STRUC
RETURN DWORD ?
RV1$ = $
STRLEN DWORD ?
SADR DWORD ?
RV2$ = $
REVBS$ ENDS
RVBS$ EQU ESP.REVBS$ ;save some typing
REVBSTR: ;it all begins here
PUSH ESI ;save register
PUSH EDI ;save register
MOV ESI,[RVBS$.SADR+2*DWORD] ;string start address
MOV EAX,[RVBS$.STRLEN+2*DWORD] ;string length
MOV EDI,ESI ;string start address
PUSH EBX ;save register
ADD ESI,EAX ;string end address
SHRD EDX,EAX,3 ;1st part of MOD operation
SUB ESI,DWORD ;back up string end address one DWORD
SHR EAX,3 ;EAX = (string length) divided by 8
SHR EDX,32-3 ;2nd part of MOD operation EDX = (string length) MOD 8
.IF EDX > 4 ;modify loop counter and remainder
INC EAX ;increment loop counter for below
XOR EDX,EDX ;set remainder to zero for below
.ENDIF
.WHILE EAX ;8 bytes moved each time through loop
MOV ECX,[ESI] ;pick up 4 bytes
MOV EBX,[EDI] ;pick up 4 bytes
BSWAP ECX
BSWAP EBX
MOV [EDI],ECX ;put down 4 bytes
MOV [ESI],EBX ;put down 4 bytes
SUB ESI,DWORD ;move string end pointer
ADD EDI,DWORD ;move string beginning pointer
DEC EAX ;loop counter
.ENDW
.IF EDX ;move remaining 3,2, or 1 bytes
MOV EBX,[EDI] ;pick'em up
BSWAP EBX ;swap'em around
.IF EDX == 3 ;EDX = (string length) MOD 8
ROR EBX,1*8 ;shift around
.ELSEIF EDX == 2
XOR BH,BL ;\
XOR BL,BH ; > these three XORs swap the values of BL & BH (XCHG)
XOR BH,BL ;/
ROR EBX,2*8 ;shift around
.ELSEIF EDX == 1
BSWAP EBX ;swap'em around
.ENDIF
MOV [EDI],EBX ;put 'em down
.ENDIF
POP ESI ;restore register
POP EDI ;restore register
POP EBX ;restore register
RET RV2$-RV1$ ;return home
Thanks for the responses! :U
Did yall time the different methods? Using my alphabet+abc test string and a P4 I got 47 cycles for the one I posted and 88 for Ratch's. The others were slower. I started out using string instructions but they seemed to slow things down. I had also tried the direction flag bit which also seemed slow.
Quote from: Ratch on October 03, 2005, 07:53:39 PM
the FastReverseString routine gives false results on string 'ABCDEFGHI' . Ratch
ok, here the correction (i've tested different size this time :red ), it's just a mix of my previous code :
ALIGN 4
FastReverseString PROC StrAddr:DWORD
push ecx
push edx
push esi
push edi
; finding final 0
mov esi,StrAddr
mov edi,esi
sub edi,4
Label1: add edi,4
mov eax,DWORD PTR [edi]
lea edx,[eax-01010101h]
xor eax,edx
and eax,80808080h
jz Label1
and eax,edx
jz Label1
bsf edx,eax
sub edx,4
shr edx,3
add edi,edx
mov eax,edi
sub eax,esi
; something to reverse ?
cmp eax,2
jb Label4
; big or small string ?
cmp eax,9
jb Label3
; treatment for the big one
Label2: sub edi,4
mov edx,DWORD PTR [edi]
mov ecx,DWORD PTR [esi]
bswap ecx
bswap edx
mov DWORD PTR [esi],edx
mov DWORD PTR [edi],ecx
add esi,4
cmp esi,[edi+8]
jb Label2
; treatment for the small one
Label3: sub edi,1
mov dh,BYTE PTR [edi]
mov dl,BYTE PTR [esi]
mov BYTE PTR [esi],dh
mov BYTE PTR [edi],dl
add esi,1
cmp esi,edi
jb Label3
Label4:
pop edi
pop esi
pop edx
pop ecx
ret
FastReverseString ENDP
Quote from: Ratch on October 03, 2005, 07:53:39 PM
It does not try to figure out the string length as there are many programs to do that. Why duplicate?
lol, maybe because i don't want to see an extra line to get lenght, and another one to store the result... :bg
NightWare,
Your routine still needs work. Did you test it? I tried it with 'ABCDEFGHI' and it did NOT reverse the characters. Also it cycled through the dword loop 9 times when it only needs to cycle once for this string.
You do not need to save EDX because that is not a essential Windows register. A little documentation would also be helpful.
Quote
lol, maybe because i don't want to see an extra line to get lenght, and another one to store the result...
Ok, but you have lots of lines within the subroutine that duplicate the code of a STRLEN program. Ratch
lostcauz,
The string instructions are notorious for being slow. Timing can be tricky, especially with cache effects. What you think you see is not always what is so. You should make your routine into a subroutine so I can test it. That means storing the essential Windows registers, clearing the parameters from the stack when finished, and putting a RET instruction at the end. Some documentation would help too. Ratch
Hi Ratch,
You are correct, after adding the additional overhead yours is slightly quicker. :U Thanks for your input. I commented the attached code.
[attachment deleted by admin]
:lol
StrRev:
mov ecx, sizeof str1-1
xor eax,eax
cmp ecx, 4
jl Bytes3
push ebx
@@:
mov edx, dword ptr [str1+eax]
add eax, 4
add ecx, -8
mov ebx, dword ptr [str1+eax+ecx]
bswap edx
mov dword ptr [str2+eax +ecx], edx
bswap ebx
mov dword ptr [str2+eax-4], ebx
jg @b
pop ebx
ret
Bytes3:
movzx edx, byte ptr [str1]
cmp ecx, 1
mov byte ptr [str2+ecx-1], dl
jle End3
mov dl, byte ptr [str1+1]
cmp ecx, 2
mov byte ptr [str2+ecx-2], dl
jle End3
mov dl, byte ptr [str1+2]
mov byte ptr [str2+ecx-3], dl
End3:
ret
Regards,
Lingo
Hi lingo,
Very nice! I attached the code (from MichaelW I think) I used for timing. Is this the accepted method used for speed comparisons?
[attachment deleted by admin]
This is my piece of code. Bit old though, but still works :lol
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
; usage strrev(buffer, buffer2);
; reverts a string from buffer2 and places it into the buffer
strrev PROC uses esi edi lpBuffer:DWORD, lpSource:DWORD
mov edi, lpBuffer
mov esi, lpSource
invoke strlen, edi
dec eax
mov ecx, -1
@@:
inc ecx
push eax
sub eax, ecx
mov al, byte ptr [edi+eax]
mov byte ptr [esi+ecx], al
pop eax
cmp eax, ecx
je @F
jmp @B
@@:
ret
strrev ENDP
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
Good luck! :U
lingo,
Back to the coding desk for your subroutine. I tested your code, and it returned a string of HGFEDCBA given ABCDEFGHI. In fact, it truncated a character off every string submitted. It would be more useful if it used the same buffer for both output and input, but if it outputs to a different buffer, it should tack on a zero byte at the end. Code also needs indentation, documentation, and inputing parameters from the stack or registers. Ratch
lostcauz,
Before you do timing tests, code should be tested for correctness. That comes first. Strange that your code truncates the last string character just like lingo's did. Also, you need to write a zero byte at the end of the string. Don't depend on the buffer to be initialially zeroed out. You still need to save the essential Window registers that you used, and either put the input parameters on the stack or define them in a register. You have a good start on documentation, now try some indentation to highlight the loops and conditional clauses. Ratch
lostcauz :lol
Here is my times (P4 3.6GHz and WinXP SP2):
Please terminate any high-priority tasks and press ENTER to begin.
ReverseStr Tests:
ReverseStr - lostcauz : 35 clocks
ReverseStr - lingo : 26 clocks
Press ENTER to exit...
[attachment deleted by admin]
BoR0,
Quote
This is my piece of code. Bit old though, but still works
Not the code you submitted. I had to correct the loading of ESI and EDI. The source and destination were "reversed". After making the corrections, it performed correctly in all cases. I don't like the way it calls a string lenth subroutine internally, however. I think the string length should be parameterized, so manual editing is not necessary to select a string length subroutine. Since it transfers a byte at a time, it is probably slower that the dword movers. And it would be better if it did not need a second buffer. Neverless, it works! Ratch
lingo,
You're spinning your wheels. Both lostcauz32 and Lingo32 do NOT reverse the 'ABCDEFGHI' string correctly. They should be fixed before timing is done. Ratch
Hi Ratch,
I tested my method on every combination from string length of 1 to buffer size. Absolutely no problems, including the string you mentioned. Of course I test for correctness first and thought I was past that point. Did the zip I included work correctly for you? Or the original?
lostcauz,
OK, now that I know what I am looking for, I see that you and lingo compensate for the zero byte by a str1-1 term in SIZEOF. This causes of string in my test to be truncated by one byte, because I add the zero byte to the string by a data statement on another line. Therefore SIZEOF does not read that zero byte. So both you and lingo's subroutines DO work correctly.
Now about timing. All of our programs basically use the same algorithm. That is, pick up a DWORD or BYTE from one end of the string, BSWAP if it is a DWORD, and put it down on the other end of the string. There is not much room for improvement in that loop where it really counts, if we use that algo. All the other code is for setup and post loop follow up, which only gets executed once. So unless you come up with a new kick-ass algorithm, all of our code is going to time about the same, with the variations explained by cache, CPU load, CPU brand, memory, and who knows whatever else. You should see a difference for a BYTE move vs a DWORD move, but that is about it. So I guess the next challenge is to come up with a better algo and more features. Like a single buffer maybe? Ratch
Quote from: Ratch on October 04, 2005, 01:38:55 AM
NightWare,
Your routine still needs work. Did you test it? I tried it with 'ABCDEFGHI' and it did NOT reverse the characters.
? i don't understand... i've made test again, and it seams it works fine...
Quote from: Ratch on October 04, 2005, 01:38:55 AM
Also it cycled through the dword loop 9 times when it only needs to cycle once for this string.
? i'd like to know how it's possible...
and my code use a single buffer already...
NightWare,
Look 8 instructions past label2 in the code you submitted. You should find a instruction cmp esi,[edi+8] . Both ESI and EDI have source and destination addresses respectively. You are comparing an address in ESI with the CONTENTS of memory [EDI+8] . I can't see where that makes any sense. I checked it with a debugger, and that is what is happening. Maybe you want the comparison to be with the VALUE of EDI+8. Are you sure you are using the same code you posted? Better check. Ratch
Quote from: Ratch on October 05, 2005, 01:34:07 AM
NightWare,
Look 8 instructions past label2 in the code you submitted. You should find a instruction cmp esi,[edi+8] . Both ESI and EDI have source and destination addresses respectively. You are comparing an address in ESI with the CONTENTS of memory [EDI+8] . I can't see where that makes any sense. I checked it with a debugger, and that is what is happening. Maybe you want the comparison to be with the VALUE of EDI+8. Are you sure you are using the same code you posted? Better check. Ratch
:toothy, it make sense coz this code allow a compare instruction wich can exit with a differencie of 8, that's why it just make 1 loop instead of 9.... you should change of debugger... :wink , and if you think a bit, the code cant' made the task correctly otherwise... :dance:
to play with the others in the speed test, here the same code without the included szlen, (but it's still single buffer, coz i don't see why i should downgrade this code with extra lines...):
ALIGN 4
FastReverseString PROC StrAddr:DWORD,StrSize:DWORD
push esi
push edi
mov esi,StrAddr
mov eax,StrSize
lea edi,[esi+eax] ; if you use SIZEOF => lea edi,[esi+eax-1]
cmp eax,2 ; something to reverse ?
jb Label4
cmp eax,9 ; small string ?
jb Label3
Label2: sub edi,4 ; big string algo
mov edx,DWORD PTR [edi]
mov eax,DWORD PTR [esi]
bswap eax
bswap edx
mov DWORD PTR [esi],edx
mov DWORD PTR [edi],eax
add esi,4
cmp esi,[edi+8]
jb Label2
Label3: sub edi,1 ; small string algo
mov dh,BYTE PTR [edi]
mov dl,BYTE PTR [esi]
mov BYTE PTR [esi],dh
mov BYTE PTR [edi],dl
add esi,1
cmp esi,edi
jb Label3
Label4:
pop edi
pop esi
ret
FastReverseString ENDP
:snooty: hmm..., maybe i should explain why i've put the +8, coz it seams you don't understand...
the +8 i've added to the compare instruction is equal to a virtual loop (+4 and -4, so the distance between the left and the right of the string = -8)
in the big string algo, it's made to see if i have to made the loop again, coz i have to keep in mind that i have to pass by the small string algo after (in all cases)
voilà... i hope you understand now... coz i can't be more explicit... :'(
NightWare,
Quote
, it make sense coz this code allow a compare instruction wich can exit with a differencie of 8, that's why it just make 1 loop instead of 9.... you should change of debugger...
The question is what is it comparing, the address or the contents of the address. I use OllyDbg, they don't come much better than that.
Quote
to play with the others in the speed test, here the same code without the included szlen, (but it's still single buffer, coz i don't see why i should downgrade this code with extra lines...):
A single buffer is a virtue. That should not be changed.
OK I corrected your code so it works correctly now. I enclose the corrected snippet. Your code is the smallest submitted thus far. I will have to try to beat its small size. Ratch
Label2: sub edi,4 ; big string algo
mov edx,DWORD PTR [edi]
mov eax,DWORD PTR [esi]
bswap eax
bswap edx
mov DWORD PTR [esi],edx
mov DWORD PTR [edi],eax
LEA EAX,[EDI+8]
add esi,4
; cmp esi,[edi+8]
CMP ESI,EAX
; jb Label2
JAE Label2
Label3: sub edi,1 ; small string algo
To All,
Here is my revised reverse string subroutine. It's short, and only one loop. Beat it do death everyone. Ratch
;*******************************************************************************
; REVBSTR: Reverses the bytes of an ASCII string. *
; *
; Called by: PUSH <address of string> *
; PUSH <string length> *
; CALL REVBSTR *
; *
; Returns: Nothing *
; *
; Notes: This subroutine conforms with WIN32 conventions regarding *
; register preservation and stack balancing. *
; *
; Coder: Ratch *
;*******************************************************************************
REVBS$ STRUC
RETURN DWORD ?
RV1$ = $
STRLEN DWORD ?
SADR DWORD ?
RV2$ = $
REVBS$ ENDS
RVBS$ EQU ESP.REVBS$ ;save some typing
REVBSTR: ;it all begins here
PUSH ESI ;save register
PUSH EDI ;save register
MOV EAX,[RVBS$.STRLEN+2*DWORD] ;string length
MOV ESI,[RVBS$.SADR+2*DWORD] ;string start address
SHRD EDX,EAX,3 ;mod 8
PUSH EBX ;save register
LEA EDI,[ESI+EAX-DWORD] ;string end address - DWORD
SHR EDX,32-3 ;EDX = remainder
SHR EAX,3 ;divide by 8
.IF EDX > 3
INC EAX ;increase loop count
XOR EDX,EDX ;remainder is set to zero
.ENDIF
.WHILE EAX ;8 bytes moved each time through loop
MOV ECX,[ESI] ;pick up 4 bytes
MOV EBX,[EDI] ;pick up 4 bytes
BSWAP ECX
BSWAP EBX
MOV [EDI],ECX ;put down 4 bytes
MOV [ESI],EBX ;put down 4 bytes
ADD ESI,DWORD ;move string end pointer
SUB EDI,DWORD ;move string beginning pointer
DEC EAX ;loop counter
.ENDW
.IF EDX ;if remainder
ADD EDI,DWORD-BYTE ;adjust pointer
MOV CL,BYTE PTR [ESI] ;pick up byte
MOV BL,BYTE PTR [EDI] ;pick up byte
MOV BYTE PTR [EDI],CL ;put down byte
MOV BYTE PTR [ESI],BL ;put down byte
.ENDIF
POP ESI ;restore register
POP EDI ;restore register
POP EBX ;restore register
RET RV2$-RV1$ ;return home
ratch,
fisrt of all, you haven't corrected anything ! coz my code only make ONE LOOP ! if you want a proof, make a speed test (mine should be slower than your corrected version in this case... isn't it ? :naughty: and it's not the case...). if you want another proof, the string CAN'T be reversed correctly, if i made those "extra" loops... :bdg
to finish with a pleasant word, your revised code is quite interresting :U , coz it avoid lot of useless work (that's not the case in mine, it made useless work in many case) so i've decided to make something similar to avoid the systematic pass by the small string algo... here the result :
AnotherStringReverse PROC StrAddr:DWORD,StrSize:DWORD
ALIGN 16
push ebx
push esi
push edi
mov esi,StrAddr
mov eax,StrSize
lea edi,[esi+eax] ; if you use SIZEOF => lea edi,[esi+eax-1]
mov ebx,eax
and eax,00000000000000000000000000000111b
shr eax,1 ; to eliminate impair values
and ebx,11111111111111111111111111111000b
; test ebx,ebx ; no dword ? (useless coz the previous and fix the zero flag)
jz Label2
Label1: sub edi,4 ; big string algo
mov edx,DWORD PTR [edi]
mov ecx,DWORD PTR [esi]
bswap ecx
bswap edx
mov DWORD PTR [esi],edx
mov DWORD PTR [edi],ecx
add esi,4
sub ebx,8
jnz Label1
Label2: test eax,eax ; no byte ?
jz Label4
Label3: sub edi,1 ; small string algo
mov dh,BYTE PTR [edi]
mov dl,BYTE PTR [esi]
mov BYTE PTR [esi],dh
mov BYTE PTR [edi],dl
add esi,1
sub eax,1
jnz Label3
Label4:
pop edi
pop esi
pop ebx
ret
AnotherStringReverse ENDP
NightWare,
Quote
fisrt of all, you haven't corrected anything ! coz my code only make ONE LOOP ! if you want a proof, make a speed test (mine should be slower than your corrected version in this case... isn't it ? and it's not the case...). if you want another proof, the string CAN'T be reversed correctly, if i made those "extra" loops...
One does not determine whether code is correct by how fast it executes. Results are what count. Would some neutral 3rd party test FastReverseString, preferably with a debugger, and settle this matter....Pleeezz?
Your latest entry, AnotherStringReverse does work correctly. It has two loops instead of one, and its main loop is one instruction longer compared to REVBSTR. That is sure to make it just a bit slower. But it wqrks! Ratch
mov edx,DWORD PTR [edi]
mov ecx,DWORD PTR [esi]
bswap ecx
bswap edx
mov DWORD PTR [esi],edx
mov DWORD PTR [edi],ecx
could instead be
mov edx,[edi]
bswap edx
xchg edx,[esi]
bswap edx
mov [edi],edx
:lol
Please terminate any high-priority tasks and press ENTER to begin.
Input string ->abcdefghijklmnopqrstuvwxyzabc
RevStr - lostcauz : 34 clocks. Result: cbazyxwvutsrqponmlkjihgfedcba
RevStr - lingo : 26 clocks. Result: cbazyxwvutsrqponmlkjihgfedcba
RevStr - Ratch : 47 clocks. Result: cbazyxwvutsrqponmlkjihgfedcba
RevStr - NightWare : 49 clocks. Result: cbazyxwvutsrqponmlkjihgfedcba
Press ENTER to exit...
[attachment deleted by admin]
Larry Hammick,
Your suggestion would cut on line of code from the loop, BUT the XCHG instruction is known for being "expensive" in terms of CPU time. That is because a "lock" is implemented whenever a memory location is referenced with XCHG. For that reason, most programmers shun that instruction. Also the code causes 4 read/write stalls to occur within the loop. That is, a register or memory location is changed in one instruction and read or written again in the next instruction. The CPU has got to wait/stall until the register/memory location is updated before it can proceed to the next instruction. That is why a lot of programmers try to interleave their instructions. Ratch
lingo,
I find your speed tests interesting, but suspicious. Does it make sense that your program using the same loop algo, with 2 read/write stalls, and 2 instructions with double register indexing, would be almost twice as fast as NightWare's routine and mine? Does that pass the smell test? Not to belittle your efforts, but your routine does not reverse the same buffer either. Ratch
If you don't understand the usage of the attached testing program
will be better to ask the author: MichaelW
I like his program and just use it...
Unfortunately I have no enough free time to open your eyes..
lingo,
Quote
If you don't understand the usage of the attached testing program
will be better to ask the author: MichaelW
I understand how to use the program. That's not the issue. I will even concede that the test is implemented correctly. I just do not understand the results. Our code is similar enough so as to report approximately equal times.
Quote
I like his program and just use it...
That's irrelevant with respect to the significantly different results.
Quote
Unfortunately I have no enough free time to open your eyes..
At least concede that my point is valid, even if you do not know the reason for the descrepancy. Ratch
here the result on my old celeron 700 mghz :
Please terminate any high-priority tasks and press ENTER to begin.
Input string ->abcdefghijklmnopqrstuvwxyzabc
RevStr - lostcauz : 55 clocks. Result: cbazyxwvutsrqponmlkjihgfedcba
RevStr - lingo : 28 clocks. Result: cbazyxwvutsrqponmlkjihgfedcba
RevStr - Ratch : 59 clocks. Result: cbazyxwvutsrqponmlkjihgfedcba
RevStr - NightWare : 46 clocks. Result: cbazyxwvutsrqponmlkjihgfedcba
Press ENTER to exit...
:eek cool, i pass from the last position to the second... :P seriouly, i don't know you guys, but i already had some strange result with the speed test macros of MichealW, in my case, i'm not an ultrafast code generator, i'm just playing with coding technics... and like you i use these macros, coz there's nothing better (without downgrading the os to win98, and i don't want to do that)... but it's true there is strange results sometime... an example : if you change the order of the routines, the results change, sometimes with big differencies... and i don't know why (the only difference is the length of the jumps)
these problems appear both on my Celeron and my P4 (it worst on P4). i'd like to know the reason... anyone know why ?
note : don't use FastReverseString, coz there's a bug with the small string algo every 8 multiples +5...
Nightware,
Quote
if you change the order of the routines, the results change, sometimes with big differencies... and i don't know why (the only difference is the length of the jumps)
these problems appear both on my Celeron and my P4 (it worst on P4). i'd like to know the reason... anyone know why ?
Yes, after a flurry timing runs for STRLEN, I gave up on code timing, because I too got inconsistent and/or inexplainable results. I have had substantial timing changes occur after rebooting, running the identical run on two different days, or just by adding an innocuous instruction somplace. So now I just code by implementating the best algo the best way I can. If I see large variations between runs using of the same basic code, I dismiss the results unless they can be explained. See pages 10 and 11 of this link Ratch
http://www.masmforum.com/simple/index.php?topic=1807.135
Ratch,
The method I use to get around timing variations is to use a reference algo that I am clocking others against. For what its worth I tend to get them running at very similar times from day to day but of course this can vary with workload on the box.
ratch, hutch,
thanks for your answers
ratch, it seams you're right, the comparison of my old code can't work correctly :'( , apologies, but your correction don't works anyway... :green it's strange i've obtained some result with it...
concerning lingo's algo, i think he has already prooved in different test he's good... so i'm not surpise his code is faster, i've made some tests with different length, and lingo's algo is always the winner, aproximatively 2 time faster, for the others algos, it's the dance :dance:, every algo is 2nd, 3rd or 4th... it depends of the size of the string
concerning my algos, here a very small speedup (it should, coz there is 1+ instruction less, but i don't see the difference... :toothy ):
AnotherStringReverse PROC StrAddr:DWORD,StrSize:DWORD
ALIGN 16
;db 0cch ???
push ebx
push esi
push edi
mov esi,StrAddr
mov eax,StrSize
lea edi,[esi+eax] ; if you use SIZEOF => lea edi,[esi+eax-1]
mov ebx,eax
and ebx,11111111111111111111111111111000b
; test ebx,ebx ; no dword ? (useless coz the previous and fix the zero flag)
jz Label2
and eax,00000000000000000000000000000111b
Label1: sub edi,4 ; big string algo
mov edx,DWORD PTR [edi]
bswap edx
mov ecx,DWORD PTR [esi]
bswap ecx
mov DWORD PTR [edi],ecx
mov DWORD PTR [esi],edx
add esi,4
sub ebx,8
jnz Label1
Label2: shr eax,1 ; to eliminate impair values
; test eax,eax ; no byte ? (useless coz the previous and fix the zero flag)
jz Label4
Label3: sub edi,1 ; small string algo
mov dh,BYTE PTR [edi]
mov dl,BYTE PTR [esi]
mov BYTE PTR [edi],dl
mov BYTE PTR [esi],dh
add esi,1
sub eax,1
jnz Label3
Label4:
pop edi
pop esi
pop ebx
ret
AnotherStringReverse ENDP
and to finish another approch, there is 1 instrution less in the loops (so it should be a bit faster (in theory :boohoo: ) on very big strings) :
FastReverseString PROC StrAddr:DWORD,StrSize:DWORD
ALIGN 16
push ebx
push esi
push edi
mov esi,StrAddr
mov eax,StrSize
lea edi,[esi+eax-4] ; ) put the middle of the string in edi
shr eax,1 ; )
sub edi,eax ; )
mov ebx,eax
and ebx,11111111111111111111111111111100b
; test ebx,ebx ; no dword ? (useless coz the previous and fix the zero flag)
jz Label2
and eax,00000000000000000000000000000011b
add edi,eax ; necessary shift
Label1: mov edx,DWORD PTR [edi+ebx] ; big string algo
bswap edx
mov ecx,DWORD PTR [esi]
bswap ecx
mov DWORD PTR [edi+ebx],ecx
mov DWORD PTR [esi],edx
add esi,4
sub ebx,4
jnz Label1
test eax,eax
jz Label3
sub edi,eax ; cancel the shift
Label2: mov dh,BYTE PTR [edi+eax+3] ; small string algo
mov dl,BYTE PTR [esi]
mov BYTE PTR [edi+eax+3],dl
mov BYTE PTR [esi],dh
add esi,1
sub eax,1
jnz Label2
Label3:
pop edi
pop esi
pop ebx
ret
FastReverseString ENDP
i think i can't go beyond... of course it's possible to unroll the last loop, and win 1 to 2/3 instructions... but it takes to much memory for an insignifiant speedup...
lingo, there is just a small change to made with the implementation, just change the :
lenstr4 EQU $-str4-1 to lenstr4 EQU $-str4
lenstr4 EQU $-str5-1 to lenstr4 EQU $-str5
coz the result of REVBSTR and AnotherReverseString appear wrong if you change the length of the string
Nightware,
Quote
ratch, it seams you're right, the comparison of my old code can't work correctly , apologies, but your correction don't works anyway... it's strange i've obtained some result with it...
I beg to differ. I tested my patch for your code, and it does work.
Quote
concerning lingo's algo, i think he has already prooved in different test he's good...so i'm not surpise his code is faster
That's no criteria for faster code. I consider myself a good coder also. Doesn't mean my code is faster, does it?
Quote
so i'm not surpise his code is faster, i've made some tests with different length, and lingo's algo is always the winner, aproximatively 2 time faster, for the others algos, it's the dance , every algo is 2nd, 3rd or 4th... it depends of the size of the string
You SHOULD be surprised, since he uses the same basic algorithm as the rest of us, especially in the main loop. I think you are saying that you get erratic results. That should make you suspect your test is failing to measure what you want. All erratic results need to be explained and validated in order to inspire confidence.
Quote
concerning my algos, here a very small speedup (it should, coz there is 1+ instruction less, but i don't see the difference...
It depends where the instruction is. Generally speaking, if it is in the main loop, that should speed things up. If it is outside the loop, then it won't make much difference. If you can't tell the difference in your test, be suspicious, be very suspicious of your test.
Your latest AnotherStringReverse works correctly, but it has 3 read/write register stalls in the loop. Your latest FastReverseString errors out if the string is 1 character long. Ratch
Quote from: Ratch on October 09, 2005, 02:54:26 AM
I beg to differ. I tested my patch for your code, and it does work.
i'd made test with different size, and it didn't work... but to be honest we you, i have the bad tendancy to change codes (to optimize it) before testing... so maybe you're right again... :red
Quote from: Ratch on October 09, 2005, 02:54:26 AM
That's no criteria for faster code. I consider myself a good coder also. Doesn't mean my code is faster, does it?
no, of course, but you must admit his routines are fast... even if there is strange result with speed macros sometimes, the results must be close to reality... i make my codes on my old celeron, and believe it or not, but i don't need clocks cycles to see the difference...
Quote from: Ratch on October 09, 2005, 02:54:26 AM
You SHOULD be surprised, since he uses the same basic algorithm as the rest of us, especially in the main loop. I think you are saying that you get erratic results. That should make you suspect your test is failing to measure what you want. All erratic results need to be explained and validated in order to inspire confidence.
if you ask me, there is some reasons to explain why his code is faster... firstly he doesn't have to deal with the last bytes as we have to... so it's faster (outside the main loop) and he unroll the last loop (a small speed up again). secondly in the main loop he makes addition in every case, it's not the case for us, we alternate add and sub (it's the price we have to paid when we work directly on the string...) and if i remember well sub is a bit slower...
the thing i don't understand is why he uses :
cmp
mov
jcc
instead of the classic :
mov
cmp
jcc
Quote from: Ratch on October 09, 2005, 02:54:26 AM
Your latest FastReverseString errors out if the string is 1 character long. Ratch
nice, i haven't saw it !
NightWare,
Quote
no, of course, but you must admit his routines are fast... even if there is strange result with speed macros sometimes, the results must be close to reality... i make my codes on my old celeron, and believe it or not, but i don't need clocks cycles to see the difference...
All of our routines are fast. There is fast and then there is faster. I don't believe you can tell the difference unless you have a humongous large string.
Quote
if you ask me, there is some reasons to explain why his code is faster... firstly he doesn't have to deal with the last bytes as we have to... so it's faster (outside the main loop) and he unroll the last loop (a small speed up again).
Last bytes are processed outside the main loop. Any improvements made outside the main loop are insignificant. My program has only one loop.
Quote
secondly in the main loop he makes addition in every case, it's not the case for us, we alternate add and sub (it's the price we have to paid when we work directly on the string...) and if i remember well sub is a bit slower...
I think you remember wrong. Why would subtraction be slower? Can you document that? His loop also has 1 register stall.
Quote
the thing i don't understand is why he uses :
cmp
mov
jcc
instead of the classic :
mov
cmp
jcc
MOVs do not affect the status flags, so it does not matter which of the above code sequences you use. Unless you need to use a particular sequence prevent a register stall. Notice in his main loop, there are 3 MOVs and 2 BSWAPs between the ADD and the conditional jump. His ECX register indexes the DWORD and also determines the loop duration.
I made some minor changes to my routine. Here is the latest. Ratch
;*******************************************************************************
; REVBSTR: Reverses the bytes of an ASCII string. *
; *
; Called by: PUSH <string length> *
; PUSH <address of string> *
; CALL REVBSTR *
; *
; Returns: Nothing *
; *
; Notes: This subroutine conforms with WIN32 conventions regarding *
; register preservation and stack balancing. *
; *
; Coder: Ratch *
;*******************************************************************************
REVBS$ STRUC
RETURN DWORD ?
RV1$ = $
SADR DWORD ?
STRLEN DWORD ?
RV2$ = $
REVBS$ ENDS
RVBS$ EQU ESP.REVBS$ ;save some typing
REVBSTR: ;it all begins here
PUSH ESI ;save register
PUSH EDI ;save register
MOV EAX,[RVBS$.STRLEN+2*DWORD] ;string length
MOV ESI,[RVBS$.SADR+2*DWORD] ;string start address
SHRD EDX,EAX,3 ;mod 8
PUSH EBX ;save register
LEA EDI,[ESI+EAX-DWORD] ;string end address - DWORD
SHR EDX,32-3 ;EDX = remainder
SHR EAX,3 ;divide by 8
.IF EDX > 3
INC EAX ;increase loop count
XOR EDX,EDX ;remainder is set to zero
.ENDIF
.IF EAX
.REPEAT ;8 bytes moved each time through loop
MOV ECX,[ESI] ;pick up 4 bytes
MOV EBX,[EDI] ;pick up 4 bytes
BSWAP ECX
BSWAP EBX
MOV [EDI],ECX ;put down 4 bytes
MOV [ESI],EBX ;put down 4 bytes
SUB EDI,DWORD ;move string right end pointer
ADD ESI,DWORD ;move string left end pointer
DEC EAX ;loop counter
.UNTIL ZERO?
.ENDIF
.IF EDX > 1 ;if remainder is 2 or 3
MOV CL,BYTE PTR [ESI] ;pick up byte
MOV BL,BYTE PTR [EDI+DWORD-BYTE] ;pick up byte
MOV BYTE PTR [EDI+DWORD-BYTE],CL ;put down byte
MOV BYTE PTR [ESI],BL ;put down byte
.ENDIF
POP ESI ;restore register
POP EDI ;restore register
POP EBX ;restore register
RET RV2$-RV1$ ;return home
I played around with these routines a bit.
NightWare ( and others) -
To get consistant timing results, you have to run all the routines under test with identical conditions. This means executing from the exact same location in memory, both the calling routine and the routine being tested. We must also use the exact same input string, and output to the exact same output address. Since some of the routines here worked in place, I could not accomplish the last of these restrictions, but we can come close enough.
To make the routines have the same call, I had to slightly modify a few. I settled on making all of them with no proc parameters, and placing the offsets to the input and output strings and the length of the string in variables.
So the procedure is:
1. copy the routine to a common location so all routines run from the same memory.
2. create the input string before running each routine to be sure the input is correct.
3. copy the output string to the input string for routines that work in place.
3. call the rountine.
4. verify the output.
By doing this, we can get good consistant timings with only 10 iterations. There is no need to run a million times. We simply select the smallest execution time from the 10 trys. We can also test with various string lengths, and with different byte misalignments.
Here are my results:
Str1 Size 1 9 18 27 28 29 36 100 400 1000
============ ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
0 byte misalignment
OVH 467 0 0 0 0 0 0 0 0 0
lostcauz32 20 18 26 34 26 33 32 89 314 764
Anonymous32 9 20 24 28 27 28 31 82 268 642
Ratch 14 18 39 55 32 66 37 90 274 649
NightWare 11 19 46 62 37 54 42 93 272 647
1 byte misalignment
OVH 467 0 0 0 0 0 0 0 0 0
lostcauz32 20 18 26 35 26 33 32 89 315 765
Anonymous32 9 20 24 27 33 33 36 88 282 693
Ratch 14 18 53 58 69 69 85 208 736 1806
NightWare 11 19 47 53 65 66 81 192 692 1735
2 byte misalignment
OVH 467 0 0 0 0 0 0 0 0 0
lostcauz32 20 18 26 35 28 34 34 91 318 768
Anonymous32 9 20 22 33 33 33 36 88 282 693
Ratch 14 18 40 54 69 69 85 208 736 1806
NightWare 11 19 39 55 65 66 81 192 685 1735
3 byte misalignment
OVH 467 0 0 0 0 0 0 0 0 0
lostcauz32 20 18 26 35 28 34 34 91 318 768
Anonymous32 9 17 24 33 33 27 36 88 282 693
Ratch 14 18 47 58 69 70 85 208 736 1806
NightWare 11 16 45 55 67 52 81 192 685 1735
Press enter to exit...
The OVH routine is an empty routine, just to get the overhead of setting up the input strings and calling the routines. This overhead is subtracted from all other results.
Warning: This was done on an Athlon, Pentiums will give different results.
[attachment deleted by admin]
Jimg,
Quite a job you did. I also tried some timing. I made lingo's and my subroutine flip the alphabet 100,000,000 times. It took about 6 seconds for my subroutine to do it and about 3 sec for lingo's subroutine. That is hardly worth worrying about, but it makes a good argument.
Quote
We must also use the exact same input string, and output to the exact same output address. Since some of the routines here worked in place, I could not accomplish the last of these restrictions, but we can come close enough.
Well, when I set my EDI output pointer to a different buffer in a .DATA? segment as lingo does, then my times were comparable to his, as can be expected. Therefore to be fair, you have to make lingo's subroutine write in the same source buffer, as the other programs do. I think this causes a "cache clash", because when reads and writes are very close to each other in memory, the CPU has to stall until the write is complete before reading again. My AMD Athlon has a moderate L1 cache size, but a 64 byte cache line. I think that means it can isolate 64 bytes within its cache buffer. If it has to read/write within that limit, a clash can be expected. Now I realize that the output is going to be all mixed up because his routine is not written for a single buffer. My output was hashed also when I changed my destination to a different buffer. However, both programs will do the same work as before.
I tried to assemble your test program, but I failed because I was missing a library or macro. Maybe you can try it again using my suggestions. Ratch
Hutch,
I'm wondering why you tolerate such offences towards me
"To make the routines have the same call, I had to slightly modify a few "
and
"The worst modification was in Lingo's routine,"
and
"I used esi and edi for this, but had to do an extra add and subtract as some of the instructions"
and my proc from:
comment * ReverseStr by Lingo *
align 16
db 0A4h, 24h, 0, 0, 0, 0 , 8Dh, 9Bh, 0, 0, 0, 0,90
Lingo32 proc
mov ecx, sizeof str1-1
xor eax, eax
cmp ecx, 4
mov byte ptr [str2+ecx], al
jl Bytes3
push ebx
@@:
mov edx, dword ptr [str1+eax]
add eax, 4
add ecx, -8
mov ebx, dword ptr [str1+eax+ecx]
bswap edx
mov dword ptr [str2+eax+ecx], edx
bswap ebx
mov dword ptr [str2+eax-4], ebx
jg @b
pop ebx
ret
become garbage:
comment * ReverseStr by Lingo *
align 16
db 13 dup (0) ;0A4h, 24h, 0, 0, 0, 0 , 8Dh, 9Bh, 0, 0, 0, 0,90
Lingo32 proc
push esi
push edi
mov esi,Str1Offset
mov edi,Str2Offset
;mov ecx, sizeof Str1-1
mov ecx, Str1Size
xor eax, eax
cmp ecx, 4
mov byte ptr [edi+ecx], al ;[str2+ecx]
jl Bytes3
push ebx
@@:
mov edx, dword ptr [esi+eax] ;[str1+eax]
add eax, 4
add ecx, -8
add ecx,eax ; temporarily compute eax+ecx
mov ebx, dword ptr [esi+ecx] ;[str1+eax+ecx]
bswap edx
mov dword ptr [edi+ecx], edx ;[str2+eax+ecx]
sub ecx,eax ; restore ecx
bswap ebx
mov dword ptr [edi+eax-4], ebx ;[str2+eax-4]
jg @b ; from the add ecx,-8 instruction
pop ebx
pop edi
pop esi
ret
and this garbage continue to have my nickname
I don't know what to do; to laugh or to cry....
Regards,
Lingo
Lingo-
You have my sincerest appology. I had no idea you would be so offended by this. The basic algorythm is still yours, I only had to change a few instructions to fit the scheme to make the comparison. And you STILL are the quickest. Is it that you no longer want your name associated with the routine, or want some disclaimer with the code, or what. Please let us know.
"Is it that you no longer want your name associated with the routine "
Exactly, pls fill free to change and use what you want but delete my nickname from YOUR job
Done.
Guys,
Can we keep this something like friendly as these extended discussions about algo design are often useful and produce some very good code.
Thanks
Sorry Hutch, I wasn't trying to be contentious in the slightest. I guess my people skills are right up there with the average nerd. :'(
Jimg,
You have nothing to apologize about. You tried to do the right thing with your timing program. Just remember, no good deed goes unpunished. Ratch
return of the son of the algos...
a small speedup for AnotherStringReverse, very small (i expected more... because it avoid stalls in many case... but...)
ALIGN 16
AnotherStringReverse PROC StrAddr:DWORD,StrSize:DWORD
push ebx
push esi
push edi
mov esi,StrAddr
mov eax,StrSize
lea edi,[esi+eax] ; if you use SIZEOF => lea edi,[esi+eax-1]
mov ebx,eax
and ebx,11111111111111111111111111111000b
jz Label2
and eax,00000000000000000000000000000111b
Label1: add edi,-4
mov ecx,DWORD PTR [esi]
mov edx,DWORD PTR [edi] ; big string algo
bswap ecx
bswap edx
mov DWORD PTR [edi],ecx
mov DWORD PTR [esi],edx
add esi,4
add ebx,-8
jnz Label1
Label2: shr eax,1 ; to eliminate impair values
jz Label4
cmp eax,1
jne Label3
mov cl,BYTE PTR [esi] ; very small string algo
mov ch,BYTE PTR [edi-1]
mov BYTE PTR [esi],ch
mov BYTE PTR [edi-1],cl
jmp Label4
Label3: mov ecx,DWORD PTR [esi] ; small string algo
mov edx,DWORD PTR [edi-4]
bswap ecx
bswap edx
mov DWORD PTR [edi-4],ecx
mov DWORD PTR [esi],edx
Label4:
pop edi
pop esi
pop ebx
ret
AnotherStringReverse ENDP
here the fixed FastReverseString, and it's a bit faster... this is the one i've chosen to use... a bit slower on small string, but faster on bigger :
ALIGN 16
FastReverseString PROC StrAddr:DWORD,StrSize:DWORD
push ebx
push esi
push edi
mov esi,StrAddr
mov eax,StrSize
lea edi,[esi+eax-4] ; ) put the middle of the string in edi
shr eax,1 ; )
sub edi,eax ; )
mov ebx,eax
and eax,00000000000000000000000000000011b ; ) add the shift
add edi,eax ; )
and ebx,11111111111111111111111111111100b
jz Label2
Label1: mov ecx,DWORD PTR [esi] ; big string algo
mov edx,DWORD PTR [edi+ebx]
bswap ecx
bswap edx
mov DWORD PTR [edi+ebx],ecx
mov DWORD PTR [esi],edx
add esi,4
add ebx,-4
jnz Label1
Label2: test eax,eax ; nothing to do ?
jz Label4
cmp eax,1 ; wich algo ?
jne Label3
mov cl,BYTE PTR [edi+3] ; very small string algo
mov ch,BYTE PTR [esi]
mov BYTE PTR [edi+3],ch
mov BYTE PTR [esi],cl
jmp Label4
Label3: mov ecx,DWORD PTR [esi] ; small string algo
mov edx,DWORD PTR [edi+ebx]
bswap ecx
bswap edx
mov DWORD PTR [edi+ebx],ecx
mov DWORD PTR [esi],edx
Label4:
pop edi
pop esi
pop ebx
ret
FastReverseString ENDP
now, i will not spend time anymore for any reverse procedure... it makes me sick now...
note : strange, it seams (on my celeron) it's a bit faster to use :
mov ecx,DWORD PTR [esi]
mov edx,DWORD PTR [edi]
bswap ecx
bswap edx
mov DWORD PTR [edi],ecx
mov DWORD PTR [esi],edx
instead of :
mov edx,DWORD PTR [edi]
mov ecx,DWORD PTR [esi]
bswap ecx
bswap edx
mov DWORD PTR [esi],edx
mov DWORD PTR [edi],ecx
here this is not a question of alignment, so it seams it's better to not use the same registers of the previous line (when it's possible)
NightWare,
Quote
a small speedup for AnotherStringReverse, very small (i expected more... because it avoid stalls in many case... but...)
Both your latest submissions of AnotherStringReverse and FastReverseString appear to work correctly. If you really want to speed things up, do as Lingo does. Make them write in second buffer distant from the source buffer, as I pointed out in a previous post to Jimg.
Quote
here this is not a question of alignment, so it seams it's better to not use the same registers of the previous line (when it's possible)
Of course, haven't I been talking about write/read stalls? Ratch
Reverse String? Can't you just PUSH everything on the stack, and then POP it off, and it's reversed?
later guys,
jeff c
:U
OceanJeff32,
Quote
Reverse String? Can't you just PUSH everything on the stack, and then POP it off, and it's reversed?
Yes, but you first have to write the DWORD into a register for a BSWAP. And PUSH/POP are not as fast as MOV. And you need two loops, one for PUSHes and one for POPs. And if your string is really long, you need to augment the stack. And you need to synchronize the string end if the string length is not divisible by 4. You would ibe transferring the contents of one memory location (source string) into a register, then transferring the register to stack memory, then transferring the contents of the stack to still another memory location (destination string). Not very efficient. Ratch
OceanJeff32, :lol
small and slow...
OPTION PROLOGUE:NONE ; turn it off
OPTION EPILOGUE:NONE ;
StrRev proc lpString:DWORD ;
;
pop edx ; edx->return address
pop eax ; eax->lpString
push edx ; edx->return address
push esi ; save esi
cld ; clears the Direction Flag
xor esi, esi ; esi = 0
push edi ; save edi
mov edi, eax ; edi->lpString
xchg eax, esi ; esi->lpString; eax = 0
@@: ; saving the string in the stack
push eax ;
lodsb ; mov al, [esi] -> inc esi
test eax, eax ; is it end of the string?
jne @b ;
@@: ; restoring the string from the stack
pop eax ;
stosb ; mov [edi], al -> inc edi
dec eax ; is it end of the string from the stack?
jns @b ;
pop edi ; restore esi and edi
pop esi ;
ret ; 25 bytes
StrRev endp ;
OPTION PROLOGUE:PROLOGUEDEF ; turn back on the defaults
OPTION EPILOGUE:EPILOGUEDEF ;
Regards,
Lingo
ratch,
Quote from: Ratch on October 12, 2005, 01:47:28 AM
Of course, haven't I been talking about write/read stalls? Ratch
i wasn't aware about that... for me, stalls only happened when you change the size of registers... as i said before, i'm just playing with coding technics... i'm not a good coder... (but i think i'm able to beat the previous algo posted by lingo, :toothy :toothy :toothy )
jimg,
:dazzled: only 10 loops ?... you really think the result reflect the real speed ? coz the results i've obtained are totally different of your post... and also quite strange, here an example...
Str1 Size 1 9 18 27 28 29 36 100 400 1000
============ ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
NightWare 13 20 57 62 51 56 57 122 418 1018
same speed for 18 bytes and 36 bytes, but :
18 = 8*2 + 2 => (2 main loop) + (1 byte exchange)
36 = 8*4 + 4 => (4 main loop) + (2 byte exchange)
and like ratch, i'm unable to assemble it, so i can't see the result with more loops...
Hi NightWare-
The problem with assembling it is probably the path to your library and or include files. I use WinAsm Studio and have my path as part of the WinAsm settings. You will probably have to add your path specs up in the includes. If this doesn't fix it, let me know what the error comment you are getting is.
Yes, I've tested it extensively, and I think the numbers are much more representative and repeatable (and much faster) than running it a million times. You can change the code to average and loop more times if you want. The important part of the process is calling from and running from the exact same spot in memory, and using the exact same input and output memory locations. I'll have to step through your code to see what the problem is, but the 27, and 29 were specifically chosen to be three and one extra bytes from an even dword and so it takes longer to process. Since 36 is an even dword number of bytes, it make sense that it would be fast compared to 18 which has an extra two bytes to take care of. Likewise for 28 vs. 27 or 29. Again, I'll have to step through your code to see what exactly is going on. The beauty of Lingo is that it uses the same code for an odd number of bytes, it just processes the extras twice, using the main code, which is much faster than loading the code to do the exceptions.
Of course, your numbers will probably be quite different than mine because it is cpu dependent, and there is a really big difference between AMD and Intel chips.
I can't seem to work through your code. As I read it, I keep thinking of other things to try which destroys my train of thought as far as stepping through yours. But there is definately a big penalty somewhere when the input is not an even multiple of four. Yours is faster on large strings of dwords, but look at this penalty for being off a byte-
Str1 Size 1 9 18 27 28 29 36 100 401 1000 1001
============ ===== ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
0 byte misalignment
OVH 467 0 0 0 0 0 0 0 0 0 0
NightWare 11 19 43 57 37 54 42 93 562 647 1667
NightWare2 10 18 36 48 60 67 64 96 666 522 1659
Algo3 9 19 23 27 26 28 31 82 273 642 648
1 byte misalignment
OVH 467 0 0 0 0 0 0 0 0 0 0
NightWare 11 19 47 53 66 66 81 192 687 1735 1735
NightWare2 10 18 44 46 76 76 87 202 695 1708 1708
Algo3 9 20 23 28 28 28 32 83 272 643 647
2 byte misalignment
OVH 467 0 0 0 0 0 0 0 0 0 0
NightWare 11 19 39 55 66 66 81 192 687 1735 1735
NightWare2 10 18 36 48 76 76 87 202 695 1708 1708
Algo3 9 20 22 28 28 28 32 83 272 643 647
3 byte misalignment
OVH 467 0 0 0 0 0 0 0 0 0 0
NightWare 11 16 45 55 67 52 81 192 667 1735 1697
NightWare2 10 14 37 48 76 69 87 202 669 1708 1765
Algo3 9 18 24 28 28 28 32 83 273 643 647
Press enter to exit...
(Algo3 is a mangle of Lingo's code I've been looking at, NIghtWare2 is your latest code posting)
Well, so far I had just done timing tests and not really played with the algorithms other than mangling Lingo's code. But I decided to start with a very simple loop as a basis to see what the effect of various schemes would be. I used the first and last part of Lingo's code since I was really only interested in the speeding up the main loop and I couldn't see anything obvious or worth while to do to the first or last part. So here's the code I was going to use as the base timing to see how the other schemes would speed it up-
Algo4 proc ; first and last parts are modified Lingo, thank you Lingo, main part is all jimg.
push esi
push edi
mov esi,Str1Offset ; pickup left side
mov edi,Str2Offset ; store left side
mov ecx,Str1Size ; will be offset from right
xor eax,eax ; offset from left
cmp ecx,4
mov byte ptr [edi+ecx],al ; zero byte terminator
jl Bytes3
push ebx
sub eax,4
@@: ; Main loop
add eax,4
add ecx,-4
mov edx,[esi+eax]
mov ebx,[esi+ecx]
bswap edx
bswap ebx
mov [edi+eax],ebx
mov [edi+ecx],edx
cmp eax,ecx
jl @b
pop ebx
pop edi
pop esi
ret
align 16
Bytes3: ; possible input a ab abc
; ecx= 1 2 3
; currrent output .0.. ..0. ...0
movzx edx, byte ptr [esi] ;[str1] edx= a a a
cmp ecx, 1
mov byte ptr [edi+ecx-1],dl ;[str2+ecx-1] a0.. .a0. ..a0
jle End3 ; done
mov dl, byte ptr [esi+1] ;[str1+1] edx= b b
cmp ecx, 2
mov byte ptr [edi+ecx-2],dl ;[str2+ecx-2] ba0. .ba0
jle End3 ; done
mov dl, byte ptr [esi+2] ;[str1+2] edx= c
mov byte ptr [edi+ecx-3],dl ;[str2+ecx-3] cba0
End3:
pop edi
pop esi
ret
Algo4 endp
To my amazement, here's the timing results-
Str1 Size 1 9 18 27 28 29 36 100 401 1000 1001
============ ===== ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
0 byte misalignment
OVH 467 0 0 0 0 0 0 0 0 0 0
Ratch 14 18 39 55 32 66 37 90 597 649 1424
NightWare2 10 18 36 48 60 67 64 96 666 522 1659
Algo3 9 19 23 27 26 28 31 82 273 642 648
Algo4 9 18 22 24 23 29 26 67 223 477 530
1 byte misalignment
OVH 467 0 0 0 0 0 0 0 0 0 0
Ratch 14 18 46 56 69 69 85 208 736 1806 1806
NightWare2 10 18 49 51 76 76 87 202 695 1708 1708
Algo3 9 20 23 28 28 28 32 83 272 643 647
Algo4 9 19 25 24 25 29 29 73 243 566 566
2 byte misalignment
OVH 467 0 0 0 0 0 0 0 0 0 0
Ratch 14 18 40 58 69 69 85 208 736 1806 1806
NightWare2 10 18 36 56 76 76 87 202 695 1708 1708
Algo3 9 20 22 28 28 28 32 83 272 643 647
Algo4 9 19 19 25 25 29 29 73 243 566 566
3 byte misalignment
OVH 467 0 0 0 0 0 0 0 0 0 0
Ratch 14 18 47 57 69 70 78 208 695 1806 1797
NightWare2 10 14 37 56 76 69 87 202 669 1708 1765
Algo3 9 18 24 28 28 28 32 83 273 643 647
Algo4 9 16 21 25 25 28 29 73 222 566 552
Press enter to exit...
I guess "KISS" is the answer :P
Jimg,
Quote
I guess "KISS" is the answer
I don't think so. Remember what I described before? To be fair, all the programs to be compared must write to the same buffer so as to measure the cache clashes. If you write to two separated buffers, you give an advantage to that program. That is the one thing that can account for the 3 to 1 disparity of times. Ratch
Ok, I was using the first lostcauz post as the basis which copied from one buffer to another.
You're absolutely right. Writing back to the same buffer when it is not a multiple of 4 bytes incurs a large time penalty.
jimg,
maybe i should been more explicit... the problem doesn't appear during the assembling, it crash when i launch the assembled code, here a copy of the screen when it crash
(maybe i've an unupdated .inc file... :'( unfortunatelly i haven't enought time to see what's wrong here...)
Str1 Size 1 9 18 27 28 29 36 100 400 1000
============ ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
0 byte misalignment
OVH
now, return of the son of the son of the algo... :bdg
it's short, it's fast (i think it will be very difficult to make a faster code than the following algo... i speak about "ReverseString algo", i don't speak about "IVeCreatedAnotherStringInReverseOrderNowYouJustHaveToCopyItOverTheOriginal algo" :wink )
don't try to understand the code, if you want to keep your mind safe... :dazzled: there is already a victim :toothy , we don't need another one... just use it (if you need it)
FastReverseString -> UltimateReverseString => ebx is not used anymore, a smart mini-shift is used to avoid lot of instructions (i'm a genius, sometimes... :U )
ALIGN 16
UltimateStringReverse PROC StrAddr:DWORD,StrSize:DWORD
push esi
push edi
mov esi,StrAddr
mov eax,StrSize
lea edi,[esi+eax-2]
shr eax,1
sub edi,eax
add eax,-2
js Label2
Label1: mov ecx,DWORD PTR [esi]
mov edx,DWORD PTR [edi+eax]
bswap ecx
bswap edx
mov DWORD PTR [edi+eax],ecx
mov DWORD PTR [esi],edx
add esi,4
add eax,-4
jns Label1
Label2: cmp eax,-1
jne Label3
mov cl,BYTE PTR [edi+2]
mov ch,BYTE PTR [esi]
mov BYTE PTR [edi+2],ch
mov BYTE PTR [esi],cl
Label3:
pop edi
pop esi
ret
UltimateStringReverse ENDP
NightWare,
I tested UltimateStringReverse, and it reverses a string correctly. When I compared its time to my routine by reversing the alphabet 100,000,000 times, it took around 6 seconds to do that on my machine. That is the same time that my routine takes. I don't see anything in your main loop that is significantly different from the way your previous submissions, my submissions, and other submissions do things in the main loop. They pick up a DWORD from one end of the string and put it down at the other end. Therefore the times should be about the same. If you want to really speed thing up, you can write the output to a different buffer like Lingo does, so as to avoid cache collisions. But then the string is not reversed in place is it? Ratch
Nightware-
Quotejimg,
maybe i should been more explicit... the problem doesn't appear during the assembling, it crash when i launch the assembled code, here a copy of the screen when it crash
Please try this code and let me know what is printed out in the debug window. Thanks.
[attachment deleted by admin]
Ratch,
Quote from: Ratch on October 16, 2005, 01:54:28 AM
If you want to really speed thing up, you can write the output to a different buffer like Lingo does, so as to avoid cache collisions. But then the string is not reversed in place is it? Ratch
our code are faster than you imagine... if you want a speed-up you can use a direct esp read, in the beginning... (nice speedup for me)...
but most important of all... with our code we don't need to reserve an empty space, we don't need to free this space after a use, etc... all these facts are not counted in the speed test...
jimg,
HARG ! i've said i haven't enought time.... damn... well, ok...
then i try to assemble the new code => fatal error, unable to find kernel32.lib ??? i don't understand, of course, i look inside the lib dir, and you know what ? it's in... so i spend some time to see what's wrong (coz all paths are correct), and i don't find... so i decide to see the difference with your old code... => debug.lib
ok, so i change your uselib macro to a classic includelib (coz i don't have a debug.inc file) and then it pass the kernel32.lib error (if you ask me, debug is bugged)
BUT ! (there is always a "but") i have 5 or 6 errors with PrintText (it seams it's one of your macros). so i transform it to old print... so i hope the result is fine on the screen, since i don't know if your routine add something or not...
here a copy of the screen when it crash :
Str1 Size 1
=========== ====
0 byte misalignment
starting procsloopOVH moving proccreating stringcreating answer
Ok, never mind. Let me know if you ever want to figure out the problem.
Hey all it's been a loooong while,firstly I just want to say hi again and I did
miss everyone.
Anyhow I have been getting into ASM again and was reading the board
and saw this problem. I was wondering if this was a viable solution, It's what
I use.
GOASM format should be easy to convert if anyone cares to.
ReverseString frame LpSrc,LpBuffer
; Will reverse a standard
; size string , ADDR PTR
; returns in eax. LpBuffer
; is the string buffer
; yet EAX will hold the
; strings starting addr.
mov eax,[LpBuffer] ; {move the values to registers.
mov ecx,[LpSrc] ; {
mov B[eax+0100h],00h ; Null terminate the string
add eax,0100h ; Get ready to climb down the buffer
sub eax,01h ; dec the buffer
mov dl,[ecx] ; {
mov [eax],dl ; {buffer byte=src byte
add ecx,01h ; inc ECX
cmp B[ecx],00h ; did we finish?
db 74h,02h ; JZ @F
db EBh,EFh ; JMP @B @@:
ret
endf
This way you still use the buffer you just don't have the buffers starting address .
It returns in EAX .
-OS
*Edit used the wrong version the old version copied the trailing zero.