/*
 * Copy the NUL-terminated string at q into the buffer at p, terminator
 * included. Nothing is returned, and no bounds checking is done: the caller
 * must supply a destination large enough for the source plus its terminator.
 */
void strcpy(char* p, const char* q)
{
    char copied;

    do {
        copied = *q;    /* fetch the next source byte            */
        *p = copied;    /* store it, including the final NUL     */
        ++p;
        ++q;
    } while (copied != '\0');   /* stop once the terminator has been written */
}
I'm sure that could beat assembly if compiled with optimization set for speed.
I'm not sure how I can test it. Still, that code above does look promising.
Here is a disassembly of that code with the optimizations set.
; strcpy_er_0: byte-at-a-time string copy. Each iteration copies one byte from
; the pointer held in arg_4 to the pointer held in arg_0, advances both
; pointers, and stops after a zero byte has been copied.
; NOTE(review): the 0CCCCCCCCh fill of the locals and the reloading of both
; arguments from the stack on every iteration look like an unoptimized build
; rather than an optimized one - confirm the compiler switches actually used.
.text:10011330 strcpy_er_0 proc near ; CODE XREF: strcpy_erj
.text:10011330
; Frame layout: two byte-sized locals below ebp, two dword arguments above it.
.text:10011330 var_C4 = byte ptr -0C4h
.text:10011330 var_C1 = byte ptr -0C1h
.text:10011330 arg_0 = dword ptr 8
.text:10011330 arg_4 = dword ptr 0Ch
.text:10011330
; Prologue: set up the ebp frame, reserve 0C4h bytes of locals, save ebx/esi/edi.
.text:10011330 push ebp
.text:10011331 mov ebp, esp
.text:10011333 sub esp, 0C4h
.text:10011339 push ebx
.text:1001133A push esi
.text:1001133B push edi
; Fill the local area with 0CCh bytes: 31h dwords = 0C4h bytes starting at var_C4.
.text:1001133C lea edi, [ebp+var_C4]
.text:10011342 mov ecx, 31h
.text:10011347 mov eax, 0CCCCCCCCh
.text:1001134C rep stosd
.text:1001134E
; Loop body: one byte copied per iteration.
.text:1001134E loc_1001134E: ; CODE XREF: strcpy_er_0+50j
.text:1001134E mov eax, [ebp+arg_0] ; eax = destination pointer
.text:10011351 mov ecx, [ebp+arg_4] ; ecx = source pointer
.text:10011354 mov dl, [ecx] ; dl = current source byte
.text:10011356 mov [eax], dl ; store it at the destination
.text:10011358 mov eax, [ebp+arg_0] ; reload the destination pointer
.text:1001135B mov cl, [eax] ; read back the byte just stored
.text:1001135D mov [ebp+var_C1], cl ; keep it in a local for the loop test below
.text:10011363 mov edx, [ebp+arg_0]
.text:10011366 add edx, 1
.text:10011369 mov [ebp+arg_0], edx ; destination pointer += 1, written back to the stack
.text:1001136C mov eax, [ebp+arg_4]
.text:1001136F add eax, 1
.text:10011372 mov [ebp+arg_4], eax ; source pointer += 1, written back to the stack
.text:10011375 movsx ecx, [ebp+var_C1] ; sign-extend the saved byte
.text:1001137C test ecx, ecx
.text:1001137E jz short loc_10011382 ; copied byte was zero: leave the loop
.text:10011380 jmp short loc_1001134E ; otherwise copy the next byte
.text:10011382 ; ---------------------------------------------------------------------------
.text:10011382
; Epilogue: restore the saved registers and the caller's frame, then return.
.text:10011382 loc_10011382: ; CODE XREF: strcpy_er_0+4Ej
.text:10011382 pop edi
.text:10011383 pop esi
.text:10011384 pop ebx
.text:10011385 mov esp, ebp
.text:10011387 pop ebp
.text:10011388 retn
.text:10011388 strcpy_er_0 endp
8 'mov' instructions in a row. :clap:
What does bother me is that it uses the 'add' instruction instead of 'inc'. That is 2 bytes extra! 4 clocks extra, too!
It doesn't look like anything exciting, just the usual mess that compilers make of code like this. Try benchmarking it against something fast.
You will need to work out whether it handles misaligned text as well, since much string data is not aligned to 4 bytes or better.
I took your function, renamed it to str_cpy to avoid a conflict, compiled it to an object module with /O2 /G6, and linked it with the object module from this source:
;===================================================================================
; Timing comparison: the MASM32 szCopy routine against str_cpy, which is linked
; in from a separate object module (C calling convention, two DWORD arguments).
; The program first performs one str_cpy call and prints the destination buffer
; as a sanity check, then times each routine TIMING_PASSES times and prints the
; value left in eax by counter_end.
; NOTE(review): counter_begin / counter_end presumably come from timers.asm and
; presumably take a loop count and a priority class - confirm against that file.
;===================================================================================
    include \masm32\include\masm32rt.inc
    .686
    include \masm32\macros\timers.asm

    str_cpy PROTO C :DWORD,:DWORD

    TIMING_LOOPS  equ 1000          ; first argument given to counter_begin
    TIMING_PASSES equ 3             ; how many times each routine is timed
    SETTLE_MS     equ 4000          ; Sleep duration before the timed section
    DEST_BYTES    equ 128           ; size of the destination buffer
;===================================================================================
    .data
      srcText db "Sample String 01234 56789 ABCDEF AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoP",\
                 "pQqRrSsTtUuVvWwXxYyZz Now I Know My ABC's, Won't You Come Play ",0
      dstText db DEST_BYTES dup(0)
    .code
;===================================================================================
start:
;===================================================================================
    ; Sanity check: copy once with str_cpy (destination first) and show the result.
    invoke str_cpy, ADDR dstText, ADDR srcText
    print ADDR dstText,13,10,13,10

    invoke Sleep, SETTLE_MS

    REPEAT TIMING_PASSES

      ; szCopy is passed the source first, then the destination.
      counter_begin TIMING_LOOPS, HIGH_PRIORITY_CLASS
        invoke szCopy, ADDR srcText, ADDR dstText
      counter_end
      print str$(eax)," cycles, szCopy",13,10

      ; str_cpy is passed the destination first, then the source.
      counter_begin TIMING_LOOPS, HIGH_PRIORITY_CLASS
        invoke str_cpy, ADDR dstText, ADDR srcText
      counter_end
      print str$(eax)," cycles, str_cpy",13,10

    ENDM

    inkey "Press any key to exit..."
    exit
;===================================================================================
end start
And got these results running on a P3:
530 cycles, szCopy
524 cycles, str_cpy
529 cycles, szCopy
524 cycles, str_cpy
529 cycles, szCopy
524 cycles, str_cpy
I used the test string from here (http://www.masm32.com/board/index.php?topic=1589.0), so the results can be compared with the tests in that thread. I just tested against the MASM32 szCopy function, but there are assembly versions in the linked thread that are considerably faster.
Yeah, that's not beating anything... As I said in other threads, optimized compiling is a joke.
E^cube,
I can't seem to find your topic on optimized compiling.
MichaelW, thank you for posting the results. I am curious: what compiler did you use? :bg
I used Visual C++ Toolkit 2003.
I became curious about how the CRT strcpy function was coded. After searching the PSDK I could not find any source or define for it. So I added it to my test app, along with code to verify that it is not comparing the buffers and returning without performing the copy if they are identical.
;===================================================================================
; Timing comparison of three string-copy routines: the CRT strcpy (crt_strcpy),
; the MASM32 szCopy routine, and str_cpy linked in from a separate object module
; (C calling convention, two DWORD arguments).
;
; Before timing, str_cpy and crt_strcpy are each run once against a destination
; buffer that is printed and then cleared. RtlZeroMemory is then timed alone and
; together with crt_strcpy, so its cost can be compared against the combined
; figure while the destination is cleared after every crt_strcpy call. Finally
; each routine is timed TIMING_PASSES times.
; NOTE(review): counter_begin / counter_end presumably come from timers.asm and
; presumably take a loop count and a priority class - confirm against that file.
;===================================================================================
    include \masm32\include\masm32rt.inc
    .686
    include \masm32\macros\timers.asm

    str_cpy PROTO C :DWORD,:DWORD

    TIMING_LOOPS  equ 1000          ; first argument given to counter_begin
    TIMING_PASSES equ 3             ; how many times each routine is timed
    SETTLE_MS     equ 4000          ; Sleep duration before each timed section
    DEST_BYTES    equ 128           ; size of the destination buffer
;===================================================================================
    .data
      srcText db "Sample String 01234 56789 ABCDEF AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoP",\
                 "pQqRrSsTtUuVvWwXxYyZz Now I Know My ABC's, Won't You Come Play ",0
      dstText db DEST_BYTES dup(0)
    .code
;===================================================================================
start:
;===================================================================================
    ; Sanity check for str_cpy: copy once and show the destination.
    invoke str_cpy, ADDR dstText, ADDR srcText
    print ADDR dstText,13,10,13,10

    ; Clear the destination, then run the same check for crt_strcpy.
    invoke RtlZeroMemory, ADDR dstText, DEST_BYTES
    invoke crt_strcpy, ADDR dstText, ADDR srcText
    print ADDR dstText,13,10,13,10
    invoke RtlZeroMemory, ADDR dstText, DEST_BYTES

    invoke Sleep, SETTLE_MS

    ; Cost of clearing the destination on its own.
    counter_begin TIMING_LOOPS, HIGH_PRIORITY_CLASS
      invoke RtlZeroMemory, ADDR dstText, DEST_BYTES
    counter_end
    print str$(eax)," cycles, RtlZeroMemory",13,10

    ; crt_strcpy followed by a clear, so every timed copy starts from a zeroed buffer.
    counter_begin TIMING_LOOPS, HIGH_PRIORITY_CLASS
      invoke crt_strcpy, ADDR dstText, ADDR srcText
      invoke RtlZeroMemory, ADDR dstText, DEST_BYTES
    counter_end
    print str$(eax)," cycles, RtlZeroMemory + crt_strcpy",13,10,13,10

    invoke Sleep, SETTLE_MS

    REPEAT TIMING_PASSES

      counter_begin TIMING_LOOPS, HIGH_PRIORITY_CLASS
        invoke crt_strcpy, ADDR dstText, ADDR srcText
      counter_end
      print str$(eax)," cycles, crt_strcpy",13,10

      ; szCopy is passed the source first, then the destination.
      counter_begin TIMING_LOOPS, HIGH_PRIORITY_CLASS
        invoke szCopy, ADDR srcText, ADDR dstText
      counter_end
      print str$(eax)," cycles, szCopy",13,10

      ; str_cpy is passed the destination first, then the source.
      counter_begin TIMING_LOOPS, HIGH_PRIORITY_CLASS
        invoke str_cpy, ADDR dstText, ADDR srcText
      counter_end
      print str$(eax)," cycles, str_cpy",13,10

    ENDM

    inkey "Press any key to exit..."
    exit
;===================================================================================
end start
98 cycles, RtlZeroMemory
321 cycles, RtlZeroMemory + crt_strcpy
219 cycles, crt_strcpy
529 cycles, szCopy
524 cycles, str_cpy
218 cycles, crt_strcpy
529 cycles, szCopy
524 cycles, str_cpy
218 cycles, crt_strcpy
529 cycles, szCopy
524 cycles, str_cpy
Considering that I called it from assembly code it could not have been the inline form that the compiler would use. However it is implemented, it clearly has been heavily optimized.
Then again, I didn't compare it to the fastest assembly version in the thread that I linked, so maybe I should have left off the "heavily".
See my first post in http://www.masm32.com/board/index.php?topic=14510.0; I also give a link in there to an entire thread focusing on "optimized compiling".