strcpy test

Twister · August 09, 2010, 07:31:22 AM

/*
 * Copy the zero-terminated string at q into the buffer at p, terminator
 * included. No length checking is performed: the caller must make sure the
 * destination is large enough. Nothing is returned.
 */
void strcpy(char* p, const char* q)
{
    char ch;

    do {
        ch = *q;        /* fetch the next source byte            */
        *p = ch;        /* store it, even when it is the final 0 */
        ++q;
        ++p;
    } while (ch != '\0');   /* stop once the terminator has been copied */
}

I'm sure that could beat assembly if compiled with optimization set for speed.

I'm not sure how I can test it. Still, that code above does look promising.

Twister · August 09, 2010, 07:54:06 AM

Here is a disassembly of that code with the optimizations set.

Code Select

; strcpy_er_0: byte-by-byte copy loop. Each iteration loads one byte through the
; pointer in arg_4, stores it through the pointer in arg_0, advances both
; pointers, and stops after a zero byte has been copied.
; NOTE(review): the 0CCh fill of the locals and the fact that both pointers live
; on the stack and are reloaded every iteration look like an unoptimized/debug
; build rather than a speed-optimized one - confirm the compiler flags used.
.text:10011330 strcpy_er_0     proc near               ; CODE XREF: strcpy_erj
.text:10011330
.text:10011330 var_C4          = byte ptr -0C4h        ; lowest address of the 0C4h-byte local area
.text:10011330 var_C1          = byte ptr -0C1h        ; one-byte temporary: the byte copied this iteration
.text:10011330 arg_0           = dword ptr  8          ; pointer that is written through (destination)
.text:10011330 arg_4           = dword ptr  0Ch        ; pointer that is read through (source)
.text:10011330
.text:10011330                 push    ebp                     ; save caller's frame pointer
.text:10011331                 mov     ebp, esp                ; set up this function's frame
.text:10011333                 sub     esp, 0C4h               ; reserve 0C4h bytes of locals
.text:10011339                 push    ebx                     ; saved here, restored at exit; not otherwise used in this listing
.text:1001133A                 push    esi                     ; saved here, restored at exit; not otherwise used in this listing
.text:1001133B                 push    edi                     ; edi is used as the fill pointer below
.text:1001133C                 lea     edi, [ebp+var_C4]       ; edi = start of the local area
.text:10011342                 mov     ecx, 31h                ; 31h dwords = 0C4h bytes, i.e. the whole local area
.text:10011347                 mov     eax, 0CCCCCCCCh         ; fill pattern
.text:1001134C                 rep stosd                       ; fill the locals with 0CCh bytes
.text:1001134E
.text:1001134E loc_1001134E:                           ; CODE XREF: strcpy_er_0+50j
.text:1001134E                 mov     eax, [ebp+arg_0]        ; eax = destination pointer
.text:10011351                 mov     ecx, [ebp+arg_4]        ; ecx = source pointer
.text:10011354                 mov     dl, [ecx]               ; dl = current source byte
.text:10011356                 mov     [eax], dl               ; store it at the destination
.text:10011358                 mov     eax, [ebp+arg_0]        ; reload the destination pointer
.text:1001135B                 mov     cl, [eax]               ; read back the byte just stored
.text:1001135D                 mov     [ebp+var_C1], cl        ; keep it for the end-of-string test below
.text:10011363                 mov     edx, [ebp+arg_0]
.text:10011366                 add     edx, 1
.text:10011369                 mov     [ebp+arg_0], edx        ; destination pointer += 1, written back to the stack
.text:1001136C                 mov     eax, [ebp+arg_4]
.text:1001136F                 add     eax, 1
.text:10011372                 mov     [ebp+arg_4], eax        ; source pointer += 1, written back to the stack
.text:10011375                 movsx   ecx, [ebp+var_C1]       ; sign-extend the copied byte for the test
.text:1001137C                 test    ecx, ecx                ; was it the terminating zero?
.text:1001137E                 jz      short loc_10011382      ; yes: leave the loop
.text:10011380                 jmp     short loc_1001134E      ; no: copy the next byte
.text:10011382 ; ---------------------------------------------------------------------------
.text:10011382
.text:10011382 loc_10011382:                           ; CODE XREF: strcpy_er_0+4Ej
.text:10011382                 pop     edi                     ; restore the registers saved on entry
.text:10011383                 pop     esi
.text:10011384                 pop     ebx
.text:10011385                 mov     esp, ebp                ; discard the local area
.text:10011387                 pop     ebp                     ; restore caller's frame pointer
.text:10011388                 retn                            ; plain return: the stack arguments are not popped here
.text:10011388 strcpy_er_0     endp

8 'mov' instructions in a row. :clap:

What does bother me is that it uses the 'add' instruction instead of 'inc'. That is 2 bytes extra! 4 clocks extra, too!

hutch-- · August 09, 2010, 08:36:34 AM

It doesn't look like anything exciting, just the usual mess that compilers make of code like this. Try benchmarking it against something fast.

You will also need to work out whether it handles misaligned text, as much string data is not aligned to 4 bytes or better.

MichaelW · August 09, 2010, 09:26:20 AM

I took your function, renamed it to str_cpy to avoid a conflict, compiled it to an object module with /O2 /G6, and linked it with the object module from this source:

Code Select


; Benchmark harness: copies str1 into str2 once with str_cpy and prints the
; result as a sanity check, then times szCopy and str_cpy on the same string,
; printing the cycle count of each run. The pair of timings is emitted 3 times.
;===================================================================================
    include \masm32\include\masm32rt.inc
    .686
    ; presumably the source of the counter_begin / counter_end macros used below
    include \masm32\macros\timers.asm

    ; str_cpy is not defined in this file; it is declared with the C calling
    ; convention and two DWORD arguments, passed below as (destination, source).
    str_cpy PROTO C :DWORD,:DWORD
;===================================================================================
    .data
        ; Test string: 127 characters plus the terminating zero = 128 bytes,
        ; which is exactly the size of the str2 destination buffer.
        str1 db "Sample String 01234 56789 ABCDEF AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoP",\
                "pQqRrSsTtUuVvWwXxYyZz Now I Know My ABC's, Won't You Come Play ",0
        str2 db  128 dup(0)
    .code
;===================================================================================
start:
;===================================================================================

    ; Sanity check: copy once and display the destination buffer,
    ; followed by two CR/LF pairs.
    invoke str_cpy, ADDR str2, ADDR str1
    print ADDR str2,13,10,13,10

    ; Pause before the timed runs.
    invoke Sleep, 4000

    ; Assembly-time repetition: everything up to ENDM is emitted three times.
    REPEAT 3

        ; presumably 1000 timing iterations at high process priority - see timers.asm
        counter_begin 1000, HIGH_PRIORITY_CLASS
            ; arguments are (str1, str2) here, the reverse of the str_cpy call;
            ; presumably szCopy takes the source first - verify against its prototype
            invoke szCopy, ADDR str1, ADDR str2
        counter_end
        ; the value left in eax after counter_end is printed as the cycle count
        print str$(eax)," cycles, szCopy",13,10

        counter_begin 1000, HIGH_PRIORITY_CLASS
            invoke str_cpy, ADDR str2, ADDR str1
        counter_end
        print str$(eax)," cycles, str_cpy",13,10

    ENDM

    inkey "Press any key to exit..."
    exit

;===================================================================================
end start

And got these results running on a P3:

Code Select


530 cycles, szCopy
524 cycles, str_cpy
529 cycles, szCopy
524 cycles, str_cpy
529 cycles, szCopy
524 cycles, str_cpy

I used the test string from here, so the results can be compared with the tests in that thread. I just tested against the MASM32 szCopy function, but there are assembly versions in the linked thread that are considerably faster.

ecube · August 09, 2010, 09:38:14 AM

Yeah, that's not beating anything... As I said in other threads, optimized compiling is a joke.

Twister · August 09, 2010, 04:23:35 PM

E^cube,

I can't seem to find your topic on optimized compiling.

MichaelW, thank you for posting the results. I am curious: what compiler did you use? :bg

MichaelW · August 09, 2010, 05:14:32 PM

I used Visual C++ Toolkit 2003.

I became curious about how the CRT strcpy function was coded. After searching the PSDK I could not find any source or define for it. So I added it to my test app, along with code to verify that it is not comparing the buffers and returning without performing the copy if they are identical.

Code Select


; Benchmark harness, extended version: first checks that str_cpy and crt_strcpy
; each really fill str2 (the buffer is zeroed between the two checks), then
; times RtlZeroMemory alone and together with crt_strcpy, and finally times
; crt_strcpy, szCopy and str_cpy on the same string, three times over.
;===================================================================================
    include \masm32\include\masm32rt.inc
    .686
    ; presumably the source of the counter_begin / counter_end macros used below
    include \masm32\macros\timers.asm

    ; str_cpy is not defined in this file; it is declared with the C calling
    ; convention and two DWORD arguments, passed below as (destination, source).
    str_cpy PROTO C :DWORD,:DWORD
;===================================================================================
    .data
        ; Test string: 127 characters plus the terminating zero = 128 bytes,
        ; which is exactly the size of the str2 destination buffer.
        str1 db "Sample String 01234 56789 ABCDEF AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoP",\
                "pQqRrSsTtUuVvWwXxYyZz Now I Know My ABC's, Won't You Come Play ",0
        str2 db  128 dup(0)
    .code
;===================================================================================
start:
;===================================================================================

    ; Sanity check for str_cpy: copy once and display the destination buffer.
    invoke str_cpy, ADDR str2, ADDR str1
    print ADDR str2,13,10,13,10

    ; Clear all 128 bytes of str2 so the next print can only show text that
    ; crt_strcpy itself wrote.
    invoke RtlZeroMemory, ADDR str2, 128

    invoke crt_strcpy, ADDR str2, ADDR str1
    print ADDR str2,13,10,13,10

    invoke RtlZeroMemory, ADDR str2, 128

    ; Pause before the timed runs.
    invoke Sleep, 4000

    ; Baseline: cost of clearing the buffer on its own.
    ; presumably 1000 timing iterations at high process priority - see timers.asm
    counter_begin 1000, HIGH_PRIORITY_CLASS
        invoke RtlZeroMemory, ADDR str2, 128
    counter_end
    ; the value left in eax after counter_end is printed as the cycle count
    print str$(eax)," cycles, RtlZeroMemory",13,10

    ; Copy followed by a clear, timed together, so every timed copy writes into
    ; a zeroed buffer; compare against the baseline above to estimate the copy.
    counter_begin 1000, HIGH_PRIORITY_CLASS
        invoke crt_strcpy, ADDR str2, ADDR str1
        invoke RtlZeroMemory, ADDR str2, 128
    counter_end
    print str$(eax)," cycles, RtlZeroMemory + crt_strcpy",13,10,13,10

    invoke Sleep, 4000

    ; Assembly-time repetition: everything up to ENDM is emitted three times.
    REPEAT 3

        counter_begin 1000, HIGH_PRIORITY_CLASS
            invoke crt_strcpy, ADDR str2, ADDR str1
        counter_end
        print str$(eax)," cycles, crt_strcpy",13,10

        counter_begin 1000, HIGH_PRIORITY_CLASS
            ; arguments are (str1, str2) here, the reverse of the other two calls;
            ; presumably szCopy takes the source first - verify against its prototype
            invoke szCopy, ADDR str1, ADDR str2
        counter_end
        print str$(eax)," cycles, szCopy",13,10

        counter_begin 1000, HIGH_PRIORITY_CLASS
            invoke str_cpy, ADDR str2, ADDR str1
        counter_end
        print str$(eax)," cycles, str_cpy",13,10

    ENDM

    inkey "Press any key to exit..."
    exit

;===================================================================================
end start

Code Select


98 cycles, RtlZeroMemory
321 cycles, RtlZeroMemory + crt_strcpy

219 cycles, crt_strcpy
529 cycles, szCopy
524 cycles, str_cpy
218 cycles, crt_strcpy
529 cycles, szCopy
524 cycles, str_cpy
218 cycles, crt_strcpy
529 cycles, szCopy
524 cycles, str_cpy

Considering that I called it from assembly code it could not have been the inline form that the compiler would use. However it is implemented, it clearly has been heavily optimized.

Then again, I didn't compare it to the fastest assembly version in the thread that I linked, so maybe I should have left off the "heavily".

ecube · August 09, 2010, 07:42:38 PM

See my first post in http://www.masm32.com/board/index.php?topic=14510.0 — I also give a link in there to an entire thread focusing on "optimized compiling".

News:

strcpy test

Twister

Twister

hutch--

MichaelW

ecube

Twister

MichaelW

ecube