News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Why does C optimize this way?

Started by thomas_remkus, June 28, 2005, 06:27:40 PM

Previous topic - Next topic

thomas_remkus

I am expanding some of my testing/knowledge-growth by working with a little inline __asm in Visual Studio. It seems that my 'for' loop has been optimized out, but I can't really see how the compiler can do that and still produce the same results. Here is the C code:

#include "stdafx.h"
#include <stdlib.h>
#include "windows.h"

/* Plain-C version of the addition being timed: returns a + b. */
int addNumbers1(int a, int b)
{
    return a + b;
}

/*
 * Inline-assembly version of the same addition: loads a into EAX and b into
 * ECX, then adds ECX into EAX.
 *
 * There is no C return statement; the sum is simply left in EAX when the
 * __asm block ends.
 * NOTE(review): this relies on the compiler using EAX as the int return
 * value (the MSVC convention) -- confirm before building with another
 * toolchain.
 */
int addNumbers2(int a, int b)
{
    __asm
    {
        mov eax, a
        mov ecx, b
        add eax, ecx
    }
}

/*
 * Timing harness: runs the same "1 + ul" addition `loop` times in three
 * ways -- a call to addNumbers1, a call to addNumbers2, and an inline __asm
 * block -- and prints the GetTickCount() difference measured around each
 * loop. argc and argv are not used. Always returns 0.
 */
int _tmain(int argc, _TCHAR* argv[])
{
    unsigned long ul = 0;
    DWORD dw = 0;
    /* Iteration count is derived from the current tick count, so it differs
       from run to run. NOTE(review): the unsigned multiply by 20 can wrap,
       so the count is not simply "20x the tick count" -- confirm intent. */
    unsigned long loop = GetTickCount() * 20;
    unsigned long results = 0;
   
    /* NOTE(review): `loop` is unsigned long but is printed with %u; %lu is
       the matching specifier. */
    printf("loop is: %u\r\n\r\n", loop);
    printf("start 1: \r\n");
    dw = GetTickCount();

    /* Test 1: plain C function. `results` is overwritten on every pass and
       is never read afterwards in this function, so an optimizing compiler
       may treat the whole loop as dead code and remove it. */
    for (ul = 0; ul < loop; ++ul)
        results = addNumbers1(1, ul);
   
    printf("finis 1: %u\r\n", GetTickCount() - dw);
    ul = 0;   
    printf("start 2: \r\n");
    dw = GetTickCount();
   
    /* Test 2: same loop, but calling the inline-assembly function. */
    for (ul = 0; ul < loop; ++ul)
        results = addNumbers2(1, ul);
   
    printf("finis 2: %u\r\n", GetTickCount() - dw);
    ul = 0;   
    printf("start 3: \r\n");
    dw = GetTickCount();
   
    /* Test 3: the __asm block itself is the (unbraced) body of the for
       loop; it computes 1 + ul in EAX and stores it into `results`. */
    for (ul = 0; ul < loop; ++ul)
    __asm
    {   
        mov eax, 1
        mov ecx, ul
        add eax, ecx
        mov results, eax
    }
   
    printf("finis 3: %u\r\n", GetTickCount() - dw);
    /* Hand off to the shell's "pause" command before exiting. */
    system("pause");

return 0;
}


Now, here is the 'cod' file that was produced. Note that the first loop seems to be optimized out. Is that true? What the heck is it doing here?


; Listing generated by Microsoft (R) Optimizing Compiler Version 13.10.3077
.386P
    .model FLAT

INCLUDELIB LIBC
INCLUDELIB OLDNAMES

PUBLIC ?addNumbers1@@YAHHH@Z ; addNumbers1
_TEXT SEGMENT
_a$ = 8 ; size = 4
_b$ = 12 ; size = 4
?addNumbers1@@YAHHH@Z PROC NEAR ; addNumbers1, COMDAT

; 7    :     return a + b;

  00000 8b 44 24 08 mov eax, DWORD PTR _b$[esp-4]
  00004 8b 4c 24 04 mov ecx, DWORD PTR _a$[esp-4]
  00008 03 c1 add eax, ecx

; 8    : }

  0000a c3 ret 0
?addNumbers1@@YAHHH@Z ENDP ; addNumbers1
_TEXT ENDS
PUBLIC ?addNumbers2@@YAHHH@Z ; addNumbers2
_TEXT SEGMENT
_a$ = 8 ; size = 4
_b$ = 12 ; size = 4
?addNumbers2@@YAHHH@Z PROC NEAR ; addNumbers2, COMDAT

; 12   :     __asm
; 13   :     {
; 14   :         mov eax, a

  00000 8b 44 24 04 mov eax, DWORD PTR _a$[esp-4]

; 15   :         mov ecx, b

  00004 8b 4c 24 08 mov ecx, DWORD PTR _b$[esp-4]

; 16   :         add eax, ecx

  00008 03 c1 add eax, ecx

; 17   :     }
; 18   : }

  0000a c3 ret 0
?addNumbers2@@YAHHH@Z ENDP ; addNumbers2
_TEXT ENDS
PUBLIC _main
; COMDAT _main
_TEXT SEGMENT
_ul$ = -8 ; size = 4
_results$ = -4 ; size = 4
_argc$ = 8 ; size = 4
_argv$ = 12 ; size = 4
_main PROC NEAR ; COMDAT

; 21   : {

  00000 83 ec 08 sub esp, 8
  00003 53 push ebx
  00004 56 push esi
  00005 57 push edi

; 22   :     unsigned long ul = 0;
; 23   :     DWORD dw = 0;
; 24   :     unsigned long loop = GetTickCount() * 20;

  00006 8b 3d 00 00 00
00 mov edi, DWORD PTR __imp__GetTickCount@0
  0000c ff d7 call edi
  0000e 8d 34 80 lea esi, DWORD PTR [eax+eax*4]
  00011 c1 e6 02 shl esi, 2

; 25   :     unsigned long results = 0;
; 26   :     
; 27   :     printf("loop is: %u\r\n\r\n", loop);

  00014 56 push esi
  00015 68 00 00 00 00 push OFFSET FLAT:??_C@_0BA@DEODJLEB@loop?5is?3?5?$CFu?$AN?6?$AN?6?$AA@
  0001a e8 00 00 00 00 call _printf

; 28   :     printf("start 1: \r\n");

  0001f 68 00 00 00 00 push OFFSET FLAT:??_C@_0M@OIGAMJMG@start?51?3?5?$AN?6?$AA@
  00024 e8 00 00 00 00 call _printf
  00029 83 c4 0c add esp, 12 ; 0000000cH

; 29   :     dw = GetTickCount();

  0002c ff d7 call edi
  0002e 8b d8 mov ebx, eax

; 30   :
; 31   :     for (ul = 0; ul < loop; ++ul)
; 32   :         results = addNumbers1(1, ul);
; 33   :     
; 34   :     printf("finis 1: %u\r\n", GetTickCount() - dw);

  00030 ff d7 call edi
  00032 2b c3 sub eax, ebx
  00034 50 push eax
  00035 68 00 00 00 00 push OFFSET FLAT:??_C@_0O@EDGHBOEI@finis?51?3?5?$CFu?$AN?6?$AA@
  0003a e8 00 00 00 00 call _printf

; 35   :     ul = 0;   
; 36   :     printf("start 2: \r\n");

  0003f 68 00 00 00 00 push OFFSET FLAT:??_C@_0M@GOPELLGI@start?52?3?5?$AN?6?$AA@
  00044 e8 00 00 00 00 call _printf
  00049 83 c4 0c add esp, 12 ; 0000000cH

; 37   :     dw = GetTickCount();

  0004c ff d7 call edi

; 38   :     
; 39   :     for (ul = 0; ul < loop; ++ul)

  0004e 33 d2 xor edx, edx
  00050 85 f6 test esi, esi
  00052 8b d8 mov ebx, eax
  00054 89 54 24 0c mov DWORD PTR _ul$[esp+20], edx
  00058 76 1a jbe SHORT $L63850
  0005a 8d 9b 00 00 00
00 npad 6
$L63848:

; 40   :         results = addNumbers2(1, ul);

  00060 b8 01 00 00 00 mov eax, 1
  00065 8b 4c 24 0c mov ecx, DWORD PTR _ul$[esp+20]
  00069 03 c1 add eax, ecx
  0006b 42 inc edx
  0006c 3b d6 cmp edx, esi
  0006e 89 54 24 0c mov DWORD PTR _ul$[esp+20], edx
  00072 72 ec jb SHORT $L63848
$L63850:

; 41   :     
; 42   :     printf("finis 2: %u\r\n", GetTickCount() - dw);

  00074 ff d7 call edi
  00076 2b c3 sub eax, ebx
  00078 50 push eax
  00079 68 00 00 00 00 push OFFSET FLAT:??_C@_0O@MNOIBJKL@finis?52?3?5?$CFu?$AN?6?$AA@
  0007e e8 00 00 00 00 call _printf

; 43   :     ul = 0;   
; 44   :     printf("start 3: \r\n");

  00083 68 00 00 00 00 push OFFSET FLAT:??_C@_0M@KFKIGIMN@start?53?3?5?$AN?6?$AA@
  00088 e8 00 00 00 00 call _printf
  0008d 83 c4 0c add esp, 12 ; 0000000cH

; 45   :     dw = GetTickCount();

  00090 ff d7 call edi

; 46   :     
; 47   :     for (ul = 0; ul < loop; ++ul)

  00092 33 d2 xor edx, edx
  00094 85 f6 test esi, esi
  00096 8b d8 mov ebx, eax
  00098 89 54 24 0c mov DWORD PTR _ul$[esp+20], edx
  0009c 76 1a jbe SHORT $L63855
  0009e 8b ff npad 2
$L63853:

; 49   :     {   
; 50   :         mov eax, 1

  000a0 b8 01 00 00 00 mov eax, 1

; 51   :         mov ecx, ul

  000a5 8b 4c 24 0c mov ecx, DWORD PTR _ul$[esp+20]

; 52   :         add eax, ecx

  000a9 03 c1 add eax, ecx

; 53   :         mov results, eax

  000ab 89 44 24 10 mov DWORD PTR _results$[esp+20], eax

; 48   :     __asm

  000af 42 inc edx
  000b0 3b d6 cmp edx, esi
  000b2 89 54 24 0c mov DWORD PTR _ul$[esp+20], edx
  000b6 72 e8 jb SHORT $L63853
$L63855:

; 54   :     }
; 55   :     
; 56   :     printf("finis 3: %u\r\n", GetTickCount() - dw);

  000b8 ff d7 call edi
  000ba 2b c3 sub eax, ebx
  000bc 50 push eax
  000bd 68 00 00 00 00 push OFFSET FLAT:??_C@_0O@BECBJDF@finis?53?3?5?$CFu?$AN?6?$AA@
  000c2 e8 00 00 00 00 call _printf

; 57   :     system("pause");

  000c7 68 00 00 00 00 push OFFSET FLAT:??_C@_05PDJBBECF@pause?$AA@
  000cc e8 00 00 00 00 call _system
  000d1 83 c4 0c add esp, 12 ; 0000000cH
  000d4 5f pop edi
  000d5 5e pop esi

; 58   :
; 59   : return 0;

  000d6 33 c0 xor eax, eax
  000d8 5b pop ebx

; 60   : }

  000d9 83 c4 08 add esp, 8
  000dc c3 ret 0
_main ENDP
_TEXT ENDS
END

Jibz

Yes, it is true.

Since the compiler can see that the loop has no side effects, and that the result of the computation is never used, it removes the 'dead' code :U.

If you compile without optimizations turned on, or print the result after the loop, the loop code is generated.

hutch--

I forget the terminology, but TENKEY has used it before, and it's part of the design of modern compilers to remove redundant code. My only real complaint with modern compiler-generated code is the use of an early-90s RISC technique of placing function addresses in registers first before calling them, which involves redundant loads and stores and reduces the number of registers available for code optimisation.

There is a lot of decent optimisation practice available for you if you build the module with ALL of the optimisation turned off and then manually optimise it yourself. By doing this you retain the logic layout of your C source and have more registers to play with while optimising the code. You will get two benefits here: first, it will show you where your C source is inefficient so you can write better C source, and second, where you need it, you can manually optimise the generated code to get some performance improvements over the optimised C output.

It's not an easy task, but it's a useful one for you to learn.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

thomas_remkus

Does the C compiler then not know how to optimize the __asm code? I presume that if it just plops it in there with no checks, no warnings, then that too could be an advantage I've never taken hold of. Normally I need to do indices holders to pointers in arrays with #pragma warnings disabled. It would be nice to work up a small macro that will do this in __asm.

Are my assumptions right? No checks, no optimizations, no warnings?

thomas

hutch--

It's usually good practice NOT to use inline asm in C code, as it messes up the internal optimisation for the C code. It should leave your inline asm code unmodified, but it may have to preserve some registers if your register usage is different to the internal usage. It is a better proposition to write a module in MASM and link it into the C code; that way you don't mess up the internal optimisation at all.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

Jibz

Many of the optimizations, like the removal of the loop here, are most likely happening before the code generation, so the compiler cannot take your inline asm code into account there.

Moving inline asm code to a separate module also has the benefit that your c source files are more portable (even across Windows C compilers there are differences in inline asm syntax) :U.