Why does C optimize this way?

thomas_remkus · June 28, 2005, 06:27:40 PM

I am expanding some of my testing/knowledge-growth by working with a little inline __asm in Visual Studio. It seems that my first 'for' loop has been optimized out, but I can't really see how this can be effective while still producing the same results. Here is the C code:

Code Select

#include "stdafx.h"
#include <stdlib.h>
#include "windows.h"

int addNumbers1(int a, int b)   // plain-C baseline for the timing test: returns the sum of a and b
{
    return a + b;
}

int addNumbers2(int a, int b)   // same sum as addNumbers1, but computed in an MSVC inline __asm block
{
    __asm
    {
        mov eax, a      // eax = a
        mov ecx, b      // ecx = b
        add eax, ecx    // eax = a + b
    }
}   // no C return statement: the sum is left in eax, which serves as the int return value (NOTE(review): compiler/ABI-specific -- confirm for the target toolchain)

int _tmain(int argc, _TCHAR* argv[])   // times the same "1 + ul" addition three ways (C call, __asm function call, __asm in the loop body) and prints the elapsed GetTickCount() difference for each; argc/argv are unused
{
    unsigned long ul = 0;                       // loop counter shared by all three timed loops
    DWORD dw = 0;                               // GetTickCount() value captured just before each timed loop
    unsigned long loop = GetTickCount() * 20;   // iteration count: the current tick count multiplied by 20
    unsigned long results = 0;                  // assigned inside every loop but never read afterwards
    
    printf("loop is: %u\r\n\r\n", loop);        // NOTE(review): %u is paired with an unsigned long here (and with the DWORD differences below); %lu is the matching specifier
    printf("start 1: \r\n");
    dw = GetTickCount();

    for (ul = 0; ul < loop; ++ul)               // test 1: plain C function
        results = addNumbers1(1, ul);
    
    printf("finis 1: %u\r\n", GetTickCount() - dw);   // elapsed ticks for test 1
    ul = 0;                                     // redundant: the next for statement resets ul itself
    printf("start 2: \r\n");
    dw = GetTickCount();
    
    for (ul = 0; ul < loop; ++ul)               // test 2: function implemented with inline __asm
        results = addNumbers2(1, ul);
    
    printf("finis 2: %u\r\n", GetTickCount() - dw);   // elapsed ticks for test 2
    ul = 0;                                     // redundant: the next for statement resets ul itself
    printf("start 3: \r\n");
    dw = GetTickCount();
    
    for (ul = 0; ul < loop; ++ul)               // test 3: the __asm block below is the loop body, with no function call
    __asm 
    {   
        mov eax, 1          // eax = 1
        mov ecx, ul         // ecx = current loop counter
        add eax, ecx        // eax = 1 + ul
        mov results, eax    // store the sum into the C variable results
    }
    
    printf("finis 3: %u\r\n", GetTickCount() - dw);   // elapsed ticks for test 3
    system("pause");                            // runs the shell command "pause" before returning

	return 0;
}

Now, here is the 'cod' file that was produced. Note that the first loop seems to have been optimized out. Is that true? What the heck is it doing here?

Code Select


; Listing generated by Microsoft (R) Optimizing Compiler Version 13.10.3077 
	.386P
    .model FLAT

INCLUDELIB LIBC
INCLUDELIB OLDNAMES

PUBLIC	?addNumbers1@@YAHHH@Z				; addNumbers1
_TEXT	SEGMENT
_a$ = 8							; size = 4
_b$ = 12						; size = 4
?addNumbers1@@YAHHH@Z PROC NEAR				; addNumbers1, COMDAT

; 7    :     return a + b;

  00000	8b 44 24 08	 mov	 eax, DWORD PTR _b$[esp-4]
  00004	8b 4c 24 04	 mov	 ecx, DWORD PTR _a$[esp-4]
  00008	03 c1		 add	 eax, ecx

; 8    : }

  0000a	c3		 ret	 0
?addNumbers1@@YAHHH@Z ENDP				; addNumbers1
_TEXT	ENDS
PUBLIC	?addNumbers2@@YAHHH@Z				; addNumbers2
_TEXT	SEGMENT
_a$ = 8							; size = 4
_b$ = 12						; size = 4
?addNumbers2@@YAHHH@Z PROC NEAR				; addNumbers2, COMDAT

; 12   :     __asm
; 13   :     {
; 14   :         mov eax, a

  00000	8b 44 24 04	 mov	 eax, DWORD PTR _a$[esp-4]

; 15   :         mov ecx, b

  00004	8b 4c 24 08	 mov	 ecx, DWORD PTR _b$[esp-4]

; 16   :         add eax, ecx

  00008	03 c1		 add	 eax, ecx

; 17   :     }
; 18   : }

  0000a	c3		 ret	 0
?addNumbers2@@YAHHH@Z ENDP				; addNumbers2
_TEXT	ENDS
PUBLIC	_main
;	COMDAT _main
_TEXT	SEGMENT
_ul$ = -8						; size = 4
_results$ = -4						; size = 4
_argc$ = 8						; size = 4
_argv$ = 12						; size = 4
_main	PROC NEAR					; COMDAT

; 21   : {

  00000	83 ec 08	 sub	 esp, 8
  00003	53		 push	 ebx
  00004	56		 push	 esi
  00005	57		 push	 edi

; 22   :     unsigned long ul = 0;
; 23   :     DWORD dw = 0;
; 24   :     unsigned long loop = GetTickCount() * 20;

  00006	8b 3d 00 00 00
	00		 mov	 edi, DWORD PTR __imp__GetTickCount@0
  0000c	ff d7		 call	 edi
  0000e	8d 34 80	 lea	 esi, DWORD PTR [eax+eax*4]
  00011	c1 e6 02	 shl	 esi, 2

; 25   :     unsigned long results = 0;
; 26   :     
; 27   :     printf("loop is: %u\r\n\r\n", loop);

  00014	56		 push	 esi
  00015	68 00 00 00 00	 push	 OFFSET FLAT:??_C@_0BA@DEODJLEB@loop?5is?3?5?$CFu?$AN?6?$AN?6?$AA@
  0001a	e8 00 00 00 00	 call	 _printf

; 28   :     printf("start 1: \r\n");

  0001f	68 00 00 00 00	 push	 OFFSET FLAT:??_C@_0M@OIGAMJMG@start?51?3?5?$AN?6?$AA@
  00024	e8 00 00 00 00	 call	 _printf
  00029	83 c4 0c	 add	 esp, 12			; 0000000cH

; 29   :     dw = GetTickCount();

  0002c	ff d7		 call	 edi
  0002e	8b d8		 mov	 ebx, eax

; 30   : 
; 31   :     for (ul = 0; ul < loop; ++ul)
; 32   :         results = addNumbers1(1, ul);
; 33   :     
; 34   :     printf("finis 1: %u\r\n", GetTickCount() - dw);

  00030	ff d7		 call	 edi
  00032	2b c3		 sub	 eax, ebx
  00034	50		 push	 eax
  00035	68 00 00 00 00	 push	 OFFSET FLAT:??_C@_0O@EDGHBOEI@finis?51?3?5?$CFu?$AN?6?$AA@
  0003a	e8 00 00 00 00	 call	 _printf

; 35   :     ul = 0;    
; 36   :     printf("start 2: \r\n");

  0003f	68 00 00 00 00	 push	 OFFSET FLAT:??_C@_0M@GOPELLGI@start?52?3?5?$AN?6?$AA@
  00044	e8 00 00 00 00	 call	 _printf
  00049	83 c4 0c	 add	 esp, 12			; 0000000cH

; 37   :     dw = GetTickCount();

  0004c	ff d7		 call	 edi

; 38   :     
; 39   :     for (ul = 0; ul < loop; ++ul)

  0004e	33 d2		 xor	 edx, edx
  00050	85 f6		 test	 esi, esi
  00052	8b d8		 mov	 ebx, eax
  00054	89 54 24 0c	 mov	 DWORD PTR _ul$[esp+20], edx
  00058	76 1a		 jbe	 SHORT $L63850
  0005a	8d 9b 00 00 00
	00		 npad	 6
$L63848:

; 40   :         results = addNumbers2(1, ul);

  00060	b8 01 00 00 00	 mov	 eax, 1
  00065	8b 4c 24 0c	 mov	 ecx, DWORD PTR _ul$[esp+20]
  00069	03 c1		 add	 eax, ecx
  0006b	42		 inc	 edx
  0006c	3b d6		 cmp	 edx, esi
  0006e	89 54 24 0c	 mov	 DWORD PTR _ul$[esp+20], edx
  00072	72 ec		 jb	 SHORT $L63848
$L63850:

; 41   :     
; 42   :     printf("finis 2: %u\r\n", GetTickCount() - dw);

  00074	ff d7		 call	 edi
  00076	2b c3		 sub	 eax, ebx
  00078	50		 push	 eax
  00079	68 00 00 00 00	 push	 OFFSET FLAT:??_C@_0O@MNOIBJKL@finis?52?3?5?$CFu?$AN?6?$AA@
  0007e	e8 00 00 00 00	 call	 _printf

; 43   :     ul = 0;    
; 44   :     printf("start 3: \r\n");

  00083	68 00 00 00 00	 push	 OFFSET FLAT:??_C@_0M@KFKIGIMN@start?53?3?5?$AN?6?$AA@
  00088	e8 00 00 00 00	 call	 _printf
  0008d	83 c4 0c	 add	 esp, 12			; 0000000cH

; 45   :     dw = GetTickCount();

  00090	ff d7		 call	 edi

; 46   :     
; 47   :     for (ul = 0; ul < loop; ++ul)

  00092	33 d2		 xor	 edx, edx
  00094	85 f6		 test	 esi, esi
  00096	8b d8		 mov	 ebx, eax
  00098	89 54 24 0c	 mov	 DWORD PTR _ul$[esp+20], edx
  0009c	76 1a		 jbe	 SHORT $L63855
  0009e	8b ff		 npad	 2
$L63853:

; 49   :     {   
; 50   :         mov eax, 1

  000a0	b8 01 00 00 00	 mov	 eax, 1

; 51   :         mov ecx, ul

  000a5	8b 4c 24 0c	 mov	 ecx, DWORD PTR _ul$[esp+20]

; 52   :         add eax, ecx

  000a9	03 c1		 add	 eax, ecx

; 53   :         mov results, eax

  000ab	89 44 24 10	 mov	 DWORD PTR _results$[esp+20], eax

; 48   :     __asm 

  000af	42		 inc	 edx
  000b0	3b d6		 cmp	 edx, esi
  000b2	89 54 24 0c	 mov	 DWORD PTR _ul$[esp+20], edx
  000b6	72 e8		 jb	 SHORT $L63853
$L63855:

; 54   :     }
; 55   :     
; 56   :     printf("finis 3: %u\r\n", GetTickCount() - dw);

  000b8	ff d7		 call	 edi
  000ba	2b c3		 sub	 eax, ebx
  000bc	50		 push	 eax
  000bd	68 00 00 00 00	 push	 OFFSET FLAT:??_C@_0O@BECBJDF@finis?53?3?5?$CFu?$AN?6?$AA@
  000c2	e8 00 00 00 00	 call	 _printf

; 57   :     system("pause");

  000c7	68 00 00 00 00	 push	 OFFSET FLAT:??_C@_05PDJBBECF@pause?$AA@
  000cc	e8 00 00 00 00	 call	 _system
  000d1	83 c4 0c	 add	 esp, 12			; 0000000cH
  000d4	5f		 pop	 edi
  000d5	5e		 pop	 esi

; 58   : 
; 59   : 	return 0;

  000d6	33 c0		 xor	 eax, eax
  000d8	5b		 pop	 ebx

; 60   : }

  000d9	83 c4 08	 add	 esp, 8
  000dc	c3		 ret	 0
_main	ENDP
_TEXT	ENDS
END

Jibz · June 28, 2005, 06:42:21 PM

Yes, it is true.

Since the compiler can see that the loop has no side effects, and that the result of the computation is never used, it removes the 'dead' code :U.

If you compile without optimizations turned on, or print the result after the loop, the loop code is generated.

hutch-- · June 29, 2005, 12:45:46 AM

I forget the terminology, but TENKEY has used it before, and it's part of the design of modern compilers to remove redundant code. My only real complaint with modern compiler-generated code is the use of an early-90s RISC technique of placing function addresses in registers before calling them, which involves redundant loads and stores and reduces the number of registers available for code optimisation.

There is a lot of decent optimisation practice available to you if you build the module with ALL of the optimisation turned off and then manually optimise it yourself. By doing this you retain the logic layout of your C source and have more registers to play with while optimising the code. You will get two benefits here: first, it will show you where your C source is inefficient so you can write better C source, and second, where you need it, you can manually optimise the generated code to get some performance improvements over the optimised C output.

It's not an easy task, but it's a useful one for you to learn.

thomas_remkus · June 29, 2005, 03:34:48 AM

Does the C compiler then not know how to optimize the __asm code? I presume that if it just plops it in there with no checks, no warnings, then that too could be an advantage I've never taken hold of. Normally I need to do indices holders to pointers in arrays with #pragma warnings disabled. It would be nice to work up a small macro that will do this in __asm.

Are my assumptions right? No checks, no optimizations, no warnings?

thomas

hutch-- · June 29, 2005, 03:59:55 AM

It's usually good practice NOT to use inline asm in C code, as it messes up the internal optimisation for the C code. The compiler should leave your inline asm code unmodified, but it may have to preserve some registers if your register usage is different from its internal usage. It is a better proposition to write a module in MASM and link it into the C code; that way you don't mess up the internal optimisation at all.

Jibz · June 29, 2005, 07:33:01 AM

Many of the optimizations, like the removal of the loop here, are most likely happening before the code generation, so the compiler cannot take your inline asm code into account there.

Moving inline asm code to a separate module also has the benefit that your C source files are more portable (even across Windows C compilers there are differences in inline asm syntax) :U.

News:

Why does C optimize this way?

thomas_remkus

Jibz

hutch--

thomas_remkus

hutch--

Jibz