News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Clock cycle count macros

Started by MichaelW, January 08, 2005, 06:07:21 AM

Previous topic - Next topic

MichaelW

I thought I would post this before I actually used it in anything so you guys could point out any glaring errors.

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

    .586                       ; create 32 bit code
    .model flat, stdcall       ; 32 bit memory model
    option casemap :none       ; case sensitive

    include \masm32\include\windows.inc
    include \masm32\include\masm32.inc
    include \masm32\include\kernel32.inc

    includelib \masm32\lib\masm32.lib
    includelib \masm32\lib\kernel32.lib

    include \masm32\macros\macros.asm

    ; ---------------------------------------------------------------------
    ; These two macros perform the grunt work involved in measuring the
    ; processor clock cycle count for a block of code. These macros must
    ; be used in pairs, and the block of code must be placed in between
    ; the clockctr_begin and clockctr_end macro calls. The clockctr_end
    ; macro returns the clock cycle count for a single pass through the
    ; block of code, corrected for the test loop overhead, in EAX.
    ; ---------------------------------------------------------------------
    clockctr_begin MACRO loopcount:REQ, priority
        LOCAL label
        IFNDEF __clockctr__stuff__defined__
          __clockctr__stuff__defined__ equ <1>
          .data
            __clockctr__loop__count__   dd 0
            __clockctr__loop__counter__ dd 0
            __clockctr__qword__count__  dq 0
          .code           
        ENDIF
        mov __clockctr__loop__count__, loopcount
        IFNB <priority>   
          invoke GetCurrentProcess
          invoke SetPriorityClass, eax, priority
        ENDIF
        xor   eax, eax        ;; Use same CPUID input value for each call
        cpuid                 ;; Flush pipe and wait for pending ops to finish
        rdtsc                 ;; Read Time Stamp Counter
        push  edx             ;; Preserve high-order 32 bits of start count
        push  eax             ;; Preserve low-order 32 bits of start count

        mov   __clockctr__loop__counter__, loopcount
      @@:                     
        sub   __clockctr__loop__counter__, 1
        ;; Empty reference loop
        jnz   @B

        xor   eax, eax
        cpuid
        rdtsc
        pop   ecx             ;; Recover low-order 32 bits of start count
        sub   eax, ecx        ;; Low-order 32 bits of overhead count in EAX
        pop   ecx             ;; Recover high-order 32 bits of start count
        sbb   edx, ecx        ;; High-order 32 bits of overhead count in EDX
        push  edx             ;; Preserve high-order 32 bits of overhead count
        push  eax             ;; Preserve low-order 32 bits of overhead count
        mov   eax, loopcount  ;; Set up test loop
        mov   __clockctr__loop__counter__, eax
        xor   eax, eax
        cpuid
        rdtsc
        push  edx             ;; Preserve high-order 32 bits of start count
        push  eax             ;; Preserve low-order 32 bits of start count
      label:
        __clockctr__loop__label__ equ <label>
    ENDM

    clockctr_end MACRO
        sub   __clockctr__loop__counter__, 1
        jnz   __clockctr__loop__label__
        xor   eax, eax
        cpuid
        rdtsc
        push  eax
        push  edx
        invoke GetCurrentProcess
        invoke SetPriorityClass,eax,NORMAL_PRIORITY_CLASS
        pop   edx
        pop   eax
        pop   ecx             ;; Recover low-order 32 bits of start count
        sub   eax, ecx        ;; Low-order 32 bits of test count in EAX
        pop   ecx             ;; Recover high-order 32 bits of start count
        sbb   edx, ecx        ;; High-order 32 bits of test count in EDX
        pop   ecx             ;; Recover low-order 32 bits of overhead count
        sub   eax, ecx        ;; Low-order 32 bits of adjusted count in EAX
        pop   ecx             ;; Recover high-order 32 bits of overhead count
        sbb   edx, ecx        ;; High-order 32 bits of adjusted count in EDX

        mov   DWORD PTR __clockctr__qword__count__, eax
        mov   DWORD PTR __clockctr__qword__count__ + 4, edx

        finit
        fild  __clockctr__qword__count__
        fild  __clockctr__loop__count__
        fdiv
        fistp __clockctr__qword__count__
        mov   eax, DWORD PTR __clockctr__qword__count__
    ENDM

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
    .data
    .code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

    LOOPCOUNT equ 5000000

    clockctr_begin LOOPCOUNT, HIGH_PRIORITY_CLASS
      nop
      nop
    clockctr_end
    print ustr$(eax)
    print chr$(13,10)
   
    clockctr_begin LOOPCOUNT, HIGH_PRIORITY_CLASS
      REPEAT 2
        mov eax, 123
        mov edx, 456
        mov ecx, 789
        add edx, eax
        sub ecx, eax
        mul edx
      ENDM
    clockctr_end
    print ustr$(eax)
    print chr$(13,10)

    clockctr_begin LOOPCOUNT, HIGH_PRIORITY_CLASS
      REPEAT 4
        mov eax, 123
        mov edx, 456
        mov ecx, 789
        add edx, eax
        sub ecx, eax
        mul edx
      ENDM
    clockctr_end
    print ustr$(eax)
    print chr$(13,10)
   
    clockctr_begin LOOPCOUNT, HIGH_PRIORITY_CLASS
      REPEAT 8
        mov eax, 123
        mov edx, 456
        mov ecx, 789
        add edx, eax
        sub ecx, eax
        mul edx
      ENDM
    clockctr_end
    print ustr$(eax)
    print chr$(13,10)

    clockctr_begin LOOPCOUNT, HIGH_PRIORITY_CLASS
      REPEAT 16
        mov eax, 123
        mov edx, 456
        mov ecx, 789
        add edx, eax
        sub ecx, eax
        mul edx
      ENDM
    clockctr_end
    print ustr$(eax)
    print chr$(13,10)

    clockctr_begin LOOPCOUNT, HIGH_PRIORITY_CLASS
      REPEAT 32
        mov eax, 123
        mov edx, 456
        mov ecx, 789
        add edx, eax
        sub ecx, eax
        mul edx
      ENDM
    clockctr_end
    print ustr$(eax)
    print chr$(13,10)

    mov eax,input(13,10,"Press enter to exit...",13,10)
    exit
end start

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««


Playing around with REALTIME_PRIORITY_CLASS brought Windows 2000 down for the first time since I installed it.


[attachment deleted by admin]
eschew obfuscation

Vortex

Hi MichaelW,

Nice work :U

Here are my results on my PIV 2.66 GHz

-1
11
36
85
180
375

MichaelW

#2
Hi Vortex,

Thanks. So you don't see any serious problems?

My results on a P3:

1
6
15
28
61
134


Any idea why the clock counts don't scale linearly with the increasing repeat counts?
eschew obfuscation

hutch--

Michael,

It tends to be a couple of things: data in the cache, which gets slower as the data size exceeds the cache size, and processor priority. It is common to see small code go faster than big code.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

roticv

On my Celeron 2.4ghz,

I get the following results:
0
5
10
26
57
116


Weird, the code seems to run faster on my processor than on the P4, which is supposed to be faster than mine.

TOTGEBOREN

On my Athlon XP-M 2400+

I get the following results:

Code:
2
5
12
24
52
102

Very strange results, don't you think so?

Vortex

Hi MichaelW,

Will you release a new version of your macro set?

MichaelW

Hi Vortex,

I will, but I have several other things to finish first.
eschew obfuscation

Vortex

Hi MichaelW,

That's O.K, thanks :U

chetnik

AMD Athlon 64 2800+

2
2
10
26
59
125

:U :U

Mark_Larson

 Great stuff Michael :)  It might not be a big deal, but you can shave off 4 instructions from the END version of the macro.  You push EAX and EDX before calling SetPriorityClass.  Just move SetPriorityClass to the last thing so you don't have to PUSH and POP EAX and EDX.


    ; ---------------------------------------------------------------------
    ; clockctr_end (variant with the priority restore moved later)
    ;   Closes the loop opened by clockctr_begin and returns in EAX the
    ;   cycle count for one pass: (test count - overhead count) / loop count.
    ;   The adjusted count is written to memory before the API calls, so
    ;   EAX and EDX are no longer pushed and popped around them. The
    ;   commented-out lines show where the priority restore used to be.
    ;   Pops the four dwords left on the stack by clockctr_begin and
    ;   reinitialises the FPU with FINIT.
    ; ---------------------------------------------------------------------
    clockctr_end MACRO
        sub   __clockctr__loop__counter__, 1   ;; Bottom of the timed loop
        jnz   __clockctr__loop__label__        ;; Label published by clockctr_begin
        xor   eax, eax        ;; Same CPUID input value as in clockctr_begin
        cpuid                 ;; Serialise before reading the counter
        rdtsc                 ;; Stop count in EDX:EAX
;;;;;;;;push  eax
;;;;;;;;push  edx
;;;;;;;;invoke GetCurrentProcess
;;;;;;;;invoke SetPriorityClass,eax,NORMAL_PRIORITY_CLASS
;;;;;;;;pop   edx
;;;;;;;;pop   eax
        pop   ecx             ;; Recover low-order 32 bits of start count
        sub   eax, ecx        ;; Low-order 32 bits of test count in EAX
        pop   ecx             ;; Recover high-order 32 bits of start count
        sbb   edx, ecx        ;; High-order 32 bits of test count in EDX
        pop   ecx             ;; Recover low-order 32 bits of overhead count
        sub   eax, ecx        ;; Low-order 32 bits of adjusted count in EAX
        pop   ecx             ;; Recover high-order 32 bits of overhead count
        sbb   edx, ecx        ;; High-order 32 bits of adjusted count in EDX

        ;; Park the adjusted count in memory before the API calls below
        mov   DWORD PTR __clockctr__qword__count__, eax
        mov   DWORD PTR __clockctr__qword__count__ + 4, edx

        ;; Priority restore, relocated from just after RDTSC
        invoke GetCurrentProcess
        invoke SetPriorityClass,eax,NORMAL_PRIORITY_CLASS


        ;; Divide the 64-bit adjusted count by the loop count on the FPU
        finit
        fild  __clockctr__qword__count__
        fild  __clockctr__loop__count__
        fdiv
        fistp __clockctr__qword__count__
        mov   eax, DWORD PTR __clockctr__qword__count__   ;; Return the low dword
    ENDM

BIOS programmers do it fastest, hehe.  ;)

My Optimization webpage
http://www.website.masmforum.com/mark/index.htm

Mark_Larson

  This is important.  I totally forgot about this until I looked at my timing code.  You need an EMMS instruction before using any of the floating point stuff.  I don't put EMMS in my routines anymore since it's so slow ( it's an optimization to not have it in every routine).  So I put it before I use floating point.


       mov   DWORD PTR __clockctr__qword__count__, eax
        mov   DWORD PTR __clockctr__qword__count__ + 4, edx

        invoke GetCurrentProcess
        invoke SetPriorityClass,eax,NORMAL_PRIORITY_CLASS

       EMMS

        finit
        fild  __clockctr__qword__count__
        fild  __clockctr__loop__count__
        fdiv
        fistp __clockctr__qword__count__
        mov   eax, DWORD PTR __clockctr__qword__count__
    ENDM
BIOS programmers do it fastest, hehe.  ;)

My Optimization webpage
http://www.website.masmforum.com/mark/index.htm

MichaelW

Thanks Mark,

When I placed the code to restore the priority, in the back of my mind I was thinking ASAP, but what difference could a few more clocks make. To avoid making the macros dependent on the MMX instructions being supported by the processor and enabled in MASM, I include EMMS only if _EMMS has been defined. I tried to use @Cpu for this, but at least for MASM 6.14 the return value is not affected by .MMX or .XMM. A new version of the macros with these and several other minor changes:

  ; ---------------------------------------------------------------------
  ; These two macros perform the grunt work involved in measuring the
  ; processor clock cycle count for a block of code. These macros must
  ; be used in pairs, and the block of code must be placed in between
  ; the clockctr_begin and clockctr_end macro calls. The clockctr_end
  ; macro returns the clock cycle count for a single pass through the
  ; block of code, corrected for the test loop overhead, in EAX.
  ;
  ; The loopcount parameter should be set to a relatively high value
  ; to produce repeatable results.
  ;
  ; Note that setting the priority to REALTIME_PRIORITY_CLASS is risky,
  ; as it will cause your process to preempt *all* other processes,
  ; including critical Windows processes.
  ; ---------------------------------------------------------------------
    clockctr_begin MACRO loopcount:REQ, priority
        LOCAL label
        IFNDEF __clockctr__stuff__defined__
          __clockctr__stuff__defined__ equ <1>
          .data
            __clockctr__qword__count__  dq 0
            __clockctr__loop__count__   dd 0
            __clockctr__loop__counter__ dd 0
          .code
        ENDIF
        mov __clockctr__loop__count__, loopcount
        IFNB <priority>
          invoke GetCurrentProcess
          invoke SetPriorityClass, eax, priority
        ENDIF
        xor   eax, eax        ;; Use same CPUID input value for each call
        cpuid                 ;; Flush pipe & wait for pending ops to finish
        rdtsc                 ;; Read Time Stamp Counter
        push  edx             ;; Preserve high-order 32 bits of start count
        push  eax             ;; Preserve low-order 32 bits of start count

        mov   __clockctr__loop__counter__, loopcount
      @@:
        sub   __clockctr__loop__counter__, 1
        ;; Run an empty reference loop
        jnz   @B

        xor   eax, eax
        cpuid
        rdtsc
        pop   ecx             ;; Recover low-order 32 bits of start count
        sub   eax, ecx        ;; Low-order 32 bits of overhead count in EAX
        pop   ecx             ;; Recover high-order 32 bits of start count
        sbb   edx, ecx        ;; High-order 32 bits of overhead count in EDX
        push  edx             ;; Preserve high-order 32 bits of overhead count
        push  eax             ;; Preserve low-order 32 bits of overhead count
        mov   __clockctr__loop__counter__, loopcount
        xor   eax, eax
        cpuid
        rdtsc
        push  edx             ;; Preserve high-order 32 bits of start count
        push  eax             ;; Preserve low-order 32 bits of start count
      label:                  ;; Start test loop
        __clockctr__loop__label__ equ <label>
    ENDM

    clockctr_end MACRO
        sub   __clockctr__loop__counter__, 1
        jnz   __clockctr__loop__label__
        xor   eax, eax
        cpuid
        rdtsc
        pop   ecx             ;; Recover low-order 32 bits of start count
        sub   eax, ecx        ;; Low-order 32 bits of test count in EAX
        pop   ecx             ;; Recover high-order 32 bits of start count
        sbb   edx, ecx        ;; High-order 32 bits of test count in EDX
        pop   ecx             ;; Recover low-order 32 bits of overhead count
        sub   eax, ecx        ;; Low-order 32 bits of adjusted count in EAX
        pop   ecx             ;; Recover high-order 32 bits of overhead count
        sbb   edx, ecx        ;; High-order 32 bits of adjusted count in EDX

        mov   DWORD PTR __clockctr__qword__count__, eax
        mov   DWORD PTR __clockctr__qword__count__ + 4, edx

        invoke GetCurrentProcess
        invoke SetPriorityClass, eax, NORMAL_PRIORITY_CLASS

        IFDEF _EMMS
          EMMS
        ENDIF

        finit
        fild  __clockctr__qword__count__
        fild  __clockctr__loop__count__
        fdiv
        fistp __clockctr__qword__count__

        mov   eax, DWORD PTR __clockctr__qword__count__
    ENDM

eschew obfuscation

Mark_Larson


  You can read CR4 bit 9.  If it is set the processor supports MMX.  The downside is it only works privilege level 0 ;)

BIOS programmers do it fastest, hehe.  ;)

My Optimization webpage
http://www.website.masmforum.com/mark/index.htm

Vortex

Hi MichaelW,

Here are the new results on my PIV 2.66 GHz:

1
11
40
84
180
376


Not much difference between the two versions.