I thought I would post this before I actually used it in anything so you guys could point out any glaring errors.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.586 ; create 32 bit code
.model flat, stdcall ; 32 bit memory model
option casemap :none ; case sensitive
include \masm32\include\windows.inc
include \masm32\include\masm32.inc
include \masm32\include\kernel32.inc
includelib \masm32\lib\masm32.lib
includelib \masm32\lib\kernel32.lib
include \masm32\macros\macros.asm
; ---------------------------------------------------------------------
; These two macros perform the grunt work involved in measuring the
; processor clock cycle count for a block of code. These macros must
; be used in pairs, and the block of code must be placed in between
; the clockctr_begin and clockctr_end macro calls. The clockctr_end
; macro returns the clock cycle count for a single pass through the
; block of code, corrected for the test loop overhead, in EAX.
; ---------------------------------------------------------------------
clockctr_begin MACRO loopcount:REQ, priority
    ;; Opens a timed block. Must be paired with clockctr_end, with the
    ;; code under test placed between the two macro calls.
    ;;
    ;; loopcount  Number of passes through the timed block. May be an
    ;;            immediate, a 32-bit register or a DWORD memory operand.
    ;;            It is read exactly once, before anything else touches
    ;;            the registers. Must be non-zero: the counter is
    ;;            decremented before it is tested, so 0 wraps around.
    ;; priority   Optional priority class handed to SetPriorityClass
    ;;            for the current process before timing starts.
    ;;
    ;; On exit four DWORDs are left on the stack for clockctr_end
    ;; (pushed in this order: overhead high, overhead low, start high,
    ;; start low), so the timed code must leave ESP balanced.
    ;; Clobbers EAX, EBX, ECX, EDX (CPUID writes all four) and flags.
    LOCAL label

    IFNDEF __clockctr__stuff__defined__
        __clockctr__stuff__defined__ equ <1>
        .data
        __clockctr__loop__count__   dd 0
        __clockctr__loop__counter__ dd 0
        __clockctr__qword__count__  dq 0
        .code
    ENDIF

    ;; Capture loopcount once, while a register operand is still intact.
    mov eax, loopcount
    mov __clockctr__loop__count__, eax

    IFNB <priority>
        invoke GetCurrentProcess
        invoke SetPriorityClass, eax, priority
    ENDIF

    ;; ---- Reference pass: time an empty loop to get the overhead ----
    ;; The counter is set up before the serialised RDTSC, exactly as in
    ;; the test pass below, so both timed windows cover the same code.
    mov eax, __clockctr__loop__count__
    mov __clockctr__loop__counter__, eax
    xor eax, eax        ;; Use same CPUID input value for each call
    cpuid               ;; Flush pipe and wait for pending ops to finish
    rdtsc               ;; Read Time Stamp Counter
    push edx            ;; Preserve high-order 32 bits of start count
    push eax            ;; Preserve low-order 32 bits of start count
  @@:
    sub __clockctr__loop__counter__, 1
    ;; Empty reference loop
    jnz @B
    xor eax, eax
    cpuid
    rdtsc
    pop ecx             ;; Recover low-order 32 bits of start count
    sub eax, ecx        ;; Low-order 32 bits of overhead count in EAX
    pop ecx             ;; Recover high-order 32 bits of start count
    sbb edx, ecx        ;; High-order 32 bits of overhead count in EDX
    push edx            ;; Preserve high-order 32 bits of overhead count
    push eax            ;; Preserve low-order 32 bits of overhead count

    ;; ---- Test pass: loop closed by clockctr_end ----
    ;; Reload from the saved copy: the loopcount operand itself may have
    ;; been a register that CPUID/RDTSC have since overwritten.
    mov eax, __clockctr__loop__count__
    mov __clockctr__loop__counter__, eax
    xor eax, eax
    cpuid
    rdtsc
    push edx            ;; Preserve high-order 32 bits of start count
    push eax            ;; Preserve low-order 32 bits of start count
  label:
    ;; Publish the loop entry so clockctr_end can jump back to it.
    __clockctr__loop__label__ equ <label>
ENDM
clockctr_end MACRO
;; Closes the timed block opened by clockctr_begin and returns, in EAX,
;; the cycle count for one pass through the block with the empty-loop
;; overhead subtracted.
;;
;; Expects the four DWORDs pushed by clockctr_begin to be on top of the
;; stack (test start low, test start high, overhead low, overhead high).
;; Clobbers EAX, EBX, ECX, EDX (CPUID writes all four), flags, and the
;; x87 state (FINIT). The process priority class is always set to
;; NORMAL_PRIORITY_CLASS, whether or not clockctr_begin changed it.
sub __clockctr__loop__counter__, 1
jnz __clockctr__loop__label__ ;; Next pass through the timed block
xor eax, eax ;; Use same CPUID input value for each call
cpuid ;; Wait for the timed code to finish before reading the counter
rdtsc ;; End count in EDX:EAX
push eax ;; Keep the end count safe across the API calls below
push edx
invoke GetCurrentProcess
invoke SetPriorityClass,eax,NORMAL_PRIORITY_CLASS
pop edx
pop eax
pop ecx ;; Recover low-order 32 bits of start count
sub eax, ecx ;; Low-order 32 bits of test count in EAX
pop ecx ;; Recover high-order 32 bits of start count
sbb edx, ecx ;; High-order 32 bits of test count in EDX
pop ecx ;; Recover low-order 32 bits of overhead count
sub eax, ecx ;; Low-order 32 bits of adjusted count in EAX
pop ecx ;; Recover high-order 32 bits of overhead count
sbb edx, ecx ;; High-order 32 bits of adjusted count in EDX
mov DWORD PTR __clockctr__qword__count__, eax
mov DWORD PTR __clockctr__qword__count__ + 4, edx
;; Divide the 64-bit adjusted total by the loop count on the FPU.
finit ;; Start from a clean x87 state (discards whatever was there)
fild __clockctr__qword__count__ ;; Adjusted total for all passes
fild __clockctr__loop__count__ ;; Number of passes
fdiv ;; Total / passes
fistp __clockctr__qword__count__ ;; Store the per-pass count as an integer
mov eax, DWORD PTR __clockctr__qword__count__ ;; Return the low 32 bits
ENDM
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.data
.code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; -------------------------------------------------------------------------
; Test driver: times two NOPs, then 2, 4, 8, 16 and 32 back-to-back copies
; of a short mov/add/sub/mul sequence, printing the cycle count returned
; in EAX by clockctr_end after each measurement.
; -------------------------------------------------------------------------
LOOPCOUNT equ 5000000

show_cycles MACRO
    ;; Print the count in EAX followed by a CR/LF pair.
    print ustr$(eax)
    print chr$(13,10)
ENDM

time_mul_sequence MACRO copies:REQ
    ;; Time `copies` consecutive copies of the arithmetic sequence and
    ;; print the per-pass cycle count.
    clockctr_begin LOOPCOUNT, HIGH_PRIORITY_CLASS
    REPEAT copies
        mov eax, 123
        mov edx, 456
        mov ecx, 789
        add edx, eax
        sub ecx, eax
        mul edx
    ENDM
    clockctr_end
    show_cycles
ENDM

    ; Baseline: two NOPs.
    clockctr_begin LOOPCOUNT, HIGH_PRIORITY_CLASS
        nop
        nop
    clockctr_end
    show_cycles

    ; Doubling amounts of the same work.
    time_mul_sequence 2
    time_mul_sequence 4
    time_mul_sequence 8
    time_mul_sequence 16
    time_mul_sequence 32

    mov eax,input(13,10,"Press enter to exit...",13,10)
    exit
end start
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
Playing around with REALTIME_PRIORITY_CLASS brought Windows 2000 down for the first time since I installed it.
[attachment deleted by admin]
Hi MichaelW,
Nice work :U
Here are my results on my PIV 2.66 GHz
-1
11
36
85
180
375
Hi Vortex,
Thanks. So you don't see any serious problems?
My results on a P3:
1
6
15
28
61
134
Any idea why the clock counts don't scale linearly with the increasing repeat counts?
Michael,
It tends to be a couple of things: data in the cache, which gets slower as the data size exceeds the cache size, and processor priority. It is common to see small code go faster than big code.
On my Celeron 2.4ghz,
I get the following results:
0
5
10
26
57
116
Weird, the code seems to run faster on my processor than on the P4, which is supposed to be faster than mine.
On my Athlon XP-M 2400+
I get the following results:
Code:
2
5
12
24
52
102
Very strange results, don't you think so?
Hi MichaelW,
Will you release a new version of your macro set?
Hi Vortex,
I will, but I have several other things to finish first.
Hi MichaelW,
That's O.K, thanks :U
AMD Athlon 64 2800+
2
2
10
26
59
125
:U :U
Great stuff Michael :) It might not be a big deal, but you can shave off 4 instructions from the END version of the macro. You push EAX and EDX before calling SetPriorityClass. Just move SetPriorityClass to the last thing so you don't have to PUSH and POP EAX and EDX.
clockctr_end MACRO
;; Variant of clockctr_end that stores the adjusted count before calling
;; the priority API, so EAX/EDX no longer need saving around the call.
;;
;; Closes the timed block opened by clockctr_begin and returns, in EAX,
;; the cycle count for one pass with the empty-loop overhead subtracted.
;; Expects the four DWORDs pushed by clockctr_begin on top of the stack.
;; Clobbers EAX, EBX, ECX, EDX (CPUID writes all four), flags, and the
;; x87 state (FINIT). The process priority class is always set to
;; NORMAL_PRIORITY_CLASS, whether or not clockctr_begin changed it.
sub __clockctr__loop__counter__, 1
jnz __clockctr__loop__label__ ;; Next pass through the timed block
xor eax, eax ;; Use same CPUID input value for each call
cpuid ;; Wait for the timed code to finish before reading the counter
rdtsc ;; End count in EDX:EAX
;;;;;;;;push eax
;;;;;;;;push edx
;;;;;;;;invoke GetCurrentProcess
;;;;;;;;invoke SetPriorityClass,eax,NORMAL_PRIORITY_CLASS
;;;;;;;;pop edx
;;;;;;;;pop eax
pop ecx ;; Recover low-order 32 bits of start count
sub eax, ecx ;; Low-order 32 bits of test count in EAX
pop ecx ;; Recover high-order 32 bits of start count
sbb edx, ecx ;; High-order 32 bits of test count in EDX
pop ecx ;; Recover low-order 32 bits of overhead count
sub eax, ecx ;; Low-order 32 bits of adjusted count in EAX
pop ecx ;; Recover high-order 32 bits of overhead count
sbb edx, ecx ;; High-order 32 bits of adjusted count in EDX
mov DWORD PTR __clockctr__qword__count__, eax
mov DWORD PTR __clockctr__qword__count__ + 4, edx
;; The count is in memory now, so the API calls may freely use EAX/EDX.
invoke GetCurrentProcess
invoke SetPriorityClass,eax,NORMAL_PRIORITY_CLASS
;; Divide the 64-bit adjusted total by the loop count on the FPU.
finit ;; Start from a clean x87 state (discards whatever was there)
fild __clockctr__qword__count__ ;; Adjusted total for all passes
fild __clockctr__loop__count__ ;; Number of passes
fdiv ;; Total / passes
fistp __clockctr__qword__count__ ;; Store the per-pass count as an integer
mov eax, DWORD PTR __clockctr__qword__count__ ;; Return the low 32 bits
ENDM
This is important. I totally forgot about this until I looked at my timing code. You need an EMMS instruction before using any of the floating point stuff. I don't put EMMS in my routines anymore since it's so slow (it's an optimization not to have it in every routine), so I put it before I use floating point.
;; Tail of clockctr_end only (the start of the macro is not repeated
;; here), with EMMS inserted ahead of the x87 instructions.
;; On entry EDX:EAX holds the adjusted cycle count for all passes.
mov DWORD PTR __clockctr__qword__count__, eax
mov DWORD PTR __clockctr__qword__count__ + 4, edx
invoke GetCurrentProcess
invoke SetPriorityClass,eax,NORMAL_PRIORITY_CLASS
EMMS ;; Leave MMX state before the x87 code (needs MMX enabled in the assembler)
finit ;; Start from a clean x87 state
fild __clockctr__qword__count__ ;; Adjusted total for all passes
fild __clockctr__loop__count__ ;; Number of passes
fdiv ;; Total / passes
fistp __clockctr__qword__count__ ;; Store the per-pass count as an integer
mov eax, DWORD PTR __clockctr__qword__count__ ;; Return the low 32 bits
ENDM
Thanks Mark,
When I placed the code to restore the priority, in the back of my mind I was thinking ASAP, but what difference could a few more clocks make. To avoid making the macros dependent on the MMX instructions being supported by the processor and enabled in MASM, I include EMMS only if _EMMS has been defined. I tried to use @Cpu for this, but at least for MASM 6.14 the return value is not affected by .MMX or .XMM. A new version of the macros with these and several other minor changes:
; ---------------------------------------------------------------------
; These two macros perform the grunt work involved in measuring the
; processor clock cycle count for a block of code. These macros must
; be used in pairs, and the block of code must be placed in between
; the clockctr_begin and clockctr_end macro calls. The clockctr_end
; macro returns the clock cycle count for a single pass through the
; block of code, corrected for the test loop overhead, in EAX.
;
; The loopcount parameter should be set to a relatively high value
; to produce repeatable results.
;
; Note that setting the priority to REALTIME_PRIORITY_CLASS is risky,
; as it will cause your process to preempt *all* other processes,
; including critical Windows processes.
; ---------------------------------------------------------------------
clockctr_begin MACRO loopcount:REQ, priority
    ;; Opens a timed block. Must be paired with clockctr_end, with the
    ;; code under test placed between the two macro calls.
    ;;
    ;; loopcount  Number of passes through the timed block. May be an
    ;;            immediate, a 32-bit register or a DWORD memory operand.
    ;;            It is read exactly once, before anything else touches
    ;;            the registers. Must be non-zero: the counter is
    ;;            decremented before it is tested, so 0 wraps around.
    ;; priority   Optional priority class handed to SetPriorityClass
    ;;            for the current process before timing starts.
    ;;
    ;; On exit four DWORDs are left on the stack for clockctr_end
    ;; (pushed in this order: overhead high, overhead low, start high,
    ;; start low), so the timed code must leave ESP balanced.
    ;; Clobbers EAX, EBX, ECX, EDX (CPUID writes all four) and flags.
    ;;
    ;; NOTE: the ALIGN directives need data/code segments with at least
    ;; 8-byte / paragraph alignment respectively.
    LOCAL label

    IFNDEF __clockctr__stuff__defined__
        __clockctr__stuff__defined__ equ <1>
        .data
        ALIGN 8         ;; Natural alignment for the QWORD
        __clockctr__qword__count__  dq 0
        __clockctr__loop__count__   dd 0
        __clockctr__loop__counter__ dd 0
        .code
    ENDIF

    ;; Capture loopcount once, while a register operand is still intact.
    mov eax, loopcount
    mov __clockctr__loop__count__, eax

    IFNB <priority>
        invoke GetCurrentProcess
        invoke SetPriorityClass, eax, priority
    ENDIF

    ;; ---- Reference pass: time an empty loop to get the overhead ----
    ;; The counter is set up before the serialised RDTSC, exactly as in
    ;; the test pass below, so both timed windows cover the same code.
    mov eax, __clockctr__loop__count__
    mov __clockctr__loop__counter__, eax
    xor eax, eax        ;; Use same CPUID input value for each call
    cpuid               ;; Flush pipe & wait for pending ops to finish
    rdtsc               ;; Read Time Stamp Counter
    push edx            ;; Preserve high-order 32 bits of start count
    push eax            ;; Preserve low-order 32 bits of start count
    ALIGN 16            ;; Keep the loop entry independent of call site
  @@:
    sub __clockctr__loop__counter__, 1
    ;; Run an empty reference loop
    jnz @B
    xor eax, eax
    cpuid
    rdtsc
    pop ecx             ;; Recover low-order 32 bits of start count
    sub eax, ecx        ;; Low-order 32 bits of overhead count in EAX
    pop ecx             ;; Recover high-order 32 bits of start count
    sbb edx, ecx        ;; High-order 32 bits of overhead count in EDX
    push edx            ;; Preserve high-order 32 bits of overhead count
    push eax            ;; Preserve low-order 32 bits of overhead count

    ;; ---- Test pass: loop closed by clockctr_end ----
    ;; Reload from the saved copy: the loopcount operand itself may have
    ;; been a register that CPUID/RDTSC have since overwritten.
    mov eax, __clockctr__loop__count__
    mov __clockctr__loop__counter__, eax
    xor eax, eax
    cpuid
    rdtsc
    push edx            ;; Preserve high-order 32 bits of start count
    push eax            ;; Preserve low-order 32 bits of start count
    ALIGN 16            ;; Same alignment as the reference loop
  label:                ;; Start test loop
    ;; Publish the loop entry so clockctr_end can jump back to it.
    __clockctr__loop__label__ equ <label>
ENDM
clockctr_end MACRO
;; Closes the timed block opened by clockctr_begin and returns, in EAX,
;; the cycle count for one pass through the block with the empty-loop
;; overhead subtracted.
;;
;; Expects the four DWORDs pushed by clockctr_begin to be on top of the
;; stack (test start low, test start high, overhead low, overhead high).
;; Clobbers EAX, EBX, ECX, EDX (CPUID writes all four), flags, and the
;; x87 state (FINIT). The process priority class is always set to
;; NORMAL_PRIORITY_CLASS, whether or not clockctr_begin changed it.
;; Define _EMMS before invoking the macro to have EMMS emitted ahead of
;; the x87 code.
sub __clockctr__loop__counter__, 1
jnz __clockctr__loop__label__ ;; Next pass through the timed block
xor eax, eax ;; Use same CPUID input value for each call
cpuid ;; Wait for the timed code to finish before reading the counter
rdtsc ;; End count in EDX:EAX
pop ecx ;; Recover low-order 32 bits of start count
sub eax, ecx ;; Low-order 32 bits of test count in EAX
pop ecx ;; Recover high-order 32 bits of start count
sbb edx, ecx ;; High-order 32 bits of test count in EDX
pop ecx ;; Recover low-order 32 bits of overhead count
sub eax, ecx ;; Low-order 32 bits of adjusted count in EAX
pop ecx ;; Recover high-order 32 bits of overhead count
sbb edx, ecx ;; High-order 32 bits of adjusted count in EDX
mov DWORD PTR __clockctr__qword__count__, eax
mov DWORD PTR __clockctr__qword__count__ + 4, edx
;; The count is in memory now, so the API calls may freely use EAX/EDX.
invoke GetCurrentProcess
invoke SetPriorityClass, eax, NORMAL_PRIORITY_CLASS
IFDEF _EMMS
EMMS ;; Only assembled when the caller has defined _EMMS
ENDIF
;; Divide the 64-bit adjusted total by the loop count on the FPU.
finit ;; Start from a clean x87 state (discards whatever was there)
fild __clockctr__qword__count__ ;; Adjusted total for all passes
fild __clockctr__loop__count__ ;; Number of passes
fdiv ;; Total / passes
fistp __clockctr__qword__count__ ;; Store the per-pass count as an integer
mov eax, DWORD PTR __clockctr__qword__count__ ;; Return the low 32 bits
ENDM
You can read CR4 bit 9. If it is set the processor supports MMX. The downside is that it only works at privilege level 0 ;)
Hi MichaelW,
Here are the new results on my PIV 2.66 GHz:
1
11
40
84
180
376
Not much difference between the two versions.
Hi Vortex,
I posted a new version of the macros in the Binary string to dword thread, then I lost track of this thread and forgot to update it. The attachment includes the new version, now with everything important aligned so the results do not vary with the alignment of the macro calls.
[attachment deleted by admin]