Hi all,
I have found two MASM macros for testing the speed of a function, and because I do not want to reinvent the wheel
I have included them inside an #asm ... #endasm block (see the code below).
The problem is that the statement "stdout.put("ZeroMemory = " ,tst1, nl , "ZeroMemorySSE = " , tst2 );"
did not print anything!
Is that a bug in HLA or an error in my code?
The code:
program speed_test;
#include( "stdlib.hhf" );
#asm
include \masm32\include\windows.inc
; Declare the two kernel32 entry points used by the counter_* macros as
; typed pointers (the __imp__ symbols), and give each a short alias so that
; INVOKE can call through the pointer by the usual API name.
; NOTE(review): c_msvcrt is a C-convention VARARG prototype, so INVOKE adds
; caller-side stack cleanup after a call that passes arguments, while the
; @8 suffix on SetPriorityClass suggests a stdcall function that removes its
; own arguments -- confirm that ESP stays balanced across that call.
c_msvcrt typedef PROTO C :VARARG
externdef __imp__GetCurrentProcess@0:PTR c_msvcrt
GetCurrentProcess equ <__imp__GetCurrentProcess@0>
externdef __imp__SetPriorityClass@8:PTR c_msvcrt
SetPriorityClass equ <__imp__SetPriorityClass@8>
; ---------------------------------------------------------------------
; These two macros perform the grunt work involved in measuring the
; processor clock cycle count for a block of code. These macros must
; be used in pairs, and the block of code must be placed in between
; the counter_begin and counter_end macro calls. The counter_end macro
; returns the clock cycle count for a single pass through the block of
; code, corrected for the test loop overhead, in EAX.
;
; These macros require a .586 or higher processor directive.
;
; If your code is using MMX instructions and not executing an EMMS
; at the end of each MMX instruction sequence, defining the symbol
; _EMMS will cause the counter_end macro to insert an EMMS in front
; of the FPU instructions.
;
; The loopcount parameter should be set to a relatively high value to
; produce repeatable results.
;
; Note that setting the priority parameter to REALTIME_PRIORITY_CLASS
; involves some risk, as it will cause your process to preempt *all*
; other processes, including critical Windows processes. Setting the
; priority parameter to HIGH_PRIORITY_CLASS instead will significantly
; reduce the risk, and in most cases will produce the same cycle count.
; ---------------------------------------------------------------------
; counter_begin loopcount, [priority]
;   loopcount - number of passes through the timed block (required)
;   priority  - optional priority class passed to SetPriorityClass
; Opens a timed region: records loopcount, optionally changes the process
; priority, measures an empty loop of loopcount passes with RDTSC, then
; reads the start count and falls into the test loop.
; On exit four DWORDs are left on the stack for counter_end (start count
; on top, empty-loop overhead count beneath it), so the code placed between
; the two macros must leave ESP where it found it.
; EAX, ECX and EDX are overwritten here, and CPUID additionally writes EBX.
counter_begin MACRO loopcount:REQ, priority
LOCAL label
;; One-time data shared by every expansion of the counter macros.
IFNDEF __counter__stuff__defined__
__counter__stuff__defined__ equ <1>
.data
ALIGN 8 ;; Optimal alignment for QWORD
__counter__qword__count__ dq 0
__counter__loop__count__ dd 0
__counter__loop__counter__ dd 0
.code
ENDIF
;; Remember the pass count; counter_end divides the total by it.
mov __counter__loop__count__, loopcount
IFNB <priority>
invoke GetCurrentProcess
invoke SetPriorityClass, eax, priority
ENDIF
xor eax, eax ;; Use same CPUID input value for each call
cpuid ;; Flush pipe & wait for pending ops to finish
rdtsc ;; Read Time Stamp Counter
push edx ;; Preserve high-order 32 bits of start count
push eax ;; Preserve low-order 32 bits of start count
mov __counter__loop__counter__, loopcount
xor eax, eax
cpuid ;; Make sure loop setup instructions finish
ALIGN 16 ;; Optimal loop alignment for P6
@@: ;; Start an empty reference loop
sub __counter__loop__counter__, 1
jnz @B
xor eax, eax
cpuid ;; Make sure loop instructions finish
rdtsc ;; Read end count
pop ecx ;; Recover low-order 32 bits of start count
sub eax, ecx ;; Low-order 32 bits of overhead count in EAX
pop ecx ;; Recover high-order 32 bits of start count
sbb edx, ecx ;; High-order 32 bits of overhead count in EDX
push edx ;; Preserve high-order 32 bits of overhead count
push eax ;; Preserve low-order 32 bits of overhead count
;; Second measurement: the start count for the real test loop.
xor eax, eax
cpuid
rdtsc
push edx ;; Preserve high-order 32 bits of start count
push eax ;; Preserve low-order 32 bits of start count
mov __counter__loop__counter__, loopcount
xor eax, eax
cpuid ;; Make sure loop setup instructions finish
ALIGN 16 ;; Optimal loop alignment for P6
label: ;; Start test loop
;; Publish this expansion's unique label so counter_end can jump back to it.
__counter__loop__label__ equ <label>
ENDM
; counter_end
; Closes the region opened by the most recent counter_begin: decrements the
; pass counter and loops back until it reaches zero, reads the end count,
; subtracts the start count and the empty-loop overhead that counter_begin
; left on the stack (popping all four DWORDs), sets the process priority to
; NORMAL_PRIORITY_CLASS, and divides the 64-bit total by the pass count on
; the FPU.
; Returns in EAX the low 32 bits of the per-pass cycle count.
; Side effects: FINIT reinitialises the FPU; EAX, ECX, EDX (and EBX, via
; CPUID) are overwritten.
counter_end MACRO
sub __counter__loop__counter__, 1
jnz __counter__loop__label__
xor eax, eax
cpuid ;; Make sure loop instructions finish
rdtsc ;; Read end count
pop ecx ;; Recover low-order 32 bits of start count
sub eax, ecx ;; Low-order 32 bits of test count in EAX
pop ecx ;; Recover high-order 32 bits of start count
sbb edx, ecx ;; High-order 32 bits of test count in EDX
pop ecx ;; Recover low-order 32 bits of overhead count
sub eax, ecx ;; Low-order 32 bits of adjusted count in EAX
pop ecx ;; Recover high-order 32 bits of overhead count
sbb edx, ecx ;; High-order 32 bits of adjusted count in EDX
;; Park the 64-bit adjusted count in memory so FILD can load it below.
mov DWORD PTR __counter__qword__count__, eax
mov DWORD PTR __counter__qword__count__ + 4, edx
invoke GetCurrentProcess
invoke SetPriorityClass, eax, NORMAL_PRIORITY_CLASS
IFDEF _EMMS
EMMS
ENDIF
;; total count / pass count, stored back as an integer.
finit
fild __counter__qword__count__
fild __counter__loop__count__
fdiv
fistp __counter__qword__count__
mov eax, DWORD PTR __counter__qword__count__
ENDM
; ---------------------------------------------------------------------
; These two macros perform the grunt work involved in measuring the
; execution time in milliseconds for a specified number of loops
; through a block of code. These macros must be used in pairs, and
; the block of code must be placed in between the timer_begin and
; timer_end macro calls. The timer_end macro returns the elapsed
; milliseconds for the entire loop in EAX.
;
; These macros utilize the high-resolution performance counter.
; The return value will be zero if the high-resolution performance
; counter is not available.
;
; If your code is using MMX instructions and not executing an EMMS
; at the end of each MMX instruction sequence, defining the symbol
; _EMMS will cause the timer_end macro to insert an EMMS in front of
; the FPU instructions.
;
; The loopcount parameter should be set to a relatively high value to
; produce repeatable results.
;
; Note that setting the priority parameter to REALTIME_PRIORITY_CLASS
; involves some risk, as it will cause your process to preempt *all*
; other processes, including critical Windows processes. Setting the
; priority parameter to HIGH_PRIORITY_CLASS instead will significantly
; reduce the risk, and in most cases will produce very nearly the same
; result.
; ---------------------------------------------------------------------
; timer_begin loopcount, [priority]
;   loopcount - number of passes through the timed block (required)
;   priority  - optional priority class passed to SetPriorityClass
; Opens a timed region based on QueryPerformanceCounter. When
; QueryPerformanceFrequency reports success it optionally changes the
; process priority, times an empty loop of loopcount passes, then records
; the start count and falls into the test loop. Four DWORDs are left on the
; stack for timer_end (start count on top, overhead count beneath it).
; NOTE(review): if QueryPerformanceFrequency returns 0 the whole .IF body
; is skipped, so nothing is pushed and the loop counter is not loaded here.
timer_begin MACRO loopcount:REQ, priority
LOCAL label
;; One-time data shared by every expansion of the timer macros.
IFNDEF __timer__stuff__defined__
__timer__stuff__defined__ equ <1>
.data
ALIGN 8 ;; Optimal alignment for QWORD
__timer__pc__frequency__ dq 0
__timer__pc__count__ dq 0
__timer__loop__counter__ dd 0
__timer__dw_count__ dd 0
.code
ENDIF
invoke QueryPerformanceFrequency, ADDR __timer__pc__frequency__
.IF eax != 0
IFNB <priority>
invoke GetCurrentProcess
invoke SetPriorityClass, eax, priority
ENDIF
;; Start count for the empty reference loop, pushed high dword first.
invoke QueryPerformanceCounter, ADDR __timer__pc__count__
push DWORD PTR __timer__pc__count__ + 4
push DWORD PTR __timer__pc__count__
mov __timer__loop__counter__, loopcount
ALIGN 16 ;; Optimal loop alignment for P6
@@: ;; Start an empty reference loop
sub __timer__loop__counter__, 1
jnz @B
invoke QueryPerformanceCounter, ADDR __timer__pc__count__
pop ecx ;; Recover low-order 32 bits of start count
sub DWORD PTR __timer__pc__count__, ecx
pop ecx ;; Recover high-order 32 bits of start count
sbb DWORD PTR __timer__pc__count__ + 4, ecx
push DWORD PTR __timer__pc__count__ + 4 ;; Overhead count
push DWORD PTR __timer__pc__count__ ;; Overhead count
invoke QueryPerformanceCounter, ADDR __timer__pc__count__
push DWORD PTR __timer__pc__count__ + 4 ;; Start count
push DWORD PTR __timer__pc__count__ ;; Start count
mov __timer__loop__counter__, loopcount
ALIGN 16 ;; Optimal loop alignment for P6
label: ;; Start test loop
;; Publish this expansion's unique label so timer_end can jump back to it.
__timer__loop__label__ equ <label>
.ENDIF
ENDM
; timer_end
; Closes the region opened by the most recent timer_begin: decrements the
; pass counter and loops back until it reaches zero, then (when the
; performance counter is available) reads the end count, subtracts the
; start and overhead counts left on the stack by timer_begin, sets the
; process priority to NORMAL_PRIORITY_CLASS, and converts the tick count to
; milliseconds as count / frequency * 1000 on the FPU.
; Returns in EAX the elapsed milliseconds for the whole loop, or 0 when
; QueryPerformanceFrequency returns 0.
; Side effects: FINIT reinitialises the FPU; ECX is overwritten by the pops.
; NOTE(review): the sub/jnz pair below runs before the availability check,
; so it executes even when timer_begin skipped its setup -- confirm that
; case is acceptable.
timer_end MACRO
sub __timer__loop__counter__, 1
jnz __timer__loop__label__
invoke QueryPerformanceFrequency, ADDR __timer__pc__frequency__
.IF eax != 0
invoke QueryPerformanceCounter, ADDR __timer__pc__count__
pop ecx ;; Recover low-order 32 bits of start count
sub DWORD PTR __timer__pc__count__, ecx
pop ecx ;; Recover high-order 32 bits of start count
sbb DWORD PTR __timer__pc__count__ + 4, ecx
pop ecx ;; Recover low-order 32 bits of overhead count
sub DWORD PTR __timer__pc__count__, ecx
pop ecx ;; Recover high-order 32 bits of overhead count
sbb DWORD PTR __timer__pc__count__ + 4, ecx
invoke GetCurrentProcess
invoke SetPriorityClass, eax, NORMAL_PRIORITY_CLASS
IFDEF _EMMS
EMMS
ENDIF
;; ticks / ticks-per-second * 1000 = milliseconds, stored as an integer.
finit
fild __timer__pc__count__
fild __timer__pc__frequency__
fdiv
mov __timer__dw_count__, 1000
fild __timer__dw_count__
fmul
fistp __timer__dw_count__
mov eax, __timer__dw_count__
.ELSE
xor eax, eax ;; No performance counter
.ENDIF
ENDM
#endasm
storage
// Uninitialized data for the benchmark.
// buf is the 1024-byte block that both zeroing macros are run against.
// NOTE(review): nothing here requests 16-byte alignment for buf, which the
// movaps stores in ZeroMemorySSE need -- confirm where buf ends up.
buf : char[1024];
// Receive the cycle counts for ZeroMemory and ZeroMemorySSE respectively.
tst1 : uns32;
tst2 : uns32;
// ZeroMemory( dest, bytes )
//   dest  - memory operand naming the first byte to clear
//   bytes - number of bytes to clear
// Fills the block with zero bytes using REP STOSB. EAX, ECX and EDI are
// saved on entry and restored on exit. The fill direction follows the
// direction flag, which this macro does not change.
#macro ZeroMemory(dest,bytes);

    // Save the three work registers; they are restored in mirror order.
    push( edi );
    push( ecx );
    push( eax );

    lea( edi, dest );       // edi -> first byte to clear
    xor( eax, eax );        // al = fill value 0
    mov( bytes, ecx );      // ecx = byte count for rep
    rep.stosb();

    pop( eax );
    pop( ecx );
    pop( edi );

#endmacro
// ZeroMemorySSE( dest, bytes )
//   dest  - memory operand naming the first byte to clear
//   bytes - size of the block in bytes
// Clears memory with a zeroed XMM0, four 16-byte MOVAPS stores (64 bytes)
// per pass of the loop. ECX and EDI are saved and restored; XMM0 is
// overwritten. MOVAPS requires each store address to be 16-byte aligned.
#macro ZeroMemorySSE(dest,bytes);
push(ecx);
push(edi);
lea (edi,dest);
mov (bytes, ecx);
pxor(xmm0,xmm0);
// NOTE(review): a right shift by 12 divides bytes by 4096, so for the
// 1024-byte buffer used in this program ECX becomes 0; the loop below then
// subtracts 4 from 0 and keeps storing well past the end of the buffer --
// confirm the intended shift count.
shr (12,ecx);
// NOTE(review): "lop" is not declared as a macro-local symbol; presumably a
// second expansion of this macro would define the label twice -- verify.
lop:
movaps(xmm0,[edi]);
movaps(xmm0,[edi+16]);
movaps(xmm0,[edi+32]);
movaps(xmm0,[edi+48]);
add (16*4,edi);
sub (1*4,ecx);
jnz ( lop );
pop (edi);
pop (ecx);
#endmacro
begin speed_test;
// counter_begin / counter_end are MASM macros, so they are passed straight
// through to the assembler output with #emit; the HLA code between each
// pair is the block being timed.
#emit( " counter_begin 1000, REALTIME_PRIORITY_CLASS ");
ZeroMemory(buf,@size(buf));
#emit( " counter_end ");
// NOTE(review): HLA's mov is mov( source, dest ), so this loads tst1 into
// EAX and overwrites the cycle count that counter_end returned in EAX;
// tst1 itself is never written. The same applies to tst2 below.
mov(tst1,eax);
#emit( " counter_begin 1000, REALTIME_PRIORITY_CLASS ");
ZeroMemorySSE(buf,@size(buf));
#emit( " counter_end ");
mov(tst2,eax);
stdout.put("ZeroMemory = " ,tst1, nl , "ZeroMemorySSE = " , tst2 );
end speed_test;
Why are you moving tst1 and tst2 to EAX? If the macros return their result in EAX, you should be placing the value of EAX in the variables, not the other way around. In HLA the operand order is mov( source, dest ), so you want mov( eax, tst1 ).
Yes, I was wrong.
It is a bit confusing; I don't know why HLA does not use the same operand order for mov as other assemblers!
Anyway, thanks for your help.
Here is the result on my computer:
ZeroMemory = 562
ZeroMemorySSE = 29
Here is the corrected code; maybe it will be useful for someone else.
program speed_test;
#asm
include \masm32\include\windows.inc
; Import-table pointers for the two kernel32 functions used by the
; counter_* macros, plus short aliases so INVOKE can call them by their
; usual API names.
; Both functions use the stdcall convention (the callee removes its own
; arguments, as the @0 / @8 decorations indicate), so the pointer types
; must be stdcall prototypes. A "PROTO C :VARARG" type would make INVOKE
; add its own stack cleanup after SetPriorityClass and leave ESP 8 bytes
; too high after every call.
; The timer_* macros also invoke QueryPerformanceFrequency and
; QueryPerformanceCounter, which are not declared in this block; declare
; them the same way before expanding those macros.
k32_proto0 typedef PROTO STDCALL
k32_proto2 typedef PROTO STDCALL :DWORD, :DWORD
externdef __imp__GetCurrentProcess@0:PTR k32_proto0
GetCurrentProcess equ <__imp__GetCurrentProcess@0>
externdef __imp__SetPriorityClass@8:PTR k32_proto2
SetPriorityClass equ <__imp__SetPriorityClass@8>
; ---------------------------------------------------------------------
; These two macros perform the grunt work involved in measuring the
; processor clock cycle count for a block of code. These macros must
; be used in pairs, and the block of code must be placed in between
; the counter_begin and counter_end macro calls. The counter_end macro
; returns the clock cycle count for a single pass through the block of
; code, corrected for the test loop overhead, in EAX.
;
; These macros require a .586 or higher processor directive.
;
; If your code is using MMX instructions and not executing an EMMS
; at the end of each MMX instruction sequence, defining the symbol
; _EMMS will cause the counter_end macro to insert an EMMS in front
; of the FPU instructions.
;
; The loopcount parameter should be set to a relatively high value to
; produce repeatable results.
;
; Note that setting the priority parameter to REALTIME_PRIORITY_CLASS
; involves some risk, as it will cause your process to preempt *all*
; other processes, including critical Windows processes. Setting the
; priority parameter to HIGH_PRIORITY_CLASS instead will significantly
; reduce the risk, and in most cases will produce the same cycle count.
; ---------------------------------------------------------------------
; counter_begin loopcount, [priority]
;   loopcount - number of passes through the timed block (required)
;   priority  - optional priority class passed to SetPriorityClass
; Opens a timed region: records loopcount, optionally changes the process
; priority, measures an empty loop of loopcount passes with RDTSC, then
; reads the start count and falls into the test loop.
; On exit four DWORDs are left on the stack for counter_end (start count
; on top, empty-loop overhead count beneath it), so the code placed between
; the two macros must leave ESP where it found it.
; EAX, ECX and EDX are overwritten here, and CPUID additionally writes EBX.
counter_begin MACRO loopcount:REQ, priority
LOCAL label
;; One-time data shared by every expansion of the counter macros.
IFNDEF __counter__stuff__defined__
__counter__stuff__defined__ equ <1>
.data
ALIGN 8 ;; Optimal alignment for QWORD
__counter__qword__count__ dq 0
__counter__loop__count__ dd 0
__counter__loop__counter__ dd 0
.code
ENDIF
;; Remember the pass count; counter_end divides the total by it.
mov __counter__loop__count__, loopcount
IFNB <priority>
invoke GetCurrentProcess
invoke SetPriorityClass, eax, priority
ENDIF
xor eax, eax ;; Use same CPUID input value for each call
cpuid ;; Flush pipe & wait for pending ops to finish
rdtsc ;; Read Time Stamp Counter
push edx ;; Preserve high-order 32 bits of start count
push eax ;; Preserve low-order 32 bits of start count
mov __counter__loop__counter__, loopcount
xor eax, eax
cpuid ;; Make sure loop setup instructions finish
ALIGN 16 ;; Optimal loop alignment for P6
@@: ;; Start an empty reference loop
sub __counter__loop__counter__, 1
jnz @B
xor eax, eax
cpuid ;; Make sure loop instructions finish
rdtsc ;; Read end count
pop ecx ;; Recover low-order 32 bits of start count
sub eax, ecx ;; Low-order 32 bits of overhead count in EAX
pop ecx ;; Recover high-order 32 bits of start count
sbb edx, ecx ;; High-order 32 bits of overhead count in EDX
push edx ;; Preserve high-order 32 bits of overhead count
push eax ;; Preserve low-order 32 bits of overhead count
;; Second measurement: the start count for the real test loop.
xor eax, eax
cpuid
rdtsc
push edx ;; Preserve high-order 32 bits of start count
push eax ;; Preserve low-order 32 bits of start count
mov __counter__loop__counter__, loopcount
xor eax, eax
cpuid ;; Make sure loop setup instructions finish
ALIGN 16 ;; Optimal loop alignment for P6
label: ;; Start test loop
;; Publish this expansion's unique label so counter_end can jump back to it.
__counter__loop__label__ equ <label>
ENDM
; counter_end
; Closes the region opened by the most recent counter_begin: decrements the
; pass counter and loops back until it reaches zero, reads the end count,
; subtracts the start count and the empty-loop overhead that counter_begin
; left on the stack (popping all four DWORDs), sets the process priority to
; NORMAL_PRIORITY_CLASS, and divides the 64-bit total by the pass count on
; the FPU.
; Returns in EAX the low 32 bits of the per-pass cycle count.
; Side effects: FINIT reinitialises the FPU; EAX, ECX, EDX (and EBX, via
; CPUID) are overwritten.
counter_end MACRO
sub __counter__loop__counter__, 1
jnz __counter__loop__label__
xor eax, eax
cpuid ;; Make sure loop instructions finish
rdtsc ;; Read end count
pop ecx ;; Recover low-order 32 bits of start count
sub eax, ecx ;; Low-order 32 bits of test count in EAX
pop ecx ;; Recover high-order 32 bits of start count
sbb edx, ecx ;; High-order 32 bits of test count in EDX
pop ecx ;; Recover low-order 32 bits of overhead count
sub eax, ecx ;; Low-order 32 bits of adjusted count in EAX
pop ecx ;; Recover high-order 32 bits of overhead count
sbb edx, ecx ;; High-order 32 bits of adjusted count in EDX
;; Park the 64-bit adjusted count in memory so FILD can load it below.
mov DWORD PTR __counter__qword__count__, eax
mov DWORD PTR __counter__qword__count__ + 4, edx
invoke GetCurrentProcess
invoke SetPriorityClass, eax, NORMAL_PRIORITY_CLASS
IFDEF _EMMS
EMMS
ENDIF
;; total count / pass count, stored back as an integer.
finit
fild __counter__qword__count__
fild __counter__loop__count__
fdiv
fistp __counter__qword__count__
mov eax, DWORD PTR __counter__qword__count__
ENDM
; ---------------------------------------------------------------------
; These two macros perform the grunt work involved in measuring the
; execution time in milliseconds for a specified number of loops
; through a block of code. These macros must be used in pairs, and
; the block of code must be placed in between the timer_begin and
; timer_end macro calls. The timer_end macro returns the elapsed
; milliseconds for the entire loop in EAX.
;
; These macros utilize the high-resolution performance counter.
; The return value will be zero if the high-resolution performance
; counter is not available.
;
; If your code is using MMX instructions and not executing an EMMS
; at the end of each MMX instruction sequence, defining the symbol
; _EMMS will cause the timer_end macro to insert an EMMS in front of
; the FPU instructions.
;
; The loopcount parameter should be set to a relatively high value to
; produce repeatable results.
;
; Note that setting the priority parameter to REALTIME_PRIORITY_CLASS
; involves some risk, as it will cause your process to preempt *all*
; other processes, including critical Windows processes. Setting the
; priority parameter to HIGH_PRIORITY_CLASS instead will significantly
; reduce the risk, and in most cases will produce very nearly the same
; result.
; ---------------------------------------------------------------------
; timer_begin loopcount, [priority]
;   loopcount - number of passes through the timed block (required)
;   priority  - optional priority class passed to SetPriorityClass
; Opens a timed region based on QueryPerformanceCounter. When
; QueryPerformanceFrequency reports success it optionally changes the
; process priority, times an empty loop of loopcount passes, then records
; the start count and falls into the test loop. Four DWORDs are left on the
; stack for timer_end (start count on top, overhead count beneath it).
; NOTE(review): if QueryPerformanceFrequency returns 0 the whole .IF body
; is skipped, so nothing is pushed and the loop counter is not loaded here.
timer_begin MACRO loopcount:REQ, priority
LOCAL label
;; One-time data shared by every expansion of the timer macros.
IFNDEF __timer__stuff__defined__
__timer__stuff__defined__ equ <1>
.data
ALIGN 8 ;; Optimal alignment for QWORD
__timer__pc__frequency__ dq 0
__timer__pc__count__ dq 0
__timer__loop__counter__ dd 0
__timer__dw_count__ dd 0
.code
ENDIF
invoke QueryPerformanceFrequency, ADDR __timer__pc__frequency__
.IF eax != 0
IFNB <priority>
invoke GetCurrentProcess
invoke SetPriorityClass, eax, priority
ENDIF
;; Start count for the empty reference loop, pushed high dword first.
invoke QueryPerformanceCounter, ADDR __timer__pc__count__
push DWORD PTR __timer__pc__count__ + 4
push DWORD PTR __timer__pc__count__
mov __timer__loop__counter__, loopcount
ALIGN 16 ;; Optimal loop alignment for P6
@@: ;; Start an empty reference loop
sub __timer__loop__counter__, 1
jnz @B
invoke QueryPerformanceCounter, ADDR __timer__pc__count__
pop ecx ;; Recover low-order 32 bits of start count
sub DWORD PTR __timer__pc__count__, ecx
pop ecx ;; Recover high-order 32 bits of start count
sbb DWORD PTR __timer__pc__count__ + 4, ecx
push DWORD PTR __timer__pc__count__ + 4 ;; Overhead count
push DWORD PTR __timer__pc__count__ ;; Overhead count
invoke QueryPerformanceCounter, ADDR __timer__pc__count__
push DWORD PTR __timer__pc__count__ + 4 ;; Start count
push DWORD PTR __timer__pc__count__ ;; Start count
mov __timer__loop__counter__, loopcount
ALIGN 16 ;; Optimal loop alignment for P6
label: ;; Start test loop
;; Publish this expansion's unique label so timer_end can jump back to it.
__timer__loop__label__ equ <label>
.ENDIF
ENDM
; timer_end
; Closes the region opened by the most recent timer_begin: decrements the
; pass counter and loops back until it reaches zero, then (when the
; performance counter is available) reads the end count, subtracts the
; start and overhead counts left on the stack by timer_begin, sets the
; process priority to NORMAL_PRIORITY_CLASS, and converts the tick count to
; milliseconds as count / frequency * 1000 on the FPU.
; Returns in EAX the elapsed milliseconds for the whole loop, or 0 when
; QueryPerformanceFrequency returns 0.
; Side effects: FINIT reinitialises the FPU; ECX is overwritten by the pops.
; NOTE(review): the sub/jnz pair below runs before the availability check,
; so it executes even when timer_begin skipped its setup -- confirm that
; case is acceptable.
timer_end MACRO
sub __timer__loop__counter__, 1
jnz __timer__loop__label__
invoke QueryPerformanceFrequency, ADDR __timer__pc__frequency__
.IF eax != 0
invoke QueryPerformanceCounter, ADDR __timer__pc__count__
pop ecx ;; Recover low-order 32 bits of start count
sub DWORD PTR __timer__pc__count__, ecx
pop ecx ;; Recover high-order 32 bits of start count
sbb DWORD PTR __timer__pc__count__ + 4, ecx
pop ecx ;; Recover low-order 32 bits of overhead count
sub DWORD PTR __timer__pc__count__, ecx
pop ecx ;; Recover high-order 32 bits of overhead count
sbb DWORD PTR __timer__pc__count__ + 4, ecx
invoke GetCurrentProcess
invoke SetPriorityClass, eax, NORMAL_PRIORITY_CLASS
IFDEF _EMMS
EMMS
ENDIF
;; ticks / ticks-per-second * 1000 = milliseconds, stored as an integer.
finit
fild __timer__pc__count__
fild __timer__pc__frequency__
fdiv
mov __timer__dw_count__, 1000
fild __timer__dw_count__
fmul
fistp __timer__dw_count__
mov eax, __timer__dw_count__
.ELSE
xor eax, eax ;; No performance counter
.ENDIF
ENDM
#endasm
#include( "stdlib.hhf" );
#include("macros/macros.hhf")
storage
    // buf is written with movaps by ZeroMemorySSE, and movaps faults on an
    // address that is not a multiple of 16, so force the alignment here
    // instead of relying on where the section happens to start.
    align( 16 );
    buf     : char[1024];   // block cleared by both zeroing macros

    tst1    : uns32;        // cycles per pass measured for ZeroMemory
    tst2    : uns32;        // cycles per pass measured for ZeroMemorySSE
// ZeroMemory( dest, bytes )
//   dest  - memory operand naming the first byte to clear
//   bytes - number of bytes to clear
// Fills the block with zero bytes using REP STOSB. The fill direction
// follows the direction flag, which this macro does not change.
// USE / ENDUSE are not defined in this file -- presumably they come from
// macros/macros.hhf and save/restore the listed registers; verify there.
#macro ZeroMemory(dest,bytes);
USE(eax, ecx, edi)
lea (edi,dest);
// al = fill value 0, ecx = repeat count for rep.stosb.
xor (eax,eax);
mov (bytes, ecx);
rep.stosb();
ENDUSE;
#endmacro
// ZeroMemorySSE( dest, bytes )
//   dest  - memory operand naming the first byte to clear; every store is a
//           movaps, so the address must be 16-byte aligned
//   bytes - size of the block in bytes; only whole 64-byte blocks are
//           cleared, so pass a multiple of 64
// Clears memory with a zeroed XMM0, four 16-byte stores (64 bytes) per
// pass. XMM0 is overwritten. ECX and EDI are listed in USE / ENDUSE, which
// are not defined in this file -- presumably they come from
// macros/macros.hhf and save/restore those registers.
//
// The pass count is bytes / 64 and drops by one per pass, so the whole
// block is cleared (dividing by 64 while subtracting 4 per pass would stop
// after a quarter of it). A size below 64 skips the loop instead of
// wrapping the counter. Both labels are macro-local so the macro can be
// expanded more than once.
#macro ZeroMemorySSE(dest,bytes): zeroLoop, zeroDone;
    USE(ecx, edi)
        lea (edi,dest);
        mov (bytes, ecx);
        pxor(xmm0,xmm0);
        shr (6,ecx);                // ecx = number of whole 64-byte blocks
        jz  ( zeroDone );           // nothing to do for fewer than 64 bytes
    zeroLoop:
        movaps(xmm0,[edi]);
        movaps(xmm0,[edi+16]);
        movaps(xmm0,[edi+32]);
        movaps(xmm0,[edi+48]);
        add (16*4,edi);             // advance past the 64 bytes just cleared
        sub (1,ecx);                // one block done
        jnz ( zeroLoop );
    zeroDone:
    ENDUSE;
#endmacro
begin speed_test;

    // counter_begin / counter_end are MASM macros, so they are passed
    // straight through to the assembler output with #emit; the HLA code
    // between each pair is the block being timed. counter_end returns the
    // per-pass cycle count in EAX.
    //
    // HIGH_PRIORITY_CLASS is used rather than REALTIME_PRIORITY_CLASS: with
    // ten million passes a realtime process would starve the rest of the
    // system for the whole run, and the macro notes above say the high
    // class gives the same count in most cases.

    // Byte-wise rep.stosb version.
    #emit( " counter_begin 10000000, HIGH_PRIORITY_CLASS ");
    ZeroMemory(buf,@size(buf));
    #emit( " counter_end ");
    mov(eax,tst1);              // mov( source, dest ): save the result

    // SSE version, 64 bytes per pass.
    #emit( " counter_begin 10000000, HIGH_PRIORITY_CLASS ");
    ZeroMemorySSE(buf,@size(buf));
    #emit( " counter_end ");
    mov(eax,tst2);

    // Both variables are already uns32, so no type coercion is needed; the
    // final nl ends the last line of output.
    stdout.put("ZeroMemory = " ,tst1, nl , "ZeroMemorySSE = " , tst2, nl );

end speed_test;