I am using material from here.
http://www.df.lth.se/~john_e/fr_gems.html
340 cycle can't be right for a simple xor.
; ?? Showing 340 cycles for xor ax,ax = 340/733 Mhz = .46 seconds
;
; timeit.asm TASM code
; Help from Shoorick,
; time = cycles/frequency
.model small
.486
.stack 200h
.data
time_cycles dd ?
cycle dd ?
cpuid_cycle dd ?
mycountlow dd ?
mycounthigh dd ?
.code
include monitor.asm
start:
mov ax,@data
mov ds,ax
call monitor_init
time_start
xor ax,ax
time_stop
mov [mycountlow],eax
mov [mycounthigh],edx
;mov eax,123456789
; print low cycle count
call PrtDec
xor eax,eax
; print high cycle count
mov eax,edx
call PrtDec
mov ax,4c00h
int 21h
; Print any number in eax
PrtDec proc
push eax
push ecx
push edx
mov ecx,0ffffffffh
push ecx
mov ecx,10
pd1:
mov edx,0
div ecx
add dl,30h
push edx
cmp eax,0
jne pd1
pd2:
pop edx
cmp edx,0ffffffffh
je pd3
mov ah,2
int 21h
jmp pd2
pd3:
pop edx
pop ecx
pop eax
ret
PrtDec endp
;exit:
;mov ax,4c00h
;int 21h
end start
; monitor.asm Used with timeit.asm
; Performance monitoring package
; Help from Shoorick
;
; define PProPII if your CPU is a Pentium Pro or a Pentium II
;
;
; monitor_init
; initializes the package
;
; time_start
; start cycle count here
;
; time_stop
; stop counting here
;
; the package can not do nested measurements, since the macro
; returns all cycles in the same variable
;
; define cpuid and rdtsc instructions via macros
; this is not necessary is your assembler supports them
monitor_init:
IFDEF PProPII
pushfd
pushad
mov ecx,3
getcpuidtime:
cpuid
;rdtsc Not support with Tasm 3.1
db 0fh,031h
mov [cycle],eax
cpuid
;rdtsc
db 0fh,031h
sub eax,[cycle]
mov [cpuid_cycle],eax
dec ecx
jnz getcpuidtime
popad
popf
ENDIF
ret
; time_start - start timing point here
;
; input:
; none
;
; output:
; time_cycles initialized
;
; destroys:
; eax, ebx, ecx, edx
; eflags
time_start MACRO
IFDEF PProPII
cpuid
ENDIF
;rdtsc
db 0fh,031h
mov [time_cycles],eax
mov [time_cycles+4],edx
ENDM
; time_stop - stop timing point here
;
; input: none
;
; output:
; eax = low cycle count
; edx = high cycle count
;
; destroys:
; eax, ebx, ecx, edx
; eflags
time_stop MACRO
IFDEF PProPII
cpuid
ENDIF
;rdtsc
db 0fh,031h
sub eax,[time_cycles]
sbb edx,[time_cycles+4]
IFDEF ProPII
sub eax,[cpuid_cycle]
sbb edx,0
ENDIF
ENDM
Wouldn't CPUID destroy ECX?
With DOS I always used to use the 8253 (PIT) at 1.19 MHz
MichaelW wrote a set of macros for timing 16-bit code
http://www.masm32.com/board/index.php?topic=12540.msg96548#msg96548
(that thread should be made sticky!!!)
Quote from: dedndave on May 27, 2010, 03:39:17 PM
MichaelW wrote a set of macros for timing 16-bit code
http://www.masm32.com/board/index.php?topic=12540.msg96548#msg96548
(that thread should be made sticky!!!)
Thanks, I will see how it compares to my fixed version. At least I think so. :-)
; timeit.asm TASM code
; Help from Shoorick,
; time = cycles/frequency (733 Mhz for my machine)
; Shows 34 cycles for this block of code = 34/733 = .046 seconds
.model small
.486
.stack 200h
.data
time_cycles dd ?
cycle dd ?
cpuid_cycle dd ?
mycountlow dd ?
mycounthigh dd ?
.code
include monitor.asm
start:
mov ax,@data
mov ds,ax
call monitor_init
time_start
mov ax,45000 ; ax:dx contains 90,000,000
; ax = 4a80h
; dx = 55dh
; so, 55d4a80h = 90,000,000
mov bx,2000
mul bx
time_stop
mov [mycountlow],eax
mov [mycounthigh],edx
;mov eax,123456789
; print low cycle count
call PrtDec
int 3
xor eax,eax ; clear eax
;if edx is empty, we leave
cmp edx,0
je outta_here
; print high cycle count if necessary
mov eax,edx
call PrtDec
outta_here:
mov ax,4c00h
int 21h
; Print any number in eax
PrtDec proc
push eax
push ecx
push edx
mov ecx,0ffffffffh
push ecx
mov ecx,10
pd1:
mov edx,0
div ecx
add dl,30h
push edx
cmp eax,0
jne pd1
pd2:
pop edx
cmp edx,0ffffffffh
je pd3
mov ah,2
int 21h
jmp pd2
pd3:
pop edx
pop ecx
pop eax
ret
PrtDec endp
;exit:
;mov ax,4c00h
;int 21h
end start
Supposed to run under DOS, but here on a PII 433 MHz under NT Workstation, XOR should be one cycle or less depending on the execution ports.
LoopTime (980204) Copyright (C) 1992-1998 by Micro Solutions Inc., DeKalb IL.
Instruction Cycles Speed CPU
NOP 0.4 0.001 us Pentium II (Mendocino)
CLC 0.9 0.002 us 443.12 MHz
XOR DX,DX 0.8 0.002 us
XCHG DX,DX 1.9 0.004 us
MOV DX,DX 0.8 0.002 us
JMP $+2 5.5 0.012 us
LOOP $+2 11.2 0.025 us
DEC CX & JNZ $+2 3.0 0.007 us
IN AL,DX (0x025C) 1626.0 3.669 us 272525.67 Bps
OUT DX,AL 1738.3 3.923 us 254922.53 Bps
IN AX,DX 1783.6 4.025 us 496879.56 Bps
OUT DX,AX 1665.9 3.759 us 531997.15 Bps
IN EAX,DX 1728.1 3.900 us 1025666.83 Bps
OUT DX,EAX 1750.7 3.951 us 1012430.60 Bps
REP INSB 117.5 0.265 us 3771318.79 Bps
REP OUTSB 127.3 0.287 us 3480434.98 Bps
REP INSW 187.6 0.423 us 4724676.15 Bps
REP OUTSW 167.7 0.378 us 5285870.53 Bps
REP INSD 179.9 0.406 us 9851490.77 Bps
REP OUTSD 169.5 0.383 us 10456109.30 Bps
INC DX 0.7 0.001 us
ADD DX,2 0.8 0.002 us
ADD DL,DH 0.8 0.002 us
ADD DX,DX 0.8 0.002 us
SHL DX,1 0.6 0.001 us
PUSH AX & POP AX 1.9 0.004 us
XCHG DX,[DI] 20.0 0.045 us (Memory 22 ns)
MOV DX,[DI] 0.8 0.002 us
MOV [DI],DX 1.6 0.004 us
The single IO instructions have a huge overhead because each one traps, whereas the REP'd versions trap once.
Here, bodged up a little bit to run on a 3 GHz Prescott P4 (16-bit code, some nasty branch penalties)
LoopTime (980204) Copyright (C) 1992-1998 by Micro Solutions Inc., DeKalb IL.
Instruction Cycles Speed CPU
NOP 0.2 0.000 us Pentium IV
CLC 8.0 0.003 us 3138.18 MHz
XOR DX,DX 0.8 0.000 us
XCHG DX,DX 1.7 0.001 us
MOV DX,DX 0.8 0.000 us
JMP $+2 1.9 0.001 us
LOOP $+2 20.0 0.006 us
DEC CX & JNZ $+2 7.2 0.002 us
IN AL,DX (0x025C) 10235.6 3.262 us 306594.20 Bps
OUT DX,AL 9297.7 2.963 us 337521.82 Bps
IN AX,DX 9540.4 3.040 us 657873.40 Bps
OUT DX,AX 9676.6 3.083 us 648614.32 Bps
IN EAX,DX 9495.0 3.026 us 1322028.72 Bps
OUT DX,EAX 10480.0 3.340 us 1197779.17 Bps
REP INSB 653.7 0.208 us 4800539.76 Bps
REP OUTSB 703.4 0.224 us 4461265.22 Bps
REP INSW 769.5 0.245 us 8156164.58 Bps
REP OUTSW 690.7 0.220 us 9087397.93 Bps
REP INSD 742.3 0.237 us 16910908.34 Bps
REP OUTSD 722.9 0.230 us 17365430.78 Bps
INC DX 1.0 0.000 us
ADD DX,2 0.8 0.000 us
ADD DL,DH 3.0 0.001 us
ADD DX,DX 2.1 0.001 us
SHL DX,1 2.3 0.001 us
PUSH AX & POP AX 8.2 0.003 us
XCHG DX,[DI] 95.2 0.030 us (Memory 15 ns)
MOV DX,[DI] 1.0 0.000 us
MOV [DI],DX 1.7 0.001 us
Quote from: Magnum on May 27, 2010, 02:47:04 PM
call monitor_init
time_start
xor ax,ax
time_stop
mov [mycountlow],eax
mov [mycounthigh],edx
;mov eax,123456789
; print low cycle count
call PrtDec
xor eax,eax
; print high cycle count
mov eax,edx
call PrtDec
mov ax,4c00h
int 21h
; Print any number in eax
PrtDec proc
push eax
push ecx
push edx
mov ecx,0ffffffffh
push ecx
mov ecx,10
pd1:
mov edx,0
div ecx
add dl,30h
push edx
cmp eax,0
jne pd1
pd2:
pop edx
cmp edx,0ffffffffh
je pd3
mov ah,2
int 21h
jmp pd2
pd3:
pop edx
pop ecx
pop eax
ret
PrtDec endp
;exit:
;mov ax,4c00h
;int 21h
end start
I added this.
So, would I have to divide my results by 4 to get the average number of cycles?
mov ecx,4 ; Execute test code 4 times
meassureloop:
push ecx
time_start
mov eax,130000d ; something slow
mov ecx,260000d ; 26 decimal
cdq ; 1 clock
idiv ecx ; divide eax by ecx 43 clocks