News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

16 bit performance timer

Started by Magnum, May 27, 2010, 02:47:04 PM

Previous topic - Next topic

Magnum

I am using material from here.

http://www.df.lth.se/~john_e/fr_gems.html

340 cycles can't be right for a simple xor.



; timeit.asm TASM code
; Help from Shoorick,
;
; Times a single "xor ax,ax" with the time_start / time_stop macros from
; monitor.asm and prints the elapsed cycle count in decimal: the low
; dword, a space, then the high dword.
;
; Converting cycles to time:  time = cycles / frequency
;   340 cycles / 733 MHz = 0.46 microseconds (not seconds).
;
; The printed figure is not the cost of the xor alone: time_start stores
; the timestamp after reading it, so those two stores and the second
; RDTSC are inside the measured interval as well.
.model small
.486
.stack 200h

.data

; time_start writes a 64-bit timestamp to time_cycles and time_cycles+4,
; so two dwords are reserved (one dword would let the high half spill
; into the next variable).
time_cycles dd 2 dup (?)
cycle dd ?
cpuid_cycle dd ?
mycountlow dd ?
mycounthigh dd ?

.code

include monitor.asm

start:

mov ax,@data
mov ds,ax

call monitor_init
time_start
xor ax,ax
time_stop
mov [mycountlow],eax
mov [mycounthigh],edx

             ; print low cycle count (still in eax)
             call         PrtDec

             ; separate the two numbers so they do not run together
             mov          dl,' '
             mov          ah,2
             int          21h

             ; print high cycle count; reloaded from memory because the
             ; separator output above overwrote dl
             mov          eax,[mycounthigh]
             call         PrtDec

             ; terminate with return code 0
             mov          ax,4c00h
             int          21h
; PrtDec - write the unsigned 32-bit value in eax to the console in decimal.
;
; input:
;   eax = value to print
;
; output:
;   one character per digit, written with int 21h function 02h
;
; destroys:
;   eflags (eax, ecx and edx are saved and restored)
PrtDec       proc
             push         eax
             push         ecx
             push         edx

             mov          ecx,-1                ; end-of-digits marker
             push         ecx
             mov          ecx,10                ; decimal radix

prt_split:                                      ; peel digits off, least significant first
             xor          edx,edx               ; edx:eax = value, zero-extended
             div          ecx                   ; eax = quotient, edx = remainder (0..9)
             add          dl,'0'                ; remainder -> ASCII digit
             push         edx
             test         eax,eax
             jnz          prt_split

             jmp          short prt_fetch

prt_emit:                                       ; digits pop off most significant first
             mov          ah,2
             int          21h
prt_fetch:
             pop          edx
             cmp          edx,-1                ; reached the marker?
             jne          prt_emit

             pop          edx
             pop          ecx
             pop          eax
             ret
PrtDec       endp

;exit:
;mov ax,4c00h
;int 21h

end start


; monitor.asm Used with timeit.asm
; Performance monitoring package
; Help from Shoorick
;
; define PProPII if your CPU is a Pentium Pro or a Pentium II
;
;
;   monitor_init
;     initializes the package
;
;   time_start
;     start cycle count here
;
;   time_stop
;     stop counting here
;
;   the package can not do nested measurements, since the macro
;   returns all cycles in the same variable
;
; define cpuid and rdtsc instructions via macros
; this is not necessary if your assembler supports them

; monitor_init - initialize the timing package
;
; When PProPII is defined, times a CPUID + RDTSC pair three times and
; leaves the low dword of the last measurement in cpuid_cycle.
; When PProPII is not defined, this routine only returns.
;
; input:
;   none
;
; output:
;   cpuid_cycle (and the scratch variable cycle) written, PProPII only
;
; destroys:
;   nothing - all general registers and the flags are saved and restored
monitor_init:

  IFDEF PProPII
          pushfd
          pushad

          ; The pass counter lives in esi: CPUID overwrites eax, ebx,
          ; ecx and edx, so none of those can hold it.
          mov     esi,3
  getcpuidtime:
          xor     eax,eax               ; always request the same CPUID leaf
          db      0fh,0a2h              ; cpuid, hand-encoded like rdtsc below

          ;rdtsc Not support with Tasm 3.1
          db      0fh,031h

          mov     [cycle],eax
          xor     eax,eax
          db      0fh,0a2h              ; cpuid

          ;rdtsc
          db      0fh,031h

          sub     eax,[cycle]
          mov     [cpuid_cycle],eax
          dec     esi
          jnz     getcpuidtime

          popad
          popfd                         ; must match the 32-bit pushfd above
  ENDIF
          ret

; time_start - start timing point here
;
; Reads the time-stamp counter and saves it for time_stop. When PProPII
; is defined a CPUID is executed first.
;
; input:
;   none
;
; output:
;   time_cycles   = low dword of the starting timestamp
;   time_cycles+4 = high dword of the starting timestamp
;   (time_cycles must therefore be 8 bytes long)
;
; destroys:
;   eax, edx (and ebx, ecx when PProPII is defined)
;   eflags

time_start MACRO

  IFDEF PProPII
          xor     eax,eax               ; always request the same CPUID leaf
          db      0fh,0a2h              ; cpuid, hand-encoded like rdtsc below
  ENDIF
          ;rdtsc
          db      0fh,031h
          mov     [time_cycles],eax
          mov     [time_cycles+4],edx
  ENDM

; time_stop - stop timing point here
;
; Reads the time-stamp counter again and subtracts the value saved by
; time_start. When PProPII is defined a CPUID is executed first and the
; overhead stored in cpuid_cycle is subtracted from the result.
;
; input: none
;
; output:
;   eax = low cycle count
;   edx = high cycle count
;
; destroys:
;   eax, edx (and ebx, ecx when PProPII is defined)
;   eflags

time_stop   MACRO

  IFDEF PProPII
          xor     eax,eax               ; always request the same CPUID leaf
          db      0fh,0a2h              ; cpuid, hand-encoded like rdtsc below
  ENDIF
        ;rdtsc
        db      0fh,031h
        sub     eax,[time_cycles]
        sbb     edx,[time_cycles+4]

  ; Same symbol as the other conditionals in this package; the misspelt
  ; name "ProPII" meant this correction was never assembled.
  IFDEF PProPII
          sub     eax,[cpuid_cycle]
          sbb     edx,0
  ENDIF

ENDM

Have a great day,
                         Andy

clive

Wouldn't CPUID destroy ECX?

With DOS I always used to use the 8253 (PIT) at 1.19 MHz
It could be a random act of randomness. Those happen a lot as well.

dedndave

MichaelW wrote a set of macros for timing 16-bit code

http://www.masm32.com/board/index.php?topic=12540.msg96548#msg96548

(that thread should be made sticky!!!)

Magnum

Quote from: dedndave on May 27, 2010, 03:39:17 PM
MichaelW wrote a set of macros for timing 16-bit code

http://www.masm32.com/board/index.php?topic=12540.msg96548#msg96548

(that thread should be made sticky!!!)

Thanks, I will see how it compares to my fixed version. At least I think so. :-)


; timeit.asm TASM code
; Help from Shoorick,
;
; Times a 16-bit multiply (45000 * 2000) with the time_start / time_stop
; macros from monitor.asm and prints the elapsed cycle count in decimal:
; the low dword, then - only if it is non-zero - a space and the high dword.
;
;             time = cycles/frequency (733 MHz for my machine)
; Shows 34 cycles for this block of code = 34 / 733 MHz = 0.046 microseconds
.model small
.486
.stack 200h

.data

; time_start writes a 64-bit timestamp to time_cycles and time_cycles+4,
; so two dwords are reserved (one dword would let the high half spill
; into the next variable).
time_cycles dd 2 dup (?)
cycle dd ?
cpuid_cycle dd ?
mycountlow dd ?
mycounthigh dd ?

.code

include monitor.asm

start:

mov ax,@data
mov ds,ax

call monitor_init
time_start

             mov          ax,45000   ; after the mul, dx:ax contains 90,000,000
                                     ; ax = 4a80h
                                     ; dx = 55dh
                                     ; so,  55d4a80h = 90,000,000
             mov          bx,2000
             mul          bx


time_stop
mov [mycountlow],eax
mov [mycounthigh],edx

             ; print low cycle count (still in eax)
             call         PrtDec

             ; if the high dword is zero, we leave
             test         edx,edx
             jz           outta_here

             ; separate the two numbers so they do not run together
             mov          dl,' '
             mov          ah,2
             int          21h

             ; print high cycle count; reloaded from memory because the
             ; separator output above overwrote dl
             mov          eax,[mycounthigh]
             call         PrtDec
outta_here:
             ; terminate with return code 0
             mov          ax,4c00h
             int          21h
; PrtDec - print the unsigned 32-bit number in eax as decimal text.
;
; input:
;   eax = number to print
;
; output:
;   digits sent to the console through int 21h function 02h
;
; destroys:
;   eflags (every general register used here is saved and restored)
PrtDec       proc
             push         eax
             push         ebx
             push         ecx
             push         edx

             mov          ebx,10                ; decimal radix
             xor          cx,cx                 ; cx = number of digits stacked so far

pd_divide:                                      ; produce digits, least significant first
             xor          edx,edx               ; edx:eax = value, zero-extended
             div          ebx                   ; eax = quotient, edx = remainder (0..9)
             add          dl,'0'                ; remainder -> ASCII digit
             push         dx
             inc          cx
             test         eax,eax
             jnz          pd_divide

pd_print:                                       ; unstack them, most significant first
             pop          dx
             mov          ah,2
             int          21h
             dec          cx
             jnz          pd_print

             pop          edx
             pop          ecx
             pop          ebx
             pop          eax
             ret
PrtDec       endp

;exit:
;mov ax,4c00h
;int 21h

end start

Have a great day,
                         Andy

clive

Supposed to run under DOS, but here on a PII 433 MHz under NT Workstation, XOR should be one cycle or less depending on the execution ports.

LoopTime (980204) Copyright (C) 1992-1998 by Micro Solutions Inc., DeKalb IL.

Instruction Cycles Speed CPU

NOP    0.4 0.001 us Pentium II (Mendocino)
CLC    0.9 0.002 us 443.12 MHz
XOR DX,DX    0.8 0.002 us
XCHG DX,DX    1.9 0.004 us
MOV DX,DX    0.8 0.002 us
JMP $+2    5.5 0.012 us
LOOP $+2   11.2 0.025 us
DEC CX & JNZ $+2    3.0 0.007 us
IN AL,DX (0x025C) 1626.0 3.669 us 272525.67 Bps
OUT DX,AL 1738.3 3.923 us 254922.53 Bps
IN AX,DX 1783.6 4.025 us 496879.56 Bps
OUT DX,AX 1665.9 3.759 us 531997.15 Bps
IN EAX,DX 1728.1 3.900 us 1025666.83 Bps
OUT DX,EAX 1750.7 3.951 us 1012430.60 Bps
REP INSB 117.5 0.265 us 3771318.79 Bps
REP OUTSB 127.3 0.287 us 3480434.98 Bps
REP INSW 187.6 0.423 us 4724676.15 Bps
REP OUTSW 167.7 0.378 us 5285870.53 Bps
REP INSD 179.9 0.406 us 9851490.77 Bps
REP OUTSD 169.5 0.383 us 10456109.30 Bps
INC DX    0.7 0.001 us
ADD DX,2    0.8 0.002 us
ADD DL,DH    0.8 0.002 us
ADD DX,DX    0.8 0.002 us
SHL DX,1    0.6 0.001 us
PUSH AX & POP AX    1.9 0.004 us
XCHG DX,[DI]   20.0 0.045 us (Memory 22 ns)
MOV DX,[DI]    0.8 0.002 us
MOV [DI],DX    1.6 0.004 us


The single IO instructions have a huge overhead because each one traps, whereas the REP'd versions trap once.

Here, bodged up a little bit to run on a 3 GHz Prescott P4 (16-bit code, some nasty branch penalties)

LoopTime (980204) Copyright (C) 1992-1998 by Micro Solutions Inc., DeKalb IL.

Instruction Cycles Speed CPU

NOP    0.2 0.000 us Pentium IV
CLC    8.0 0.003 us 3138.18 MHz
XOR DX,DX    0.8 0.000 us
XCHG DX,DX    1.7 0.001 us
MOV DX,DX    0.8 0.000 us
JMP $+2    1.9 0.001 us
LOOP $+2   20.0 0.006 us
DEC CX & JNZ $+2    7.2 0.002 us
IN AL,DX (0x025C) 10235.6 3.262 us 306594.20 Bps
OUT DX,AL 9297.7 2.963 us 337521.82 Bps
IN AX,DX 9540.4 3.040 us 657873.40 Bps
OUT DX,AX 9676.6 3.083 us 648614.32 Bps
IN EAX,DX 9495.0 3.026 us 1322028.72 Bps
OUT DX,EAX 10480.0 3.340 us 1197779.17 Bps
REP INSB 653.7 0.208 us 4800539.76 Bps
REP OUTSB 703.4 0.224 us 4461265.22 Bps
REP INSW 769.5 0.245 us 8156164.58 Bps
REP OUTSW 690.7 0.220 us 9087397.93 Bps
REP INSD 742.3 0.237 us 16910908.34 Bps
REP OUTSD 722.9 0.230 us 17365430.78 Bps
INC DX    1.0 0.000 us
ADD DX,2    0.8 0.000 us
ADD DL,DH    3.0 0.001 us
ADD DX,DX    2.1 0.001 us
SHL DX,1    2.3 0.001 us
PUSH AX & POP AX    8.2 0.003 us
XCHG DX,[DI]   95.2 0.030 us (Memory 15 ns)
MOV DX,[DI]    1.0 0.000 us
MOV [DI],DX    1.7 0.001 us
It could be a random act of randomness. Those happen a lot as well.

Magnum

Quote from: Magnum on May 27, 2010, 02:47:04 PM


call monitor_init
time_start
xor ax,ax
time_stop
mov [mycountlow],eax
mov [mycounthigh],edx

             ; print low cycle count (still in eax)
             call         PrtDec

             ; separate the two numbers so they do not run together
             mov          dl,' '
             mov          ah,2
             int          21h

             ; print high cycle count; reloaded from memory because the
             ; separator output above overwrote dl
             mov          eax,[mycounthigh]
             call         PrtDec

             ; terminate with return code 0
             mov          ax,4c00h
             int          21h
; PrtDec - show the unsigned 32-bit value held in eax as a decimal number.
;
; input:
;   eax = value to show
;
; output:
;   ASCII digits written via int 21h function 02h
;
; destroys:
;   eflags (eax, ecx and edx come back unchanged)
PrtDec       proc
             push         eax
             push         ecx
             push         edx

             xor          cx,cx
             push         cx                    ; zero word marks the end of the digits
             mov          ecx,10                ; decimal radix

dec_next:                                       ; stack one ASCII digit per pass
             xor          edx,edx               ; edx:eax = value, zero-extended
             div          ecx                   ; eax = quotient, edx = remainder (0..9)
             or           dl,30h                ; 0..9 -> '0'..'9'
             push         dx
             test         eax,eax
             jnz          dec_next

dec_show:                                       ; pop and print until the zero marker
             pop          dx
             test         dx,dx
             jz           dec_done
             mov          ah,2
             int          21h
             jmp          dec_show

dec_done:
             pop          edx
             pop          ecx
             pop          eax
             ret
PrtDec       endp

;exit:
;mov ax,4c00h
;int 21h

end start

I added this.
So, would I have to divide my results by 4 to get the average number of cycles?


       ; Start of a loop that runs the timed section four times.
       ; NOTE(review): the rest of the loop (time_stop, pop ecx, the
       ; decrement/branch back to meassureloop) is not shown in this post.
       ; Dividing by 4 only gives an average if the per-pass counts are
       ; summed somewhere - confirm against the full listing.
       mov     ecx,4   ; Execute test code 4 times
meassureloop:
        push    ecx     ; save the pass counter; ecx is reused as the divisor below

time_start

mov eax,130000d  ; dividend - something slow
mov ecx,260000d ; divisor: 260000 decimal
cdq      ; sign-extend eax into edx:eax for idiv (author's figure: 1 clock)
idiv ecx ; divide edx:eax by ecx (author's figure: 43 clocks)


Have a great day,
                         Andy