News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Division by Constants

Started by dedndave, June 28, 2009, 06:39:54 PM

Previous topic - Next topic

drizz

dedndave,

my proc is only optimal, there is even faster routine by lingo but you'll have to search the board for it...
The truth cannot be learned ... it can only be recognized.

Jimg

Copyrighted, and used in a copyrighted program, are two entirely different things.
A copyrighted work may include any number of other copyrighted segments without affecting the original copyrights.
However, prohibiting use of a routine in a commercial program simply because it is commercial, is somewhat disingenuous.

For what it's worth, I'm with Drizz.  Any code posted by me on this forum is hereby released to the public domain and may be freely used for any purpose without restrictions or requiring reference as to origin.

jj2007

Quote from: Jimg on June 29, 2009, 07:49:04 PM
Copyrighted, and used in a copyrighted program, are two entirely different things.
A copyrighted work may include any number of other copyrighted segments without affecting the original copyrights.
However, prohibiting use of a routine in a commercial program simply because it is commercial, is somewhat disingenuous.

For what it's worth, I'm with Drizz.  Any code posted by me on this forum is hereby released to the public domain and may be freely used for any purpose without restrictions or requiring reference as to origin.

Jim, I am not an expert on these things. All I say is use my code but don't restrict its usage by others. I don't want to pay royalties for my own code. It is free - for everybody.

jj2007

Quote from: drizz on June 29, 2009, 07:32:26 PM
dedndave,

my proc is only optimal, there is even faster routine by lingo but you'll have to search the board for it...

You mean b2a3264?

dedndave

#19
i found a flaw in Drizz's latest - lol
these 2 lines are missing...

        OPTION  PROLOGUE:PrologueDef
        OPTION  EPILOGUE:EpilogueDef

i am going to put Drizz1, Drizz2, Dixon, and Lingo in a program to test them for correct values
while i am at it, i may as well get average timing results over the range of values
depends on how long it takes to run - lol

jj2007

Quote from: dedndave on June 29, 2009, 07:00:22 PM
here is some comparison timing
although, there seems to be a flaw someplace
still, Drizz's routine will be about 7 times faster than mine
i think i can convert my method a bit
i may not beat Drizz, but i can get somewhere in the same galaxy - lol

Congrats, you are in the same galaxy!
Sherlock Holmes found a missing counter_end after the call to drizz' algo :(


[attachment deleted by admin]

dedndave

#21
well, i know i can make mine faster, but i doubt it is worth the effort - lol
although, i am confident it returns the correct string for all input values
still, it is nice to know what i am working with
i think we need to update our timer macros, Jochen - lol

Drizz's latest has no branch's - lol
i like it

dedndave

Quotei am going to put Drizz1, Drizz2, Dixon, and Lingo in a program to test them for correct values
while i am at it, i may as well get average timing results over the range of values
depends on how long it takes to run - lol

well, so much for that idea - lol
just going through an empty loop for all 18,446,744,073,709,551,616 values would take about 408 years on my machine
(if i did my calculations right)
if i test every 65536th value, i can run an empty loop in a couple days

how do we verify our routines ?
ideas ? suggestions ?
even some tiny fraction of reliability testing is better than 0 - lol

drizz

#23
Testing for border values is always a good idea.

0,
1,10,100,...,10^19,
9,99,999,...,10^19-1,
0Fh,0FFh,0FFFh,...,2^64-1

btw, here's my latest code, fast non lut routine

;*****************************************************************
;
; uint64-to-string:
;
; split the uint64 value to two 10-digit numbers by magic divider method
; if possible use uint32-to-string nested subroutine, upper 10-digits will
; always fit to 32bit, but lower 10-digit must be adjusted dividing by 10 
; (again with magic divider method), then the remainder can be passed to
; uint32-to-string.
;
; uint32-to-string:
;
; split the uint32 value to two 5-digit numbers by magic divider method,
; then scale the 5-digit part by 2^32/10000 (68DB9h) to get decimal
; digit in edx (upper dword) after multiplication, subsequent digits
; are got multiplying by 10
;
;*****************************************************************

New_U64ToStr proc Value:QWORD, lpszBuffer:DWORD
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
locals = 4*4

sub esp,locals
mov [esp+0*4],ebp
mov [esp+1*4],esi
mov [esp+2*4],edi
mov [esp+3*4],ebx
mov esi,[esp+1*4][locals]; a0
mov edi,[esp+2*4][locals]; a1
mov ebp,[esp+3*4][locals]; lpszBuffer
test edi,edi
jnz @F
call DOFULL32NLZ
mov eax,ebp; lpszBuffer
mov ebp,[esp+0*4]
mov esi,[esp+1*4]
mov edi,[esp+2*4]
mov ebx,[esp+3*4]
add esp,locals
sub eax,[esp+3*4]
ret 3*4
@@:
_mul_64x64_top64 0BDEDD5BFh, 0DBE6FECEh; /10000000000

shr edi,1
mov esi,edi
jz TOP10ZERO

call DOFULL32NLZ
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; *1000000000000 == 02_540BE400h
;_mul_32x64_no_overflow
mov eax,0540BE400h; = b0
mul edi
lea edx,[2*edi+edx]; hidword is 2
;endm

mov esi,[esp+1*4][locals]; a0
mov edi,[esp+2*4][locals]; a1
sub esi,eax
sbb edi,edx
mov [esp+1*4][locals],esi; a0
mov [esp+2*4][locals],edi; a1
mov ebx,esi; a0
jnz @F
; the rest also fits to 32bit **
call DOFULL32LZ
mov eax,ebp; lpszBuffer
mov ebp,[esp+0*4]
mov esi,[esp+1*4]
mov edi,[esp+2*4]
mov ebx,[esp+3*4]
add esp,locals
sub eax,[esp+3*4]
ret 3*4

; ** If it does not there will be no leading zero
; so below code also works in this case
TOP10ZERO:
mov esi,[esp+1*4][locals]; a0
mov edi,[esp+2*4][locals]; a1
mov ebx,esi
@@:
add esi,1
adc edi,0
_mul_64x64_top64 01B478423h, 0A7C5AC47h; /100000
shrd esi,edi,16
mov eax,100000
mov ecx,esi
mov esi,ebx
mul ecx
sub esi,eax
mov eax,68DB9h;2^32/10000
mul ecx
mov ebx,ecx
mov ecx,10
call DOFULL32NLZSTART

shrd esi,edi,16
mov eax,100000
mul esi
sub ebx,eax
mov ecx,10
mov eax,68DB9h;2^32/10000
mul esi
mov esi,ebx
call DOFULL32NLZSTART

mov eax,ebp; lpszBuffer
mov ebp,[esp+0*4]
mov esi,[esp+1*4]
mov edi,[esp+2*4]
mov ebx,[esp+3*4]
add esp,locals
sub eax,[esp+3*4]
ret 3*4

;; esi == value
;; ebp == buffer
DOFULL32NLZ::
mov eax,0A7C5AC47h; magic div /100000
mul esi
add eax,0A7C5AC47h; correction for 0FFFFFFFFh
adc edx,0
mov ecx,10
shr edx,16
imul eax,edx,100000
sub esi,eax;/100000 remainder
mov ebx,edx
mov eax,68DB9h;2^32/10000
mul edx
DOFULL32NLZSTART:
test ebx,ebx
jz NEXT5
cmp ebx,9999
ja DIGIT0
cmp ebx,999
ja DIGIT1
add eax,eax
cmp ebx,99
lea eax,[eax*4+eax]
ja DIGIT2
add eax,eax
cmp ebx,9
lea eax,[eax*4+eax]
ja DIGIT3
add eax,eax
test ebx,ebx
lea eax,[eax*4+eax]
jnz DIGIT4
NEXT5:
mov eax,68DB9h;2^32/10000
mul esi
cmp esi,9999
ja DIGIT5
cmp esi,999
ja DIGIT6
add eax,eax
cmp esi,99
lea eax,[eax*4+eax]
ja DIGIT7
add eax,eax
cmp esi,9
lea eax,[eax*4+eax]
ja DIGIT8
lea edx,[esi+'0']
jmp DIGIT9

DOFULL32LZ:
mov eax,0A7C5AC47h; magic div /100000
mul esi
add eax,0A7C5AC47h; correction for 0FFFFFFFFh
adc edx,0
mov ecx,10
shr edx,16
imul eax,edx,100000
sub esi,eax;/100000 remainder
mov ebx,edx
mov eax,68DB9h;2^32/10000
mul edx

DIGIT0:
add dl,'0'
mov [ebp],dl
inc ebp
i = 1
rept 7
@CatStr(<DIGIT>,%(i)):
if i eq 0; first five
elseif i eq 5; next five
mov eax,68DB9h;2^32/10000
mul esi
elseif i gt 0
mul ecx
endif
add dl,'0'
mov [ebp],dl
inc ebp
i = i + 1
endm
DIGIT8:
mul ecx
add dl,'0'
mov [ebp],dl
inc ebp
mul ecx
add dl,'0'
DIGIT9:
mov [ebp],dx
inc ebp
retn
OPTION PROLOGUE:PROLOGUEDEF
OPTION EPILOGUE:EPILOGUEDEF
New_U64ToStr endp
%echo New_U64ToStr: @CatStr(%($-New_U64ToStr)) bytes

; edi::esi *= x
_mul_64x64_top64 macro __x_lo:req, __x_hi:req
mov eax,dword ptr __x_lo; = b0
mul esi;__y_lo; get a0*b0 = d1:d0
mov ecx,edx; d1
mov eax,dword ptr __x_hi; = b0
mul esi;__y_lo; get a1*b0 = e1:e0
add ecx,eax;e0
mov esi,0
adc esi,edx;e1
mov eax,dword ptr __x_lo; =b1
mul edi;__y_hi; get a0*b1 = f1:f0
add ecx,eax;f0
mov eax,dword ptr __x_hi; =b1
adc esi,edx;f1
mov edx,edi
mov edi,0
adc edi,edi
mul edx;__y_hi; get a1*b1 = g1:g0
add esi,eax
adc edi,edx
endm

New_Int64ToStr proc QwValue:QWORD,pBuffer:DWORD
mov eax,[esp+1*4];U64.Lo
mov edx,[esp+2*4];U64.Hi
mov ecx,[esp+3*4];buff
test edx,edx
jns New_U64ToStr
mov byte ptr [ecx],'-'
xor eax,-1
xor edx,-1
inc ecx
add eax,1
adc edx,0
invoke New_U64ToStr,edx::eax,ecx
inc eax
ret 3*4
New_Int64ToStr endp

New_Int32ToStr proc dwValue,pBuffer
mov eax,[esp+4];dwValue
mov edx,[esp+8];pBuffer
test eax,eax
jns New_U32ToStr
mov byte ptr [edx],'-'
neg eax
push ebp
push ebx
push esi
mov esi,eax
lea ebp,[edx+1]
call DOFULL32NLZ
mov eax,ebp;lpszBuffer
pop esi
pop ebx
pop ebp
sub eax,[esp+2*4]; strlen
ret 2*4
New_Int32ToStr endp

New_U32ToStr proc dwValue,pBuffer
push ebp
push ebx
push esi
mov esi,[esp+1*4][3*4]; a0
mov ebp,[esp+2*4][3*4]; lpszBuffer
call DOFULL32NLZ
mov eax,ebp;lpszBuffer
pop esi
pop ebx
pop ebp
sub eax,[esp+2*4]; strlen
ret 2*4
New_U32ToStr endp



edit: code updated to slightly improved verison + added suplemental procs
The truth cannot be learned ... it can only be recognized.

dedndave

cool Drizz - is it faster ?
(lol - magic diver)

drizz

 :bg, it's not faster than the LUT routine (pdixon, lingo) but it is very close.
it is 2-3 times faster than my previous functions.

The truth cannot be learned ... it can only be recognized.

dedndave

close is great - the Dixon method is a little clumsy with the table and all
1 kb to convert to decimal - lol
i see this one has branches - i was a fan of the last one   :(