News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Division by Constants

Started by dedndave, June 28, 2009, 06:39:54 PM

Previous topic - Next topic

dedndave

as i mentioned in a previous post, yahoo is shutting down geocities free webhosting (scheduled for Oct 26th, 2009)
as such, i have been rifling through all my geocities bookmarks and trying to extract any "goodies" i may want later on
some of my bookmarks are for code tricks and tips
while following some of the links, i found this little jewel
compressed, it is small enough to share on a forum link

of course, there has been a recent trend of multiplying to divide
this is not a new technique, at all
but it has seen some recent popularity
this PDF shows some extensions on the idea

[attachment deleted by admin]

ramguru

Thanks, I actually was looking recently for div_by_10 substitution.
But .. I doubt that it would be faster  :lol w/o div instruction.
Anyway this kind of brain-food taste good :}

dedndave

#2
i am working on a division by 10 also
or division by 100, actually (i will use AAM to convert 2 digits at once)
the problem with multiplying to divide is that the remainder is not typically sought
the remainder information is there, it just has to be re-multiplied
i am working on a solution
i am trying to make a fast 64-bit unsigned integer to decimal ascii routine without using the FPU
i have a good working routine, now, but i am trying to improve on it

ramguru

hope you'll share it soon with some comparison results  :U

dedndave

i'd be happy to send you my current routine - lol
i intend to write a little paper on it, as well, before sharing it with the masm32 community

jj2007

Quote from: dedndave on June 28, 2009, 07:22:28 PM
i'd be happy to send you my current routine - lol
i intend to write a little paper on it, as well, before sharing it with the masm32 community

You know drizz' version, I suppose... I am eager to see some timings :bg

dedndave

well, yes, but his uses mmx
i would be interested to see how fast my current virgin is
i think it will be surprising
also, mine is written to pass the qword in edx:eax and saves no registers
adding push's/pop's is easy enough
i need someone who is familiar with the current set of timing macros - lol

Jimg

Was that a typo or a Freudian slip? :bg

And drizz's routine isn't mmx.

dedndave

lol
so it isn't
that thing sure is fast - gonna be hard to improve on it

drizz

Quote from: dedndave on June 29, 2009, 01:57:51 AMthat thing sure is fast - gonna be hard to improve on it
well, i can try...  ::)



comment #
- fixed point arithmetic conversion of Unsigned 64 integer to string -
- Multiply 64bit unsigned integer by 2^127/10^19
- make correction to prevent precision loss
- first digit will be at bit offset 127 (0 or 1)
- [1]8446744073709551615, first digit can only be 0 or 1
- after handling the first digit, we get subsequent digits multiplying by 10
- the "point" is moved to end so all other digits are constructed from shifted "out" bits (multiplying by 10)
#

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
_U64ToStrNLZ proc val:QWORD, pbuff:PTR BYTE

__locals equ 4*4+4*4+4;;//4*4=regs + 4*4=128bit temp
__saveregs equ <dword ptr [esp]>
__128bittmp equ <dword ptr [esp+4*4]>
__NLZmask equ <dword ptr [esp+4*4+4*4]>
__val_hi equ <dword ptr [esp+__locals+2*4]>
__val_lo equ <dword ptr [esp+__locals+1*4]>
__pbuff equ <dword ptr [esp+__locals+3*4]>
__cnst_hi equ 0EC1E4A7Dh
__cnst_lo equ 0B69561A5h
__correction_lo equ 0E30437FBh
__correction_hi equ 0AD7B4F1Bh

sub esp,__locals
mov __saveregs[0*4],ebp
mov __saveregs[1*4],esi
mov __saveregs[2*4],edi
mov __saveregs[3*4],ebx

;;// 64bit x 64bit = 128bit result
mov eax,__cnst_lo;//__y_lo; = b0
mul __val_lo;;// get a0*b0 = d1:d0
mov esi,eax ;mov __128bittmp[0*4],eax;;//d0
mov ecx,edx;;//d1
mov eax,__cnst_lo;//__y_lo; = b0
xor ebx,ebx
mul __val_hi;;// get a1*b0 = e1:e0
add ecx,eax;;//e0
adc ebx,edx;;//e1
mov eax,__cnst_hi;//__y_hi; =b1
mul __val_lo;;// get a0*b1 = f1:f0
add ecx,eax;;//f0
adc ebx,edx;;//f1
mov edi,ecx;mov __128bittmp[1*4],ecx
mov ecx,0
mov eax,__cnst_hi;//__y_hi; =b1
adc ecx,ecx
mul __val_hi;;// get a1*b1 = g1:g0
add eax,ebx;;//g0
adc edx,ecx;;//g1
mov ebp,__pbuff
;// -------------------------------
;// -------------------------------
xor ebx,ebx
add esi,__correction_lo
adc edi,__correction_hi
adc eax,ebx
adc edx,ebx

;// first digit 0 or 1, zero not written

mov ebx,edx
sar ebx,31
lea ecx,[ebx+'2']
mov byte ptr [ebp],cl
sub ebp,ebx
mov __NLZmask,ebx

;// second digit, 128 bits needed
;// shift 2:
;// - account for the first digit
;// - multiply by 2

shld ebx,edx,2
shld edx,eax,2
shld eax,edi,2
shld edi,esi,2
shl esi,2
and ebx,1; only bit 0
;// mul by 5
mov __128bittmp[0*4],esi
mov __128bittmp[1*4],edi
mov __128bittmp[2*4],eax
mov __128bittmp[3*4],edx
mov ecx,ebx
shld ebx,edx,2
shld edx,eax,2
shld eax,edi,2
shld edi,esi,2
shl esi,2
add esi,__128bittmp[0*4]
adc edi,__128bittmp[1*4]
adc eax,__128bittmp[2*4]
adc edx,__128bittmp[3*4]
adc ecx,ebx; X*2 + X*5
cmp ecx,1
sbb ebx,ebx
xor ebx,-1
add ecx,'0'
or __NLZmask,ebx
mov byte ptr [ebp],cl
sub ebp,__NLZmask

;// third digit, 128 bits needed
xor ebx,ebx
add esi,esi
adc edi,edi
adc eax,eax
adc edx,edx
adc ebx,ebx
mov __128bittmp[0*4],esi
mov __128bittmp[1*4],edi
mov __128bittmp[2*4],eax
mov __128bittmp[3*4],edx
mov ecx,ebx
shld ebx,edx,2
shld edx,eax,2
shld eax,edi,2
shld edi,esi,2
shl esi,2
add esi,__128bittmp[0*4]
adc edi,__128bittmp[1*4]
adc eax,__128bittmp[2*4]
adc edx,__128bittmp[3*4]
adc ecx,ebx
cmp ecx,1
sbb ebx,ebx
xor ebx,-1
add ecx,'0'
or __NLZmask,ebx
mov byte ptr [ebp],cl
sub ebp,__NLZmask

;// mul by 10 the rest not using lower qword anymore
;184
;   46744073709551615

REPT 16
xor ecx,ecx
add eax,eax;
adc edx,edx;
adc ecx,ecx
mov esi,eax;
mov edi,edx;
mov ebx,ecx
shld ecx,edx,2;
shld edx,eax,2;
shl eax,2;
add eax,esi;
adc edx,edi;
adc ecx,ebx
cmp ecx,1
sbb ebx,ebx
xor ebx,-1
add ecx,'0'
or __NLZmask,ebx
mov byte ptr [ebp],cl
sub ebp,__NLZmask
ENDM

xor ecx,ecx
add eax,eax;
adc edx,edx;
adc ecx,ecx
mov esi,eax;
mov edi,edx;
mov ebx,ecx
shld ecx,edx,2;
shld edx,eax,2;
shl eax,2;
add eax,esi;
adc edx,edi;
adc ecx,ebx
cmp ecx,1
sbb ebx,ebx
xor ebx,-1
add ecx,'0'
mov edx,__pbuff
or __NLZmask,ebx
mov word ptr [ebp],cx
sub ebp,__NLZmask
mov eax,ebp
;// -------------------------------
mov ebp,__saveregs[0*4]
mov esi,__saveregs[1*4]
mov edi,__saveregs[2*4]
mov ebx,__saveregs[3*4]
add esp,__locals
sub eax,edx
ret 8+4
_U64ToStrNLZ endp
The truth cannot be learned ... it can only be recognized.

Jimg

Hi drizz-

What's the license on your routine?   Every good one I've found so far (e.g.. jj's float$, Raymond's fbulib) prohibits commercial usage, something I just can't understand.

dedndave

i have one you may use freely - it uses the standard multiple precision division method
not even close to being as fast as the one Drizz wrote, though - lol
(mine is smaller - lol - big deal)

        .DATA

AscBuf  DB      '01234567890123456789',0  ;20 ASCII digits

        .CODE

Asc64   PROC

;Convert 64-bit unsigned integer to ASCII decimal string
;
;Call With: EDX:EAX= QWORD value to convert
;
;  Returns: EDI= Offset into AscBuf of first numchar

        std
        mov     edi,offset AscBuf+18
        mov     ecx,edx
        xchg    eax,esi
        mov     ebx,100

Asc64a: xor     edx,edx
        xchg    eax,ecx
        div     ebx
        xchg    eax,ecx
        xchg    eax,esi
        div     ebx
        xchg    eax,esi
        xchg    eax,edx
        aam
        xchg    al,ah
        or      ax,3030h
        stosw
        mov     eax,ecx
        or      eax,esi
        jnz     Asc64a

        inc     edi
        inc     edi
        cld
        cmp byte ptr [edi],30h       ;leading 0 ?
        jz      Asc64b               ;yes - supress it

        ret                          ;no - done

Asc64b: inc     edi
        ret

Asc64   ENDP

drizz

Quote from: Jimg on June 29, 2009, 04:32:36 PMWhat's the license on your routine?   Every good one I've found so far (e.g.. jj's float$, Raymond's fbulib) prohibits commercial usage, something I just can't understand.
All source code posted on this forum under alias "drizz" is in the public domain.
The truth cannot be learned ... it can only be recognized.

dedndave

here is some comparison timing
although, there seems to be a flaw someplace
still, Drizz's routine will be about 7 times faster than mine
i think i can convert my method a bit
i may not beat Drizz, but i can get somewhere in the same galaxy - lol

[attachment deleted by admin]

jj2007

Quote from: Jimg on June 29, 2009, 04:32:36 PM
Hi drizz-

What's the license on your routine?   Every good one I've found so far (e.g.. jj's float$, Raymond's fbulib) prohibits commercial usage, something I just can't understand.

It's not that complicated: Once code is posted here as "public domain", the original authors will not accept to see it copyrighted by somebody who asks money for it...