sometimes, we want to use multiply-to-divide, but we also want the remainder after division
the concept in this proc may be used for many things
i use multiply-by-inverse-to-divide with a little extra resolution, then re-constitute the fraction bits into the remainder (or modulo)
i have tested all 9,999 values
i think this is a bit faster than what we have been using - i am very pleased with the speed
on my speed-funky prescott, it is about 50% faster than using DIV
if i remove the AAM's and ascii stuff, and just compare division, it is roughly twice as fast as a DIV
of course, we don't generally use it in PROC form...
(source file attached)
;-------------------------------------------------------------------------------------
A9999 PROC
;convert value in eax (0 to 9,999) into four ASCII digits
;DednDave, 9-2009
;uses increased-precision multiply-by-inverse-to-divide
;(requires 3 more bits of resolution than normal to acquire the remainder)
;multiply by 5243
mov edx,eax
shl eax,1
add edx,eax
shl eax,2
add edx,eax
shl eax,1
add edx,eax
shl eax,1
add edx,eax
shl eax,1
add edx,eax
shl eax,4
add edx,eax
shl eax,2
add eax,edx
;divide by 8
shr eax,3
;fraction in edx
xor edx,edx
mov dl,ah
;multiply the fraction by 100 and round it off to get the remainder
shl edx,2
mov ax,dx
shl edx,3
add ax,dx
shl edx,1
add ax,dx
shl al,1
adc ah,0
;low-order digits in al - split into two digits in ax
mov al,ah
aam
;rearrange bytes - high order digits in ah
bswap eax
;high-order digits in al - split into two digits in ax
mov al,ah
aam
;rearrange two high-order digits
xchg al,ah
;make all four digits ASCII numeric characters
or eax,30303030h
ret
A9999 ENDP
;-------------------------------------------------------------------------------------
I use fixed point arithmetic EDX.EAX
mov edx,4294968; Round ((2^32) / 1000)
mov ebx,10
mul edx
mov ecx,edx
mul ebx
mov ch,dl
mul ebx
push edx
mul ebx
pop eax
mov ah,dl
shl eax,16
lea eax,[eax+ecx+'0000']
i got 29 cycles for yours - 18 for mine - 34 for DIV by 10000
EDIT - hang on - i was measuring only the DIV part of mine - lol
that is fast, Drizz
i'll see if i can get it down, now that i know what i am up against - lol
i know the AAM's and BSWAP are killing me
EDIT - probably the best i could do is to match you, and that won't be easy - lol - i'll play with it
dang - i used to smoke with those shifts and rotates in the old days
let me correct that, Drizz
i have yours at 21 cycles on a prescott - only 5 times faster than mine - lol
i didn't realize how fast a MUL was
kind of kills the shift/rotate advantage
your code does use a few registers, though
i am trying something new (very old, actually) - this is a small piece of it
for what i am doing, all the registers are full
i will move some things around
Just for fun :
:eek
.data
align 16
mn1 label oword
dd 429497, 10, 4294968, 10;(2^32/10^3)/10, (2^32/10^3)
mn2 label oword
dd 42949680, 0, 429496800, 0;(2^32/10^3)*10, (2^32/10^3)*100
.code
movd xmm0,eax
movdqu xmm2,mn1
pshufd xmm0,xmm0,01000100b
movdqu xmm1,xmm0
pmuludq xmm0,xmm2
psrldq xmm2,4
pmuludq xmm1,mn2
pmuludq xmm0,xmm2
pmuludq xmm1,xmm2
pshufd xmm0,xmm0,00001101b
pshufd xmm1,xmm1,00001101b
punpcklqdq xmm0,xmm1
packsswb xmm0,xmm0
packsswb xmm0,xmm0
movd eax,xmm0
add eax,'0000'
lol - that is fast, Drizz
i got 18 clocks on a prescott
that's the same as a single AAM, i think
what i really need is code to...
divide a value from 0 to 2,629,999 by 10,000 - i need the quotient (0 to 262) and the remainder (0 to 9999)
i don't know SSE, yet
EDIT - it looks like i could adapt that code to do it - let me try
Quote from: dedndave on September 05, 2009, 03:16:28 PM
EDIT - it looks like i could adapt that code to do it - let me try
ok, then this is a spoiler :lol
drag cursor over to reveal :bdg
int 3
pushad
lea edi,buffy
mov eax,2629999
;------------------
mov ebx,eax
mov edx,0D1B71759h
mul edx
shr edx,0Dh
imul eax,edx,10000
sub ebx,eax
;------------------
;edx = 262
;ebx = 9999
;------------------
mov eax,4294968; Round ((2^32) / 1000)
mov esi,10
mul edx
mov ecx,edx
mul esi
mov ch,dl
mul esi
mov ebp,edx
mul esi
mov eax,ebp
mov ah,dl
shl eax,16
lea eax,[eax+ecx+'0000']
;------------------
mov [edi],eax
;------------------
mov eax,4294968; Round ((2^32) / 1000)
mov esi,10
mul ebx
mov ecx,edx
mul esi
mov ch,dl
mul esi
mov ebp,edx
mul esi
mov eax,ebp
mov ah,dl
shl eax,16
lea eax,[eax+ecx+'0000']
;------------------
mov [edi+4],eax
mov byte ptr [edi+8],0
;------------------
popad
lol - yah - i can figure out how to fixed-point multiply to get the quotient and remainder with x86
here is the thing, Drizz
i have a little math i am playing with for conversion algorithms
it could be used for float 2 string, string 2 float, bignums, integers, and so on - conversion in either direction
it should be faster than what we have been using, but, my optimizing skills haven't developed yet - not at all with mmx/sse - lol
i was playing with a bignum2string routine in another thread, so i thought i'd try it there, first
so - i can write the code - put it in here - then watch you guys make it go fast :(
or - i can try to make it go fast, myself
then, put it in here and watch you guys make it go even faster - lol :(
i guess i could save us both a lot of work if i just shared the math with you to start with
then - i can watch you guys make it go fast - lol
;drizz code ~18 clock cycles
mov ebx,eax
mov edx,0D1B71759h ;3,518,437,209 = 2^45/10,000
mul edx
shr edx,0Dh
imul eax,edx,10000
sub ebx,eax
my code is about 30 clock cycles
lol - how do you do that ?
i have learned 2 things from Drizz, this week...
1) mul is faster than you think
2) i have more stuff to learn from Drizz :lol
are you running a pentium ? i think on one of those they made MUL really fast but LEA really slow
hiya Mike
yah - a p4 prescott
not the smartest cpu
i get funny clock numbers from it usually
but, it seems to perform ok in practical use
EDIT - my code has 2 mul's in it, too
it works, but it just isn't quite as fast
i can probably monkey with the numbers and get rid of the shift/adc at the end
but it would still be slower and looks like it ought to be faster - lol
mov edx,109951163 ;= 2^40/10,000
mul edx
rol eax,8
mov ah,dl
shr edx,8
mov cx,dx
mov dx,10000
mul dx
shl eax,1
adc dx,0
on my machine, a bswap is about the same as shifting a dword reg by 8
but notice - i don't have the complex imul that drizz does
he must have a cheat-sheet or something - lol
Dave,
On a PIV try and replace the partial register reads and writes with a 32 bit register where possible.
mov edx,109951163 ;= 2^40/10,000
mul edx
rol eax,8
mov ah,dl
shr edx,8
;; mov cx,dx
mov ecx, edx
;; mov dx,10000
mov edx, 10000
mul dx
shl eax,1
adc dx,0
I don't have a test piece to play with it but the idea is to stick with 32 bit operations if possible. On a PIV reading and writing to AH is slow.
i should have known better, Hutch - lol
well - at least for the mov cx,dx
but, with the mov reg32,immed, i thought reducing the size of the immediate operand would help - maybe not
my best solution is to use what Drizz wrote
nothing beats experience
(and - "if you can't beat 'em, join 'em" - lol)
i can't believe it - i managed to improve Drizz's code
mov edx,4294968 ;Round ((2^32) / 1000)
mov ebx,10
mul edx
mov ecx,edx
mul ebx
mov ch,dl
mul ebx
; push edx
; mul ebx
; pop eax
; mov ah,dl
; shl eax,16
; lea ecx,[eax+ecx+'0000']
bswap ecx
mov ch,dl
mul ebx
lea ecx,[edx+ecx+'0000']
bswap ecx
a bswap takes as many clocks as 8 shifts on a dword reg
so, by exchanging the "shl eax,16" with 2 bswap's
we eliminate the push/pop and save a whole 2 clock cycles !!! ::)
(i wanted the output in ecx in this case - it was in eax originally - no diff there)
i was playing around with the 32-bits-at-a-time thing
there seems to be little difference between "mov dx,10000" and "mov edx,10000" in this particular case
HOWEVER, i have learned something very important from that...
when accessing memory, it is best to access it in 32-bit chunks, and 32-bit aligned
that makes a big difference in everything
i suspect it may be more prominent on the p4's
but, if you need a byte at [esi], it is best to make sure esi mod 4=0, and get the whole dword
then, manipulate bytes in register to get what you want
whether i produce a good routine or not, this has been a great learning experience
thanks again Drizz and Hutch :U
Hi,
Just for fun, a four multiply version. The individual ADDS could
be made into one ADD with the buffer.
Regards,
Steve
; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
; BINary to DECimal conversion with multiplies. Assume unsigned numbers.
; The number in AX is stored to a four digit number buffer.
; September 2008, start double word version.
; September 2009, rework for four digits.
; INPUT: EAX contains the four digit number ( 0 ... 9999 ) to be
; converted to decimal ASCII.
; EDI points to buffer start.
; Uses: EAX, ECX, EDX
Bin2DecM:
MOV EDX,00418938H ; Multiplier to get leading digit of four digit
; number into low byte of EDX (DL).
MUL EDX ; Do a fixed point multiply to get result.
ADD DL,'0' ; Convert binary to ASCII.
MOV [EDI],DL
MOV ECX,10 ; Multiplier to get remaining digits.
MUL ECX ; Second digit.
ADD DL,'0'
MOV [EDI+1],DL
MUL ECX
ADD DL,'0'
MOV [EDI+2],DL
MUL ECX
ADD DL,'0'
MOV [EDI+3],DL
RET
we have one like that
this one converts all four bytes at once
;----------------------------------------------------------------
; code by Drizz
;
;this snippet converts a value in eax from base 10,000
;(0 to 9999) into 4 ASCII numeric characters
;----------------------------------------------------------------
mov edx,4294968 ;2^32/1000
mov ebx,10 ;per digit
mul edx ;extend first digit
mov ecx,edx ;digit 1 in CL
mul ebx ;second digit
mov ch,dl ;digit 2 in CH
mul ebx ;third digit
;----------------------------------------------------------------
;this section of Drizz's snippet was replaced below
;----------------------------------------------------------------
;;;; push edx ;save digit 3
;;;; mul ebx ;forth digit
;;;; pop eax ;digit 3 in AL
;;;; mov ah,dl ;digit 4 in AH
;;;; shl eax,16 ;relocate digits 3 & 4
;;;; lea ecx,[eax+ecx+'0000'] ;combine and make ASCII
;----------------------------------------------------------------
;replacement section by DednDave - saves 2 clock cycles ;)
;----------------------------------------------------------------
bswap ecx ;digits 1 & 2 up high
mov ch,dl ;digit 3 in CH
mul ebx ;digit 4 in DL
lea ecx,[edx+ecx+'0000'] ;combine and make ASCII
bswap ecx ;re-order bytes
;----------------------------------------------------------------
Drizz also made this one
i think this is now my all-time second favorite snippet of code
it will handle any value from 0 to 4294967295
;----------------------------------------------------------------
; code by Drizz
;
;this snippet divides the value in eax by 10000 with remainder
;very cool, Drizz - i really like this piece of code ;)
;----------------------------------------------------------------
;eax = dividend
mov ebx,eax ;original in ebx
mov edx,3518437209 ;2^45/10000
mul edx ;extend quotient
shr edx,13 ;quotient
imul eax,edx,10000 ;reconstruct for remainder
sub ebx,eax ;remainder=orig-10000*quot
;edx = quotient
;ebx = remainder
;----------------------------------------------------------------
by combining the two "Drizz snippets", we have one that will do 8 digits at a time
(base 100,000,000 to ASCII)
mov ebx,eax ;original in ebx
mov edx,3518437209 ;2^45/10000
mul edx ;extend quotient
shr edx,13 ;quotient
imul eax,edx,10000 ;reconstruct for remainder
neg eax
push edx
add eax,ebx ;remainder=orig-10000*quot
mov edx,4294968 ;2^32/1000
mov ebx,10 ;per digit
mul edx ;extend first digit
mov ecx,edx ;digit 1 in CL
mul ebx ;second digit
mov ch,dl ;digit 2 in CH
mul ebx ;third digit
bswap ecx ;digits 1 & 2 up high
mov ch,dl ;digit 3 in CH
mul ebx ;digit 4 in DL
lea ecx,[edx+ecx+'0000'] ;combine and make ASCII
bswap ecx ;re-order bytes
pop eax
mov [edi+4],ecx
mov edx,4294968 ;2^32/1000
mov ebx,10 ;per digit
mul edx ;extend first digit
mov ecx,edx ;digit 1 in CL
mul ebx ;second digit
mov ch,dl ;digit 2 in CH
mul ebx ;third digit
bswap ecx ;digits 1 & 2 up high
mov ch,dl ;digit 3 in CH
mul ebx ;digit 4 in DL
lea ecx,[edx+ecx+'0000'] ;combine and make ASCII
bswap ecx ;re-order bytes
mov [edi],ecx
all we need now is a Drizz snippet to divide 64-bits by 100,000,000 :bg
let me see if i can "do what Drizz would do" - lol
i guess the answer is to use the first snippet 4 times to divide 64 bits
I'm sure i mentioned this tool (http://www.masm32.com/board/index.php?topic=9937.msg72815#msg72815) by qWord already.
This thread started with 9999 to ascii, now you are talking 64bits; there are plenty of threads discussing dword2ascii/qword2ascii already ... use the search button :P
here it is anyway ... :8)
;; edi::esi == 18446744073709551615
or edi,-1
or esi,-1
mov ebx,esi
_mul_64x64_top64_2 esi, edi, 8461CEFDh, 0ABCC7711h; /100000000
shrd esi,edi,26
shr edi,26
mov eax,100000000
mul esi
sub ebx,eax
; ebx == first 8 digits ("09551615")
; edi::esi == top 12 digits ("184467440737")
mov ebx,esi
_mul_64x64_top64_2 esi, edi, 8461CEFDh, 0ABCC7711h; /100000000
shrd esi,edi,26
imul eax,esi,100000000
sub ebx,eax
; ebx == second 8 digits ("67440737")
; esi == top 4 digits ("1844")
; A1::A0 = (A1::A0 * B1::B0) >> 64
_mul_64x64_top64_2 macro A0:req,A1:req, B0:req,B1:req
mov eax,dword ptr B0
mul A0
mov ecx,edx; d1
mov eax,dword ptr B1
mul A0
add ecx,eax;e0
mov A0,0
adc A0,edx;e1
mov eax,dword ptr B0
mul A1
add ecx,eax;f0
mov eax,dword ptr B1
adc A0,edx;f1
mov ecx,0
mov edx,A1
adc ecx,ecx
mul edx
mov A1,ecx
add A0,eax
adc A1,edx
endm
many thanks, Drizz
none of the previous threads discuss using ling long kai fang - lol - or horners rule, either
at least not for this (it is used all the time for decimal to binary, of course)
what i want to do is compare divide and conquer (which, i am sure has a better name, someplace) with ling long kai fang
i am working on bignum to ascii routines, not just 64-bit
recently, you (Drizz), Jochen, and I all wrote a few 64-bit to ascii routines
i have a few versions of yours around here someplace
i think yours were fastest if i remember
at any rate, i am trying to apply things i have learned along the way
the mul-to-div stuff is great help, of course
my original ling long kai fang code loaded bytes from the input value and output words (base 256 to base 10,000)
the routine i have been working on (when i have time) is an attempt to reading and writing dwords only (a tip from Hutch)
in essence, i am converting base 4,294,967,296 to base 100,000,000
this generates some pretty big "carry" values - lol - but i am working through it
i wish i knew how to write code for sse/sse2 - but that can come later
Use magic divider method.
Magic = CCCCCCCD for 32bit
Magic = CCCCCCCC CCCCCCCD for 64bit
Magic = CCCCCCCC CCCCCCCC CCCCCCCD for 96bit
Magic = (n-1) dup(CCCCCCCC) CCCCCCCD for 32*n-bit
same method as for 32bit unsigned integers.
But the question is: Is using div instruction faster than the overhead of using magic divider?
i can use mul by constants
in one place, i need to divide a 57 bit value (105_F5DF_FF00_0000h max) by 390,625 with remainder
the resulting quotient can be 38 bits (2B_F31D_9943h max)
i am not sure which one is worse - lol
another one - i need to divide a 59 bit value (5F5_E12A_F31D_9943h max) by 100,000,000 with remainder
the resulting quotient can be 33 bits (1_0000_0734h max)
and the easy one.....
divide a 33 bit value (1_05F5_E0FFh max) by 10,000 with remainder
the resulting quotient can be 19 bits (6_B4C8h max)
i can probably use your previous examples to help
i know how to multiple-precision divide and multiply
at this point, i am writing the routine with DIV's
i have even outlined a fast way to handle signed values
once i get it up and running, i will go back and replace these three critical operations
using the mul-to-div 2 or even 4 times is ok
i like that, because i can split the code off and test all the values
there is no way to test a piece of code that does such large divisions in one step
there are just to many input values to test for
it looks like i might get to spend some time on it, today :U
i will post the code (DIV version) when i get it tested
i have started a different thread and posted LLKF version 1...
http://www.masm32.com/board/index.php?topic=12363.new#new