News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Multiprecision square root

Started by bruce1948, August 17, 2009, 12:21:03 AM

Previous topic - Next topic

bruce1948

I'm currently working on a multiprecision unsigned integer package and find that the algorithm I'm using does not always give accurate results. 75% seem to be OK but about 25% are off base. The small ones (<= 96 bits are spot on) but as i go towards 512 bits some are ok and and some are not. Any one  know of a better way of calculating large unsigned square roots. Code follows:

; *********************************************************************
;
; Procedure: mpSqrt
;
; Function: Computes integer square root result = sqrt(x)
;           where result and x are multiprecision integers of ndigits
;           Uses xk1 = 1/2(xk + x / xk) and iterates until x/xk >= xk
;           leaving xk1 as the square root.
;
; Entry:    x   =   a pointer to the multiprecision number whose
;                   square root is to be taken
;
;           result  =   a pointer to the multiprecision number to
;                       hold the square root of x.
;
; Exit:     result  =   square root of x
;
; *********************************************************************
mpSqrt proc USES esi ebx ecx edx edi, result:unsigned_ptr, x:unsigned_ptr, ndigits:size_t       
        local xk:unsigned_ptr
        local xk1:unsigned_ptr
        local t:unsigned_ptr
        local tmp:unsigned_ptr
        ; *************************************************************
        ; Allocate memory for variables.
        ; Do not need to zero allocated memory as mpAlloc uses LMEM_ZEROINIT
        ; *************************************************************
        mov ebx, ndigits
        shl ebx, 2                                  ; need 4 * ndigits bytes for each local
        push ebx                                    ; save ebx
        shl ebx, 2                                  ; need 4 locals
        invoke mpAlloc, ebx
        pop ebx
        mov xk, eax                                 ; store the returned pointer
        add eax, ebx                                ; point at next local
        mov xk1, eax                                ; store the pointer
        add eax, ebx                                ; point at next local
        mov t, eax                                  ; store the pointer
        add eax, ebx                                ; point at next local
        mov tmp, eax                                ; store the pointer
        ; *************************************************************
        ; if x <= 1 copy x to result
        ; *************************************************************
        cld
        invoke mpShortCmp, x, 1, ndigits
        .if sdword ptr eax == -1 || eax == 0
            mov esi, x
            mov edi, result
            xor ecx, ecx                            ; loop counter
            .while ecx < ndigits
                movsd                               ; copy x to result
                inc ecx
            .endw
            jmp mpSqrtEnd
        .endif
        ; xk = x / 2
        invoke mpShiftRight, xk, x, 1, ndigits
        ; *************************************************************
        ; Set up complete - find the square root
        ; *************************************************************
        mov ebx, 1
        .while ebx == 1
            ; t = x / xk (tmp is used for the unused remainder)
            invoke mpDivide, t, tmp, x, ndigits, xk, ndigits
            ; xk1 = (xk + t)/2 (tmp holds the reult of the addition)
            invoke mpAdd, tmp, xk, t, ndigits
            invoke mpShiftRight, xk1, tmp, 1, ndigits
            ; if (t >= xk)  break
            invoke mpCompare, t, xk, ndigits
            .break .if sdword ptr eax >= 0
            ; Set xk = xk1
            mov esi, xk1
            mov edi, xk
            xor ecx, ecx                            ; loop counter
            .while ecx < ndigits
                movsd                               ; copy xk1 to xk
                inc ecx
            .endw
        .endw
        ; *************************************************************
        ; copy square root of x (xk1) to result
        ; *************************************************************
        mov esi, xk1
        mov edi, result
        xor ecx, ecx                            ; loop counter
        .while ecx < ndigits
            movsd                               ; copy xk1 to result
            inc ecx
        .endw
mpSqrtEnd:
        invoke mpFree, xk       
        ret
mpSqrt endp


dedndave

long ago i wrote a simple one for 16-bit integers (i think it was - lol)
let me see if i can find it - it is not what you want, but you can get the concept
it is a loop - i more or less did what they taught us in grade-school - lol

dedndave

ok - so it's a 24-bit routine - lol
as i said, it's a loop, so it's not very fast
but, the loop is all shifts and adds - it could be re-written to be much faster on a pentium

;-----------------------------------------------
;
SQINT   PROC    NEAR
;
;  This is a 24-bit integer squareroot routine.
; It calculates the squareoroot, then rounds
; it to the nearest integer.
;
; Call With: DL:AX = 24 bit integer (0-FF:FFFFh)
;
;   Returns: AX = rounded squareroot (0-1000h)
;            BX = DX = CH = 0
;            CF = clear
;            CL, BP, SI, DI invalid
;
        MOV     DH,DL
        MOV     DL,AH
        MOV     BH,AL
        MOV     AX,0C000h
        MOV     BL,AL
        MOV     CX,8
;
SQINT0: TEST    AX,DX
        JNZ     SQINT3
;
        ROR     AX,1
        ROR     AX,1
        LOOP    SQINT0
;
        MOV     CL,4
;
SQINT1: TEST    AH,BH
        JNZ     SQINT2
;
        SHR     AH,1
        SHR     AH,1
        LOOP    SQINT1
;
        RET
;
SQINT2: SUB     CL,4
;
SQINT3: SHL     CL,1
        ADD     CL,9
        XOR     AX,AX
        MOV     SI,AX
        MOV     DI,AX
        MOV     BP,AX
;
SQINT4: SHL     BX,1
        RCL     DX,1
        RCL     DI,1
        RCL     BP,1
        SHL     BX,1
        RCL     DX,1
        RCL     DI,1
        RCL     BP,1
        STC
        RCL     SI,1
        RCL     AX,1
        SUB     DI,SI
        SBB     BP,AX
        JNC     SQINT6
;
        ADD     DI,SI
        ADC     BP,AX
        DEC     SI
        LOOP    SQINT4
;
SQINT5: SHL     SI,1
        RCL     AX,1
        SHL     SI,1
        RCL     AX,1
        SHL     SI,1
        ADC     AX,CX
        RET
;
SQINT6: INC     SI
        LOOP    SQINT4
;
        JMP     SQINT5
;
SQINT   ENDP
;
;-----------------------------------------------

bruce1948

I shall study it and see if I can work anything out.

thanks

Bruce

Rockoon

If you cook up a multiprecision fixed/float log() and exp() functions, the accurate calculation of sqrt's becomes trivial

sqrt(x) = exp(0.5 * log(x))

I dont know how performance will compare with newtons progressive approximation, but just the same having log() and exp() are quite usefull for bignum stuff, because bignum multiplication and division are expensive where log() and exp() can turn those into additions and subtractions
When C++ compilers can be coerced to emit rcl and rcr, I *might* consider using one.

raymond

If it's for inclusion in a bignum library, I have to assume that you wish to keep everything in binary.

In the MixLib package available at

http://www.ray.masmcode.com/fixmath.html

you would find the source code of two procedures for extracting the square root, one for 32-bit numbers and the other for 64-bit numbers. Based on the latter (sqrt64.asm), you should be able to expand the process to any size number, using memory instead of registers.

If you wanted to extract the square root of a number in decimal string format and return the result in decimal string format with up to 9999 decimal digits, you would find code in the BCDtut package available at

http://www.ray.masmcode.com/BCDtut.html
When you assume something, you risk being wrong half the time
http://www.ray.masmcode.com

dedndave

i was just looking at Ray's 64-bit squareroot routine
i like his much better than my old 24-bit (16-bit code) one - lol
it's much cleaner and easier to expand to 512 bits, i think
log and exp functions is a good idea, too
if i remember correctly, those routines could be used for sine and cosine functions, as well

bruce1948

Plenty to think about here.


Thanks all

Bruce

FORTRANS

Thanks Raymond,

   I am putting together a set of fixed point routines, and
looking at your code had me review some of what I am doing.
Which now seem to have some design flaws.  You are using
a different algorithm for converting a fraction to ASCII than I
am using.  Do you have a quick pointer to a description?  I
like that you truncate properly.  I am printing some bogus
characters on the end.  I am varying precision and haven't
worked out the numbers yet.

   One of my test numbers is Pi with 100 fraction digits.


Debug large fixed point routines.
16:240
3.1415926535897932384626433832795028841971693993751058209749445923078164060462507223188
3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679

Debug large fixed point routines.
16:368
3.14159265358979323846264338327950288419716939937510582097494459230781640628620899862803482534211706789999999999838420518372309708
3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679

Debug large fixed point routines.

FixDiv algorithm.
Divisor   2.0
Dividend  3.14159265358979323846264338327950288419716939937510582097494459230781640628620899862803482534211706789999999999838420518372309708
Result    1.57079632679489661923132169163975144209858469968755291048747229615390820314310449931401741267105853394999999999836046981060995660


Regards,

Steve N.

dedndave

i don't think that's got enough precision, Steve ! - lol
those digits may be the true evaluation of the binary representation
if you want to know how many decimal digits a binary format can support, change the lsb by one and observe the difference

raymond

QuoteYou are using a different algorithm for converting a fraction to ASCII than I am using.

The algo I'm using is quite simple. In my Mixlib, I'm only carrying 16 bits for the fraction (which is equivalent to 5 decimal places). Those 16 bits xxxxh are thus a fraction of 10000h. The decimal fraction digits are thus
xxxxh * 100000 / 10000h
I then simply convert the result to ascii as for any other dword. A similar approach could be used for up to 31 fractional bits.

However, you are combining fixed point with BIG numbers which cannot use this simple algo. The only way I could see at present would be to use BCDs.

The number of significant decimal digits should be: number of fraction bits * 0.3

Start with a string of 0s
The most significant bit = 50000000........ with as many 0s as needed; add it if set
Divide by 2
The 2nd significant bit = 250000000.......; add it if set
Divide by 2
The 3rd significant bit = 125000000.......; add it if set
etc.
Convert all the BCD digits to ascii by adding 30h.

It's obviously slow but you only need it to display the occasional result.
When you assume something, you risk being wrong half the time
http://www.ray.masmcode.com

dedndave

QuoteThe number of significant decimal digits should be: number of fraction bits * 0.3
what is the math behind that formula Ray ?
is it an estimate ?

raymond

When you assume something, you risk being wrong half the time
http://www.ray.masmcode.com

dedndave

lol
i knew that - didn't know it could be used to calc digits - very cool
you might consider adding that little tid-bit to your tute - pardon me if it is already in there - i missed it
we use that ratio often - if you double power, it is ~3 Db gain: 10 log(P2/P1)
it is ~6 Db gain for doubling voltage (or current): 20 log(E2/E1)
a Db is a decibel (or 10 bel's), named after Alexander Graham Bell, of course

Eddy

Bruce,

(Referring to your first post)
Newtons method is an iterative method that always converges for floating point numbers, but for integer numbers, caution is necessary.
You must start with an 'initial value' that is guaranteed smaller than the root.
The following new calculated value will be LARGER than the root and will decrease with each iteration.
Iteration must stop when the new value starts to INCREASE (becomes larger than the previous value).
For maximum speed, the initial value is very important. The closer to the root, the faster.
Worst case, a non-optimal intial value could cause the algorithm to 'oscillate' and never converge to a solution, atleast for integers.

Kind regards
Eddy
www.devotechs.com -- HIME : Huge Integer Math and Encryption library--