News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Float to string without fpu

Started by drizz, September 15, 2008, 02:32:32 AM

Previous topic - Next topic

drizz

I've always wanted to write this kind of float conversion function, but figuring out how to deal with floating-point binary was a 'pita'. Finally, here is code that works (and actually the float string formatting was just as painful).

This is a preliminary version open for suggestions and optimizations :) and bug reports :)


; ---------------------------------------------------------------------------
; R8ToStr - convert an IEEE-754 double (REAL8) to a decimal string using
;           integer instructions only (no FPU).
;
; REAL8 bit layout (high dword : low dword):
; SEEEEEEEEEEEFFFFFFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
; 10000000000000000000000000000000 00000000000000000000000000000000 ; sign
; 01111111111100000000000000000000 00000000000000000000000000000000 ; exponent
; 00000000000011111111111111111111 11111111111111111111111111111111 ; fraction
; 00000000000100000000000000000000 00000000000000000000000000000000 ; implied bit
;
; Syntax : MASM (32-bit, uses the .if/.while/.repeat high-level directives)
; Call   : stdcall, arguments on the stack, callee cleans up (ret 12)
;            [ret+4]  r8       low dword of the REAL8
;            [ret+8]  r8       high dword of the REAL8
;            [ret+12] lpBuffer destination buffer
; Out    : eax = number of characters written (terminating 0 not counted)
;          lpBuffer holds a zero-terminated string
; Saves  : ebp, esi, edi, ebx (pushed/popped here); eax, ecx, edx are scratch
; Buffer : the longest output is of the form "-d.dddddddddddddde-ddd"
;          (22 characters + terminator), so provide at least 24 bytes.
;
; Output forms:
;   zero (either sign) -> "0"
;   infinity           -> "1.#INF"          (prefixed by '-' if negative)
;   NaN                -> "1.#QNAN", "1.#SNAN", or "-1.#IND" for the
;                         real indefinite (sign set, only the quiet bit set)
;   otherwise          -> at most 15 significant digits, plain or with e+N/e-N
;
; The prologue/epilogue are written by hand, so all arguments and locals are
; addressed relative to esp; the names r8/lpBuffer are not used in the body.
; ---------------------------------------------------------------------------

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

R8_BIAS equ 1023
R8_MANT equ 52

R8ToStr proc r8:REAL8, lpBuffer:PTR

    push ebp
    push esi
    push edi
    push ebx
    locals = 32+3*4
    tmpbuff equ <[esp]>                         ; 32 bytes: digits, least significant first
    iTrail equ <dword ptr [esp+32]>             ; number of stripped trailing zero digits
    iExp equ <dword ptr [esp+32+4]>             ; binary exponent, later the decimal exponent
    nSign equ <dword ptr [esp+32+4+4]>          ; 80000000h if negative, else 0
    add esp,-locals

    ; [esp+n*4][4*4][locals] skips the locals and the four saved registers;
    ; n=1/2 are the low/high dwords of r8, n=3 is lpBuffer.
    mov ecx,10000000000000000000000000000000b
    mov ebx,[esp+2*4][4*4][locals]
    mov esi,[esp+1*4][4*4][locals]
    and ecx,ebx; sign bit
    and ebx,not 10000000000000000000000000000000b
    mov nSign,ecx
    mov edi,ebx
    shr ebx,20; 01111111111100000000000000000000
    and edi,00000000000011111111111111111111b
    cmp ebx,11111111111b
    je @@_NaN_Infinity
    mov eax,esi
    or eax,edi
    or eax,ebx
    jz @@Zero
    .if ebx
        ; normal number: remove the bias and add the implied leading 1
        sub ebx,R8_BIAS; exponent
        or  edi,00000000000100000000000000000000b; high 20 bits + 1 implied
    .else
        ; subnormal: exponent field is 0 but the fraction is not.
        ; The value is 0.fraction * 2^(1-bias): no implied bit.
        mov ebx,1-R8_BIAS
    .endif
    xor ebp,ebp                                 ; ebp = decimal exponent
    mov iExp,ebx
    ;; 52bits in edi::esi
    ;; Goal of both branches: value = edi::esi * 10^ebp
    .if sdword ptr ebx > 63
        ; Large exponent: left-justify the mantissa, then repeatedly divide
        ; by 10 and renormalise, moving binary exponent into decimal exponent.
        shld edi,esi,63-R8_MANT
        shl esi,63-R8_MANT
        .repeat
            call __div10
            ; mod 10
            lea ecx,[eax*4+eax]
            add ecx,ecx
            sub esi,ecx

            ; ecx = leading zero count of the quotient's high dword
            bsr ecx,edx
            neg ecx
            add ecx,31
            shld edx,eax,cl
            shl eax,cl

            mov ebx,iExp                        ; __div10 clobbers ebx
            sub ebx,ecx

            or esi,eax
            mov edi,edx

            mov iExp,ebx
            add ebp,1
        .until sdword ptr ebx <= 63
        mov ecx,63
        sub ecx,ebx
        shrd esi,edi,cl
        shr edi,cl
    .else
        ; Exponent below 52: multiply by 5 and bump the binary exponent
        ; (x * 2^e = 5x * 2^(e+1) * 10^-1) until the exponent reaches 52.
        .while sdword ptr ebx < R8_MANT
            ; Stop multiplying as soon as the exponent reaches R8_MANT.
            ; Without this bound the loop could overshoot by up to 6, and
            ; the left shift after the loop would push bits out of the
            ; 64-bit value (e.g. for inputs in the range 2^50..2^52).
            .while (sdword ptr ebx < R8_MANT) && ! (edi & 0F0000000h)
                mov eax,esi
                mov edx,edi
                shld edi,esi,2
                shl esi,2
                add esi,eax
                adc edi,edx                     ; edi::esi *= 5
                add ebx,1
                dec ebp
            .endw
            ; Shift right so the top 4 bits of edi are clear again;
            ; ecx = max(0, bsr(edi) - 27).
            bsr ecx,edi
            sub ecx,31-4
            sbb edx,edx
            not edx
            and ecx,edx
            shrd esi,edi,cl
            shr edi,cl
            add ebx,ecx
        .endw
        ; ebx >= R8_MANT here: fold the remaining binary exponent in.
        lea ecx,[ebx-R8_MANT]
        shld edi,esi,cl
        shl esi,cl
    .endif
    ; mov edx,nSign
    ; pushad
    ; shr edx,1
    ; sbb edx,edx
    ; and edx,'-'-'+'
    ; add edx,'+'
    ; invoke printf,T("%c%I64u.0e%i",13,10),edx,edi::esi,ebp
    ; popad
    ; job done, now just the hard part - formating

    ;; adjust number to 16 digits 2386F26FC0FFFFh
    .while edi >= 2386F2h; LOW = 6FC0FFFF
        .break .if edi == 2386F2h && esi < 6FC0FFFFh
        call __div10
        mov esi,eax
        mov edi,edx
        add ebp,1;; increase exponent
    .endw

    ;; round it if needed (if 16 digit) 38D7EA4C67FFFh
    .if edi>=38D7Eh;A4C67FFF
        .if ! (edi == 38D7Eh && esi < 0A4C67FFFh)
            add esi,5
            adc edi,0
            call __div10
            add ebp,1; increase exponent
            mov esi,eax
            mov edi,edx
        .endif
    .endif

    mov iExp,ebp
    ;; trailing zero count
    ;; (divide by 10 while the remainder is 0; the loop is entered at @@)
    xor ebp,ebp
    jmp @F
    .repeat
        mov esi,eax
        mov edi,edx
        add ebp,1
@@:     call __div10
        lea ecx,[eax*4+eax]
        neg ecx
        add ecx,ecx
        add ecx,esi                             ; ecx = low(n) - 10*low(q); zero when n mod 10 = 0
    .until !zero?
    mov iTrail,ebp
    ;; extract the digits into tmpbuff, least significant first.
    ;; edx::eax still holds the quotient from the last __div10 above,
    ;; so the first pass enters at @@ and skips the call.
    xor ebp,ebp
    jmp @F
    .repeat
        call __div10
@@:     lea ecx,[eax*4+eax]
        neg ecx
        lea ecx,[ecx*2+esi+'0']
        mov tmpbuff[ebp],cl
        add ebp,1
        mov esi,eax
        mov edi,edx
        or eax,edx
    .until zero?

    ;; write the sign: '-' or 0 is stored, the pointer advances only for '-'
    mov ecx,nSign
    mov esi,[esp+3*4][4*4][locals]
    add ecx,ecx                                 ; CF = sign bit
    mov edx,'-'
    mov edi,iExp; exp
    sbb ecx,ecx                                 ; ecx = -1 if negative, else 0
    and edx,ecx
    mov [esi],dl
    sub esi,ecx

    add edi,iTrail                              ; sets ZF for the .if below
    xchg esi,ebp                                ; esi = digit count, ebp = output pointer (flags kept)
    .if zero?;; exponent is 0
        .repeat
            mov al,tmpbuff[esi-1]
            mov [ebp],al
            add ebp,1
            sub esi,1
        .until zero?

    .elseif (sdword ptr edi >=-15 && sdword ptr edi < 0)
        ;; check for format without exp
        add edi,esi                             ; edi = number of digits before the point
        .if sdword ptr edi <= 0
            mov [ebp],word ptr '.0'             ; stored in memory as "0."
            add ebp,2
            .while sdword ptr edi < 0
                mov [ebp],byte ptr '0'
                add ebp,1
                add edi,1
            .endw
            .repeat
                mov al,tmpbuff[esi-1]
                mov [ebp],al
                add ebp,1
                sub esi,1
            .until zero?
        .else
            .repeat
                mov al,tmpbuff[esi-1]
                mov [ebp],al
                add ebp,1
                sub edi,1
                .if zero?
                    mov [ebp],byte ptr '.'
                    add ebp,1
                .endif
                sub esi,1
            .until zero?
        .endif
    .else
        ; exponent form: d[.ddd]e(+|-)N
        mov al,tmpbuff[esi-1]
        mov [ebp],al
        add ebp,1
        sub esi,1
        jz @F

        mov [ebp],byte ptr '.'
        add ebp,1
        .repeat
            mov al,tmpbuff[esi-1]
            mov [ebp],al
            add ebp,1
            add edi,1                           ; each fractional digit raises the exponent by 1
            sub esi,1
        .until zero?
@@:
        mov [ebp],byte ptr 'e'
        add ebp,1

        mov eax,edi

        cdq
        and edx,'-'-'+'
        add edx,'+'
        mov [ebp],dl
        add ebp,1

        ; abs
        cdq
        xor eax,edx
        sub eax,edx
        ; up to three exponent digits; x/10 = (x * 0CCCCCCCDh) >> 35
        mov edi,0CCCCCCCDh; magic
        mov ecx,eax
        mul edi
        shr edx,3
        lea ebx,[edx*4+edx]
        neg ebx
        lea ebx,[ebx*2+ecx+'0']                 ; bl = units digit
        mov eax,edx
        .if edx
            mov ecx,eax
            mul edi
            shr edx,3
            lea esi,[edx*4+edx]
            neg esi
            lea esi,[esi*2+ecx+'0']             ; tens digit
            mov eax,edx
            .if edx
                mov ecx,eax
                mul edi
                shr edx,3
                lea eax,[edx*4+edx]
                neg eax
                lea eax,[eax*2+ecx+'0']         ; hundreds digit
                mov [ebp],al
                add ebp,1
            .endif
            mov eax,esi
            mov [ebp],al
            add ebp,1
        .endif
        mov [ebp],bl
        add ebp,1

    .endif

@@Done:
    ; ebp = one past the last character written
    mov byte ptr [ebp],0
    mov eax,ebp
    sub eax,[esp+3*4][4*4][locals]              ; eax = string length

    add esp,locals
    pop ebx
    pop edi
    pop esi
    pop ebp
    ret 3*4

@@_NaN_Infinity:
    ; exponent field is all ones; edi::esi = fraction bits.
    ; Multi-character constants are stored last character first,
    ; so '#.1' lands in memory as "1.#", 'FNI' as "INF", and so on.
    mov ecx,nSign
    mov ebp,[esp+3*4][4*4][locals]
    add ecx,ecx
    mov edx,'-'
    sbb ecx,ecx                                 ; ecx = -1 if negative, else 0
    and edx,ecx
    mov [ebp],dl
    sub ebp,ecx
    mov dword ptr [ebp],'#.1'
    mov eax,edi
    or eax,esi
    .if !eax
        ; fraction == 0 -> infinity
        mov eax,'FNI'
        mov [ebp+3],eax
        add ebp,6
    .elseif ! (edi & 10000000000000000000b)
        ; quiet bit clear -> signalling NaN
        mov eax,'NANS'
        mov [ebp+3],eax
        add ebp,7
    .elseif ecx != 0 && edi == 10000000000000000000b && esi == 0
        ; negative, quiet bit only -> real indefinite
        mov eax,'DNI'
        mov [ebp+3],eax
        add ebp,6
    .else
        ; any other quiet NaN
        mov eax,'NANQ'
        mov [ebp+3],eax
        add ebp,7
    .endif
    jmp @@Done

@@Zero:
    ; +0.0 and -0.0 both print as "0"
    mov ebp,[esp+3*4][4*4][locals]
    mov byte ptr [ebp],'0'
    add ebp,1
    jmp @@Done

;; div <edi::esi> by 10
;; ret <edx::eax>
;; Takes the high 64 bits of (edi::esi * 0CCCCCCCCCCCCCCCDh), then shifts
;; right by 3. Clobbers ebx and ecx; esi and edi are preserved.
align 8
__div10:
    ; div 10
    mov eax,0CCCCCCCDh; = b0
    mul esi; get a0*b0 = d1:d0
    mov ecx,edx;d1
    mov eax,0CCCCCCCDh; = b0
    xor ebx,ebx
    mul edi; get a1*b0 = e1:e0
    add ecx,eax;e0
    adc ebx,edx;e1
    mov eax,0CCCCCCCCh; =b1
    mul esi; get a0*b1 = f1:f0
    add ecx,eax;f0
    adc ebx,edx;f1
    mov ecx,0                                   ; mov, not xor: the carry from adc must survive
    mov eax,0CCCCCCCCh; =b1
    adc ecx,ecx                                 ; ecx = carry out of the middle column
    mul edi; get a1*b1 = g1:g0
    add eax,ebx;g0
    adc edx,ecx;g1
    shrd eax,edx,3
    shr edx,3;;------ quotient in edx::eax
    retn

R8ToStr endp

ps. I'll probably extend this to TByte...



Enjoy
The truth cannot be learned ... it can only be recognized.

jj2007

Quote from: drizz on September 15, 2008, 02:32:32 AM
I've always wanted to write this kind of float conversion function, but figuring how to deal with floating point binary was 'pita'. Finally code that works ( and actually float string formating was just as painful ).

Great code, my talented friend!  :cheekygreen:
I have added it to my testbed, see attachment (search for R8ToStr in FloatStr.asm, console assemble & link) and timings below.

************* Timings on a Core Duo Celeron M: **********************

457 cycles for 4*float$         1.23456789012346e-07
1087 cycles for 4*R8ToStr       1.23456789012346e-7
604 cycles for 4*FloatToStr     1234.568

Code sizes and FPU register preservation:
float$          size=823, all ST regs preserved
FloatToStr      size=895, ST 6-8 trashed
R8ToStr         size=919, no ST regs used
Ray's lib       size=700, all ST regs preserved
crt sprintf     size=???, all ST regs preserved

Credits to drizz for the qwtoa algo

423 cycles for FloatToStr       1.234568e-007
448 cycles for float$ REAL4     1.23456793517107e-05
437 cycles for float$ REAL8     1.23456789012346e-07
436 cycles for float$ REAL10    0.00123456789012346
1078 cycles for R8ToStr         1.23456789012346e-7
1090 cycles for Ray's lib       0.001235
4262 cycles for sprintf         1.234568e-007

---------
602 cycles for FloatToStr       1.234568
435 cycles for float$ REAL4     1.23456788063049
420 cycles for float$ REAL8     1.23456789012346
434 cycles for float$ REAL10    1.23456789012346
966 cycles for R8ToStr          1.23456789012346
1108 cycles for Ray's lib       1.234568
4442 cycles for sprintf         1.234568

---------
603 cycles for FloatToStr       1234.568
437 cycles for float$ REAL4     1234.56787109375
419 cycles for float$ REAL8     1234.56789012346
434 cycles for float$ REAL10    1234.56789012346
908 cycles for R8ToStr          1234.56789012346
1107 cycles for Ray's lib       1234.567890
4452 cycles for sprintf         1234.568

[attachment deleted by admin]

drizz

Yes, the timings are not that great, but it's a start :)
635 cycles for FloatToStr       1.234568e-007
600 cycles for float$ REAL4     1.23456793517107e-05
572 cycles for float$ REAL8     1.23456789012346e-07
545 cycles for float$ REAL10    0.00123456789012346
1177 cycles for R8ToStr         1.23456789012346e-7
940 cycles for Ray's lib        0.001235
5404 cycles for sprintf         1.234568e-007

---------
662 cycles for FloatToStr       1.234568
550 cycles for float$ REAL4     1.23456788063049
524 cycles for float$ REAL8     1.23456789012346
546 cycles for float$ REAL10    1.23456789012346
1095 cycles for R8ToStr         1.23456789012346
942 cycles for Ray's lib        1.234568
5642 cycles for sprintf         1.234568

---------
647 cycles for FloatToStr       1234.568
546 cycles for float$ REAL4     1234.56787109375
521 cycles for float$ REAL8     1234.56789012346
544 cycles for float$ REAL10    1234.56789012346
1036 cycles for R8ToStr         1234.56789012346
942 cycles for Ray's lib        1234.567890
5600 cycles for sprintf         1234.568
The truth cannot be learned ... it can only be recognized.

jj2007

Quote from: drizz on September 15, 2008, 12:28:20 PM
yes timings are not that great but it's a start :)

A great start ;-)

There is one oddity in the third-last block:
---------
616 cycles for FloatToStr       1234.568
448 cycles for float$ REAL4     1234.56787109375
429 cycles for float$ REAL8     1234.56789012346
446 cycles for float$ REAL10    1234.56789012346
929 cycles for R8ToStr          1234.56789012346
1110 cycles for Ray's lib       1234.567890
4556 cycles for sprintf         1234.568

---------
482 cycles for FloatToStr       1.234568e+123
465 cycles for float$ REAL4     1.23456789275539e+23
458 cycles for float$ REAL8     1.23456789012346e+123
478 cycles for float$ REAL10    1.23456789012346e+123
4315 cycles for R8ToStr <---------------------------------------------        1.23456789012346e+123
1182 cycles for Ray's lib       1.234567890123457E+0123
5928 cycles for sprintf         1.234568e+123

---------
464 cycles for FloatToStr       -1.234568e-123
477 cycles for float$ REAL4     -1.23456786887352e-23
465 cycles for float$ REAL8     -1.23456789012346e-123
473 cycles for float$ REAL10    -1.23456789012346e-123
3530 cycles for R8ToStr         -1.23456789012346e-123
1109 cycles for Ray's lib       -0.000000
6085 cycles for sprintf         -1.234568e-123

---------
11 cycles for FloatToStr        0
69 cycles for float$ REAL4      0
64 cycles for float$ REAL8      0
68 cycles for float$ REAL10     0
16 cycles for R8ToStr           0
345 cycles for Ray's lib        ERROR
694 cycles for sprintf          0

Also, the FPU lib version throws an error for the 0.0, which did not happen before... :dazzled:

jj2007

Quote from: jj2007 on September 15, 2008, 03:05:36 PM

11 cycles for FloatToStr        0
69 cycles for float$ REAL4      0
64 cycles for float$ REAL8      0
68 cycles for float$ REAL10     0
16 cycles for R8ToStr           0
345 cycles for Ray's lib        ERROR
694 cycles for sprintf          0

Also, the FPU lib version throws an error for the 0.0, which did not happen before... :dazzled:

Solved. I had fed a Real8 to Ray's lib :red

herge


Hi jj2007:



369 cycles for 4*float$    1.23456789012346e-07
866 cycles for 4*R8ToStr    1.23456789012346e-7
642 cycles for 4*FloatToStr 1234.568



Code sizes and FPU register preservation:
float$    size=823, all ST regs preserved
FloatToStr size=895, ST 6-8 trashed
R8ToStr  size=919, no ST regs used
Ray's lib size=700, all ST regs preserved
crt sprintf size=???, all ST regs preserved

------- New float$ Macro: -------------------
Divide MyReal10 (=1.2345678e9)
by 12345678 (=1.2e7, in eax)
add 11.1111    (an immediate real)
Result= 111.111100000000 ok?
-- This para printed by one line of code! ---


Marketing report:
Sales were up 3.2% in 2007

Code:
print float$("\nMarketing report:\nSales were up %2f% in 2007\n",
Sales2007/Sales2006-1*100)

finit is ON Version 1.3, 14 September 2008
Credits to drizz for the qwtoa algo

423 cycles for FloatToStr 1.234568e-007
379 cycles for float$ REAL4 1.23456793517107e-05
384 cycles for float$ REAL8 1.23456789012346e-07
378 cycles for float$ REAL10 0.00123456789012346
854 cycles for R8ToStr    1.23456789012346e-7
1177 cycles for Ray's lib 0.001235
4050 cycles for sprintf  1.234568e-007

---------
636 cycles for FloatToStr 1.234568
364 cycles for float$ REAL4 1.23456788063049
366 cycles for float$ REAL8 1.23456789012346
368 cycles for float$ REAL10 1.23456789012346
764 cycles for R8ToStr    1.23456789012346
1185 cycles for Ray's lib 1.234568
4486 cycles for sprintf  1.234568

---------
643 cycles for FloatToStr 1234.568
363 cycles for float$ REAL4 1234.56787109375
366 cycles for float$ REAL8 1234.56789012346
363 cycles for float$ REAL10 1234.56789012346
722 cycles for R8ToStr    1234.56789012346
1245 cycles for Ray's lib 1234.567890
4358 cycles for sprintf  1234.568

---------
454 cycles for FloatToStr 1.234568e+123
370 cycles for float$ REAL4 1.23456789275539e+23
388 cycles for float$ REAL8 1.23456789012346e+123
396 cycles for float$ REAL10 1.23456789012346e+123
3588 cycles for R8ToStr    1.23456789012346e+123
1233 cycles for Ray's lib 1.234567890123457E+0123
5337 cycles for sprintf  1.234568e+123

---------
442 cycles for FloatToStr -1.234568e-123
386 cycles for float$ REAL4 -1.23456786887352e-23
392 cycles for float$ REAL8 -1.23456789012346e-123
395 cycles for float$ REAL10 -1.23456789012346e-123
2665 cycles for R8ToStr    -1.23456789012346e-123
1168 cycles for Ray's lib -0.000000
5781 cycles for sprintf  -1.234568e-123

---------
8 cycles for FloatToStr 0
63 cycles for float$ REAL4 0
58 cycles for float$ REAL8 0
61 cycles for float$ REAL10 0
14 cycles for R8ToStr    0
398 cycles for Ray's lib ERROR
581 cycles for sprintf  0



Regards herge
// Herge born  Brussels, Belgium May 22, 1907
// Died March 3, 1983
// Cartoonist of Tintin and Snowy

ToutEnMasm

Hello,
Perhaps someone can also test some functions of the strsafe.lib ?.
a sample is here
http://www.masm32.com/board/index.php?topic=8022.msg58718#msg58718

drizz

#7
Update to my function for converting floats without fpu.

Now it's faster than all other  :dance: (it's also bigger  :lol)

Converts binary float to decimal float with two 64bit multiplications with precalculated values (If anyone is interested in Mathematica formulas (trivial) for making tables i will post them ).
Basically exponent is divided by 64, quotient is used to reduce large part, and remainder for remaining part.

$X_{(10)} \cdot 2^{\mathrm{binExp}} \;\rightarrow\; Y_{(10)} \cdot 10^{\mathrm{decExp}}$

Y(10) = (X(10) * Table1[binExp/64]) * Table2[binExp%64]
decExp = Table3[binExp/64] + Table4[binExp%64]

Y is then converted using my uint64tostr function.

X/Y is the mantissa scaled to 64bits.

check it out!

updated attachment

[attachment deleted by admin]
The truth cannot be learned ... it can only be recognized.

dedndave

very cool Drizz - let me play with it for a while...

jj2007

Yes it's cool :U

Celeron M:

Testing float$
        1234567890123456789 digits precision
PI      3.14159265358979323846 (there are many more digits...)
Str$    3.14159265358979324
crt     3.14159265358979310       (CRT printf or sprintf)
R8ToStr 3,14159265358979          (drizz)
FloatTo 3.141593                  (Masm32 lib FloatToStr)

Log2(e) 1.4426950408889634070
Str$    1.44269504088896341
crt     1.44269504088896340
R8ToStr 1,44269504088896          (drizz)
FloatTo 1.442695

Lg2(10) 3.3219280948873623480
Str$    3.32192809488736235
crt     3.32192809488736220
R8ToStr 3,32192809488736          (drizz)
FloatTo 3.321928

Lg10(2) 0.3010299956639811952
Str$    0.301029995663981195
crt     0.30102999566398120
R8ToStr 0,301029995663981         (drizz)
FloatTo 0.30103

Lge(2)  0.6931471805599453094
Str$    0.6931471805599453095
crt     0.69314718055994529
R8ToStr 0,693147180559945         (drizz)
FloatTo 0.6931472

Testing float$/printf
PI      3.1415926535897932384626433832795
Str$    3.14159265358979324
crt     3.14159265358979310

295     Str$('%7f', MyPI)       3.141593
474     cycles for Str$         3.14159265358979
363     cycles for R8ToStr      3,14159265358979
611     cycles for FloatToStr
4307    cycles for crt_sprintf
96      cycles for dwtoa

[attachment deleted by admin]

drizz

Small formatting bug found, please re-download.
The truth cannot be learned ... it can only be recognized.

ToutEnMasm

Quote
                Intel(R) Celeron(R) CPU 2.80GHz
Microsoft Windows XP Professional Build Service Pack 3 2600

Testing float$
        1234567890123456789 digits precision
PI      3.14159265358979323846 (there are many more digits...)
Str$    3.14159265358979324
crt     3.14159265358979310       (CRT printf or sprintf)
R8ToStr 3,14159265358979          (drizz)
FloatTo 3.141593                  (Masm32 lib FloatToStr)

Log2(e) 1.4426950408889634070
Str$    1.44269504088896341
crt     1.44269504088896340
R8ToStr 1,44269504088896          (drizz)
FloatTo 1.442695

Lg2(10) 3.3219280948873623480
Str$    3.32192809488736235
crt     3.32192809488736220
R8ToStr 3,32192809488736          (drizz)
FloatTo 3.321928

Lg10(2) 0.3010299956639811952
Str$    0.301029995663981195
crt     0.30102999566398120
R8ToStr 0,301029995663981         (drizz)
FloatTo 0.30103

Lge(2)  0.6931471805599453094
Str$    0.6931471805599453095
crt     0.69314718055994529
R8ToStr 0,693147180559945         (drizz)
FloatTo 0.6931472

Testing float$/printf
PI      3.1415926535897932384626433832795
Str$    3.14159265358979324
crt     3.14159265358979310

535     Str$('%7f', MyPI)       3.141593
850     cycles for Str$         3.14159265358979
605     cycles for R8ToStr      3,14159265358979
2208    cycles for FloatToStr
7648    cycles for crt_sprintf
152     cycles for dwtoa


Farabi

 :dazzled:
OMG. Thats 8 times faster than MS sprintf.
Those who had universe knowledges can control the world by a micro processor.
http://www.wix.com/farabio/firstpage

"Etos siperi elegi"

Astro

I wish I understood this stuff (it's not the language in this case...).

Even though I can't see how it works, I understand less cycles = faster, so awesome work on that!  :U

Best regards,
Astro.

drizz

Well i hope this helps you understand better.

bits:
SEEEEEEEEEEEFFFFFFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
10000000000000000000000000000000 00000000000000000000000000000000 ; sign
01111111111100000000000000000000 00000000000000000000000000000000 ; exponent
00000000000011111111111111111111 11111111111111111111111111111111 ; fraction
00000000000100000000000000000000 00000000000000000000000000000000 ; implied bit

Memory Layout for REAL8 would look like this (little endian):

dwords:
DD FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF,SEEEEEEEEEEEFFFFFFFFFFFFFFFFFFFF
bytes:
DB FFFFFFFF,FFFFFFFF,FFFFFFFF,FFFFFFFF, FFFFFFFF,FFFFFFFF,EEEEFFFF,SEEEEEEE
bits:
   FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFFFFFFEEEEEEEEEEES

I'm going to skip all the explanation for bias and special cases and go right to the conversion.

Assume we have already subtracted the bias and checked for zero, infinity, etc.

we have a number that is represented like this:

$(S)\; F_{(2)} \times 2^{E_{(2)}}$

to convert to human base (base 10) we need to transform the number to

$(S)\; F'_{(2)} \times 10^{E'_{(2)}}$


it requires a multiplication by certain 10^N/2^M ratio (for negative E other way around, i will write about the first case),

because (2^E/2^M)=1 (where E=M),  10^N remains (i.e. 10^E')

prior to multiplication we adjust the Fraction F to be full 64bit number by shifting.

00000000000011111111111111111111 11111111111111111111111111111111 ; fraction
00000000000100000000000000000000 00000000000000000000000000000000 ; implied bit

000000000001.11111111111111111111 11111111111111111111111111111111

first we shift bits to the left by 11 , Exponent E is decreased by 11 (the bits that occupied sign and exponent)

111111111111.11111111111111111111 11111111111111111111100000000000

now we "shift" (virtual)decimal point to the right, Exponent E is decreased by 52

11111111111111111111111111111111 11111111111111111111100000000000.

only thing that is left is multiplication of F with precomputed 10^N/2^M ratio

64Bit * 64Bit = 128Bit result

Now, we don't want all 128bits of the result, only the top 64 "precise" bits.
We account for that in the ( 10^N/2^M ) ratio by adding 64 to M, so in fact we are multiplying by
10^N/2^(64+M). (chopping off lower 64 bits is the same as (shifting right) dividing by 2^64)
Furthermore, we require the ratio to be as close to 2^64 as possible so that we don't lose precision. As a consequence the values of N are not equidistant, so we need another table for the exponents N.

Doing this with only one table (one multiplication) for ratio and one for exponents is possible but would also take too much space.
So i split the operation to exact multiple and remainder (of 64) tables.

The rest is just rounding and formatting...

The truth cannot be learned ... it can only be recognized.