Here we are again.
A MASM32 only exercise [food for thought] about loops, formatting numbers
and a final call to an API, the mythical msgbox, to display the results.
We start with a routine in MASM32 that Dave provided elsewhere:
http://www.masm32.com/board/index.php?topic=13912.0
the routine executes 5 iteration of a nested loop that generates all
the combinations of 4 integer numbers ranging 1 - 100 and calculate
the total amount of combinations and the elapsed CPU cycles.
This is the original routine:
include \masm32\include\masm32rt.inc
.586
include \masm32\macros\timers.asm
LoopCount = 250
;----------------------------------------------------------
.code
_main proc
;restrict execution to a single core
INVOKE GetCurrentProcess
INVOKE SetProcessAffinityMask,eax,1
;count and display combinations
call combo
print ustr$(eax),' combinations',13,10
;run timing test 5 times
mov ecx,5
time00: push ecx
counter_begin LoopCount,REALTIME_PRIORITY_CLASS
call combo
counter_end
print ustr$(eax),' clock cycles',13,10
pop ecx
dec ecx
jnz time00
;terminate
inkey
exit
_main endp
;----------------------------------------------------------
combo proc
push ebx
push ecx
push edx
push esi
xor ebx,ebx
mov eax,ebx
combo1: inc ebx
mov ecx,ebx
combo2: inc ecx
mov edx,ecx
combo3: inc edx
mov esi,edx
combo4: inc esi
inc eax
cmp esi,100
jb combo4
cmp edx,99
jb combo3
cmp ecx,98
jb combo2
cmp ebx,97
jb combo1
pop esi
pop edx
pop ecx
pop ebx
ret
combo endp
;----------------------------------------------------------
end _main
and this is the unformatted output:
3921225 combinations
15996963 clock cycles
15819197 clock cycles
15802771 clock cycles
15769103 clock cycles
15820043 clock cycles
What are we going to do?
First we create or CALL an existing function [I actually don't know
if one is already there] to format the numbers with comma delimiters:
3921225 combinations ===> 3,921,225 combinations
That's just because I like numbers in that fashion :P
And at the end of the routine we call a Win32 API to display
the results inside a Message Box.
With Dave's permission we are going to add these small features
to the MASM32 routine.
I have to confess that I am only partially aware of how to do that,
so any help or suggestion is welcome :wink
Frank
i'm working on figuring out the way to make the number comma separated without using a new set of memory,
i am a little stuck though because you have to work backwards up the string to avoid overwriting a value you stil
l need, but i only know how to calculate where to move them going forward.
can easily make this a macro instead of function when i finish figuring it out.
comma separated unsigned-int-to string
csustr$()
include \masm32\include\masm32rt.inc
.data?
value dd ?
.data
numb DWORD 4294967295
.code
start:
cls
; i decided because ustr$ designates 20 bytes in data section
; and is only using 11 of it, and i am at most adding three commas,
; it would be ok to use the memory it provides, if your macro
; for ustr changes, you may have a problem
mov eax,ustr$(numb)
; i am sure there is a better way to
; get strlen. may even be able to calculate
; based off the dword
xor ecx,ecx
sub ecx,1
count:
add ecx,1
movzx edx,byte ptr [eax+ecx]
test edx,edx
jnz count
;0123456789ABC
;4,294,967,295
;4294967295
;9->c
;8->b
;7->a
;6->8
;5->7
;4->6
;3->4
;2->3
;1->2
;0->0
;01234
;1,000
;1000
;3->4
;2->3
;1->2
;0->0
;012345
;10,000
;10000
;4->5
;3->4
;2->3
;1->1
;0->0
movzx edx,byte ptr [eax+09h]
mov [eax+0Ch], dl
movzx edx,byte ptr[eax+08h]
mov [eax+0Bh], dl
movzx edx,byte ptr[eax+07h]
mov [eax+0Ah], dl
movzx edx,byte ptr[eax+06h]
mov [eax+08h], dl
movzx edx,byte ptr[eax+05h]
mov [eax+07h], dl
movzx edx,byte ptr[eax+04h]
mov [eax+06], dl
movzx edx,byte ptr[eax+03h]
mov [eax+04h], dl
movzx edx,byte ptr[eax+02h]
mov [eax+03h], dl
movzx edx,byte ptr[eax+01h]
mov [eax+02h], dl
mov byte ptr [eax+01h],2Ch
mov byte ptr [eax+05h],2Ch
mov byte ptr [eax+09h],2Ch
pushad
print eax,13,10
popad
print str$(ecx)
inkey
exit
end start
the C-runtime library may have a function that already does this
printf comes to mind - it does a lot of different numeric formatting
Quote from: dedndave on May 06, 2010, 03:16:02 AM
the C-runtime library may have a function that already does this
printf comes to mind - it does a lot of different numeric formatting
well i could just use another 14 bytes and do it without c runtime library :) i was just trying to figure out how to use the same 20 without too long of code.
without looking at the print macro, i don't know if it is left justified or right
at any rate, memory is not so precious that you wouldn't want to use your own buffer
i bet JJ has a routine for this already written for MASM Basic, too :bg
it isn't a hard routine to write, though
it's just that windows uses some function or other all the time to display file sizes
apparently you can sell the function for $19.99 http://www.windows7download.com/win7-insert-commas-into-numbers-in-multiple-text-files-software/qlzlwxxm.html
damn !!! - lol
well - i would start by finding the end of the string
then work toward the beginning
your function could use a right-justified buffer to make the code simple
Quote from: joemc on May 06, 2010, 03:37:24 AM
Quote from: dedndave on May 06, 2010, 03:16:02 AM
the C-runtime library may have a function that already does this
printf comes to mind - it does a lot of different numeric formatting
well i could just use another 14 bytes and do it without c runtime library :) i was just trying to figure out how to use the same 20 without too long of code.
I was thinking, last night before falling asleep, to use the 20 bytes string
returned from ustr$ to do the job:
As Dave says:
Quote from: dedndave on May 06, 2010, 04:09:42 AM
damn !!! - lol
well - i would start by finding the end of the string
then work toward the beginning
your function could use a right-justified buffer to make the code simple
We could start from the end of the string, using 3 bytes at a time.
But I think it's better to use an other variable string to build the formatted one.
source string: "23456789" ===>
target string: "xx,xxx,xxx"
1) we get the 3 rightmost character of souce string and put them into the
3 rightmost characters of target string:
"789" ===> "xx,xxx,789" using the appropriate address
2) the same with the second group of 3 characters:
"456" ====> "xx,456,789" using the appropriate address
3) the last 2 characters go to the appropriate position.
Of course we have to prebuild the target string depending on the lenght
of the source string, and we have to check if moving 3, 2, or 1 byte
when we move the leftmost group of characters.
We could use an array of prebuild target strings and use the lenght of
the variable to format as an index to the array, so we get the proper
target format without many checking, using some 50-80 bytes for the array,
but no CMP or any check at all.
It shouldn't be too difficult.
If it is a function, it returns the formatted integer number as a
comma separated string.
If it is MACRO, you know what it does, I actually don't.
If it is a Proc, it could use the variables defined in the
.data section.
That is one possible solution in ASM.
The solutions could be many, let's try some of them.
OOps, I have to delete this one. :P
Quote from: dedndave on May 06, 2010, 03:46:49 AM
i bet JJ has a routine for this already written for MASM Basic, too :bg
Well, there is no in-built function, but it is straightforward:
Quoteinclude \masm32\MasmBasic\MasmBasic.inc ; http://www.masm32.com/board/index.php?topic=12460
.data
MyFatNumber1 dq 123456789012345678 ; 123,456,789,012,345,678
MyFatNumber2 dq 12345678901234567
MyFatNumber3 dq 1234567890123456
MyFatNumber4 dq 123456789012345
.data?
buffer db 1024 dup(?)
Init
[/color] push esi
push edi
push ebx
ct = 0
REPEAT 4
ct = ct + 1
@CatStr(<mov esi, Str$(q:MyFatNumber>, %ct, <)>)
mov edi, offset buffer
push edi
push Len(esi)
cdq
mov ecx, 3
div ecx
dec edx ; edx is 0, 2, 1, 0
.if Sign? ; we need 2, 1, 0, 2
add edx, 3
.endif
pop ecx ; len of source
mov al, ","
.Repeat
movsb
dec ecx
.Break .if Zero?
dec edx
.if Sign?
stosb
add edx, 3
.endif
.Until 0
xor eax, eax
stosd ; set a zero delimiter
pop edi
Print edi, CrLf$
ENDM
pop ebx
pop edi
pop esi
Inkey Str$("\n\nYour puter has run %3f hours since the last boot, give it a break!", Timer()/3600000)
Exit
end start
123,456,789,012,345,678
12,345,678,901,234,567
1,234,567,890,123,456
123,456,789,012,345
Quote from: jj2007 on May 06, 2010, 11:08:46 AM
Quote from: dedndave on May 06, 2010, 03:46:49 AM
i bet JJ has a routine for this already written for MASM Basic, too :bg
Well, there is no in-built function, but it is straightforward:
Quoteinclude \masm32\MasmBasic\MasmBasic.inc ; http://www.masm32.com/board/index.php?topic=12460
.data
MyFatNumber1 dq 123456789012345678 ; 123,456,789,012,345,678
MyFatNumber2 dq 12345678901234567
MyFatNumber3 dq 1234567890123456
MyFatNumber4 dq 123456789012345
.data?
buffer db 1024 dup(?)
Init
[/color] push esi
push edi
push ebx
ct = 0
REPEAT 4
ct = ct + 1
@CatStr(<mov esi, Str$(q:MyFatNumber>, %ct, <)>)
mov edi, offset buffer
push edi
push Len(esi)
cdq
mov ecx, 3
div ecx
dec edx ; edx is 0, 2, 1, 0
.if Sign? ; we need 2, 1, 0, 2
add edx, 3
.endif
pop ecx ; len of source
mov al, ","
.Repeat
movsb
dec ecx
.Break .if Zero?
dec edx
.if Sign?
stosb
add edx, 3
.endif
.Until 0
xor eax, eax
stosd ; set a zero delimiter
pop edi
Print edi, CrLf$
ENDM
pop ebx
pop edi
pop esi
Inkey Str$("\n\nYour puter has run %3f hours since the last boot, give it a break!", Timer()/3600000)
Exit
end start
123,456,789,012,345,678
12,345,678,901,234,567
1,234,567,890,123,456
123,456,789,012,345
Thanks jj :U
So it's time to have a look at your masmbasic library :P
For an old friend of BASIC dialects like me, it should be fine.
may I ask you if there are the asm sources of your lib
functions in order to study them?
Frank
First off i want to make clear that this is by no means optimized code for anything, it was thrown together just to see if it would work:
;
;Test of string formatting routine.
;
;Input string = 1234567890123456789012345
; (25 decimal characters)
;
;Output string = 1,234,567,890,123,456,789,012,345
; (33 decimal characters)
;
;Press enter to exit...
FormatString PROTO pInput:DWORD,pOutput:DWORD
FormatString PROC pInput:DWORD,pOutput:DWORD
;Get length of string
mov edi,pInput
xor eax,eax
or ecx,-1
repne scasb
add ecx,1
not ecx
;Get number of iterations plus remainder
mov eax,ecx
mov ecx,3
xor edx,edx
div ecx
;Set source and destination registers, string length and then move across remainder as left most of string
mov esi,pInput
mov edi,pOutput
mov ecx,edx
rep movsb
;Set counter to number of iterations
mov ecx,eax
;Set EDX to AND mask
mov edx,0FFFFFFh
;We'll start each part here with the ',' and then take a full DWORD from EDI as we know there is the NULL byte at the end.
mov ebx,','
@@ThreeCharLoop:
;Save delimiter
mov byte ptr [edi],bl
add edi,1
;Load source DWORD
lodsd
;Strip upper byte of EAX with AND mask in EDX
and eax,edx
;Save DWORD to destination as null terminated 3 char string
stosd
;There is a byte overlap from the LODSD/STOSD, decrement source and destination by one byte
sub esi,1
sub edi,1
;Loop like this until ECX == 0
sub ecx,1
jnz @@ThreeCharLoop
;Get length of output string
mov edi,pOutput
xor eax,eax
or ecx,-1
repne scasb
add ecx,1
not ecx
;Set return value
mov eax,ecx
Ret
FormatString EndP
HR,
Ghandi
Another thought also, you could eliminate the destination parameter and use stack memory to accommodate the input string, copy the input and then build the output string in the input buffer using the stack copy for reference. Vice versa, you could use the stack memory to hold the temp string and then once formatting has taken place you could copy the resulting string across to the input buffer.
HR,
Ghandi
Quote from: Ghandi on May 06, 2010, 12:17:31 PM
First off i want to make clear that this is by no means optimized code for anything, it was thrown together just to see if it would work:
;
;Test of string formatting routine.
;
;Input string = 1234567890123456789012345
; (25 decimal characters)
;
;Output string = 1,234,567,890,123,456,789,012,345
; (33 decimal characters)
;
;Press enter to exit...
FormatString PROTO pInput:DWORD,pOutput:DWORD
FormatString PROC pInput:DWORD,pOutput:DWORD
;Get length of string
mov edi,pInput
xor eax,eax
or ecx,-1
repne scasb
add ecx,1
not ecx
;Get number of iterations plus remainder
mov eax,ecx
mov ecx,3
xor edx,edx
div ecx
;Set source and destination registers, string length and then move across remainder as left most of string
mov esi,pInput
mov edi,pOutput
mov ecx,edx
rep movsb
;Set counter to number of iterations
mov ecx,eax
;Set EDX to AND mask
mov edx,0FFFFFFh
;We'll start each part here with the ',' and then take a full DWORD from EDI as we know there is the NULL byte at the end.
mov ebx,','
@@ThreeCharLoop:
;Save delimiter
mov byte ptr [edi],bl
add edi,1
;Load source DWORD
lodsd
;Strip upper byte of EAX with AND mask in EDX
and eax,edx
;Save DWORD to destination as null terminated 3 char string
stosd
;There is a byte overlap from the LODSD/STOSD, decrement source and destination by one byte
sub esi,1
sub edi,1
;Loop like this until ECX == 0
sub ecx,1
jnz @@ThreeCharLoop
;Get length of output string
mov edi,pOutput
xor eax,eax
or ecx,-1
repne scasb
add ecx,1
not ecx
;Set return value
mov eax,ecx
Ret
FormatString EndP
HR,
Ghandi
Thanks Ghandi. :U
This is also very good, it is MASM without many external functions
or lib, and a good exercise to see what we can do with MASM instructions.
I've a lot to study now with 3 different samples of how we can do the
number formatting task. :P
Quote from: Ghandi on May 06, 2010, 12:27:23 PM
Another thought also, you could eliminate the destination parameter and use stack memory to accommodate the input string, copy the input and then build the output string in the input buffer using the stack copy for reference. Vice versa, you could use the stack memory to hold the temp string and then once formatting has taken place you could copy the resulting string across to the input buffer.
HR,
Ghandi
Even more to study :bg
Quote from: frktons on May 06, 2010, 11:27:51 AM
So it's time to have a look at your masmbasic library :P
For an old friend of BASIC dialects like me, it should be fine.
may I ask you if there are the asm sources of your lib
functions in order to study them?
Frank
Frank,
PM me your email address. Be warned, it's fairly complex stuff...
It is also important to note that the routine i posted above does not preserve registers but should, so i would change the function header with:
FormatString PROC uses ebx edx esi edi pInput:DWORD,pOutput:DWORD
Quote from: Ghandi on May 06, 2010, 02:05:01 PM
It is also important to note that the routine i posted above does not preserve registers but should, so i would change the function header with:
FormatString PROC uses ebx edx esi edi pInput:DWORD,pOutput:DWORD
Thanks Ghandi,
as I have time I'll give it a try. :U
While I was travelling for work, I came up with a basic idea for
formatting huge numbers [qword ones] or 32 bit integer numbers
with basic MASM OPCODES.
Let's start with the simpler ones, 32 bit numbers.
The idea is to allocate an array of string bytes [10 elements]
with value: "0" ... "9", and DIVide the 32 bit value by ten
using the remainder to get the corresponding byte of the array,
and placing it into the buiding formatted string, starting from right to left.
The quotient of the DIV is used for the next DIV until it
becomes zero.
Counting three bytes, before the 4th we place a "," and the counter
is zeroed.
What do you think?
Am I using the worst of all possible algos?
Could it be done just for exercising some basic opcodes?
Is it better to use binary or BCD format numbers for this task?
Should I use three 32 bits general registers for dividend, divisor, quotient?
For 64 bit numbers is it better to use MMX registers?
Have they any particular opcodes I have to be aware of?
Thanks
Frank
Here's a code fragment using basic opcodes, the IDIV's are quite expensive, but it should be easy enough to follow.
I make no claims that it is super efficient.
It uses the stack to deal with the fact digits come out backward from the division.
I divide by 1000 to get groups of three digits, and then divide that remainder by 10 three times to get the digits out.
It basically tests for 0-9, 0-99, or 0-999 to determine how many digits it needs when then current value is less than 1000
Personally I'd use a pair of IDIV's to handle 64-bit values.
; EAX = 32-bit number
; ESI = string buffer for NUL terminated ASCII
; Uses ESI,EDI,EAX,ECX,EDX
push 0 ; Mark stack end with NUL
divloop:
mov ecx,1000 ; Divide into 3 digit groups
xor edx,edx ; Clear high order 32-bit for divide
idiv ecx ; eax = edx:eax / ecx, edx = edx:eax % ecx
mov edi,eax ; Save division result
mov ecx,10 ; Subdivide in 10's
mov eax,edx ; Get remainder
or edi,edi ; Still number left, so at least 3 digits in remainder
jnz digit000
cmp eax,10 ; remainder has one digit
jb digit0
cmp eax,100 ; remainder has two digits
jb digit00
digit000: ; 3 digits
xor edx,edx ; Clear high order 32-bit for divide
idiv ecx ; eax = edx:eax / ecx, edx = edx:eax % ecx
add edx,30h ; += '0'
push edx ; Stack
digit00: ; 2 digits
xor edx,edx ; Clear high order 32-bit for divide
idiv ecx ; eax = edx:eax / ecx, edx = edx:eax % ecx
add edx,30h ; += '0'
push edx ; Stack
digit0: ; 1 digit
xor edx,edx ; Clear high order 32-bit for divide
idiv ecx ; eax = edx:eax / ecx, edx = edx:eax % ecx
add edx,30h ; += '0'
push edx ; Stack
mov eax,edi ; Recover remaining number
or eax,eax ; Zero?
jz poploop
push 2Ch ; Comma added to groups of three digits
jmp divloop
poploop:
pop eax ; Recover next digit
mov [esi],al ; Add to string
inc esi
or eax,eax ; Was it a NUL?
jnz poploop
1
12
128
1,000
10,000
100,000
256,256,256
1,234,567,890
0
4,294,967,295
Quote from: clive on May 14, 2010, 05:28:49 PM
Here's a code fragment using basic opcodes, the IDIV's are quite expensive, but it should be easy enough to follow.
I make no claims that it is super efficient.
It uses the stack to deal with the fact digits come out backward from the division.
I divide by 1000 to get groups of three digits, and then divide that remainder by 10 three times to get the digits out.
It basically tests for 0-9, 0-99, or 0-999 to determine how many digits it needs when then current value is less than 1000
Personally I'd use a pair of IDIV's to handle 64-bit values.
; EAX = 32-bit number
; ESI = string buffer for NUL terminated ASCII
; Uses ESI,EDI,EAX,ECX,EDX
push 0 ; Mark stack end with NUL
divloop:
mov ecx,1000 ; Divide into 3 digit groups
xor edx,edx ; Clear high order 32-bit for divide
idiv ecx ; eax = edx:eax / ecx, edx = edx:eax % ecx
mov edi,eax ; Save division result
mov ecx,10 ; Subdivide in 10's
mov eax,edx ; Get remainder
or edi,edi ; Still number left, so at least 3 digits in remainder
jnz digit000
cmp eax,10 ; remainder has one digit
jb digit0
cmp eax,100 ; remainder has two digits
jb digit00
digit000: ; 3 digits
xor edx,edx ; Clear high order 32-bit for divide
idiv ecx ; eax = edx:eax / ecx, edx = edx:eax % ecx
add edx,30h ; += '0'
push edx ; Stack
digit00: ; 2 digits
xor edx,edx ; Clear high order 32-bit for divide
idiv ecx ; eax = edx:eax / ecx, edx = edx:eax % ecx
add edx,30h ; += '0'
push edx ; Stack
digit0: ; 1 digit
xor edx,edx ; Clear high order 32-bit for divide
idiv ecx ; eax = edx:eax / ecx, edx = edx:eax % ecx
add edx,30h ; += '0'
push edx ; Stack
mov eax,edi ; Recover remaining number
or eax,eax ; Zero?
jz poploop
push 2Ch ; Comma added to groups of three digits
jmp divloop
poploop:
pop eax ; Recover next digit
mov [esi],al ; Add to string
inc esi
or eax,eax ; Was it a NUL?
jnz poploop
1
12
128
1,000
10,000
100,000
256,256,256
1,234,567,890
0
4,294,967,295
Thanks Clive, this routine will help me a lot
to understand how to use the stack and the DIV/IDIV opcodes. :U
Actually, the last divide by 10 isn't necessary, as eax will be 0-9 at this point.
digit0: ; 1 digit
add eax,30h ; += '0'
push eax ; Stack
mov eax,edi ; Recover remaining number
Quote from: clive on May 15, 2010, 01:09:47 AM
Actually, the last divide by 10 isn't necessary, as eax will be 0-9 at this point.
digit0: ; 1 digit
add eax,30h ; += '0'
push eax ; Stack
mov eax,edi ; Recover remaining number
Very good auto-optimization :clap:
Quote from: clive on May 14, 2010, 05:28:49 PM
It uses the stack to deal with the fact digits come out backward from the division.
This is also a good use of the stack structure :U
Quote from: clive on May 14, 2010, 05:28:49 PM
Personally I'd use a pair of IDIV's to handle 64-bit values.
You are not talking about 64 bit MMX registers, are you?
Actually I am not sure how to perform 64 bits DIV/IDIV operations. :red
Would you be so kind to show me an example?
Thanks
Frank
DIV and IDIV are slow instructions
if you are trying to make a faster routine, there are a few tricks that can be used
one is to multiply by the reciprocal of the divisor (MUL instructions are fairly fast)
this is called "multiply-to-divide" or sometimes called "magic number division"
so, instead of dividing by 10, multiply the value by n^2/10, where n^2 is some power of 2 (convenient to use a register width if practical)
another trick is to use a larger divisor to extract more than one digit at a time
dividing by 10,000 will get you 4 decimal digits at a time, for example
this can also prevent divide overflow problems encountered when dividing large values by small ones
then, there is Horner's Rule or, as i like to call it, Ling Long Kai Fang
it is a little bit involved to explain, but it is really a simple math identity that is worth knowing
some time ago, i wrote a few routines to convert large integers to ASCII decimal strings
the routines combine both Ling Long Kai Fang and Multiply-to-Divide techniques
http://www.masm32.com/board/index.php?topic=12363.msg94779#msg94779
download the LLKF9_1 attachment from that thread for some code
i am not a super-duper programmer, but the routines work fairly well :lol
they use no SSE or FPU instructions, so there is plenty of room for improvement
Dividing a 64-bit number with a 32-bit number, getting at 64-bit quotient, and 32-bit remainder.
; edx:eax = edx:eax / ecx
; ebx = edx:eax % ecx
mov ebx,eax ; save lower bits
mov eax,edx ; move high order bits into lower bits
xor edx,edx ; clear upper bits
div ecx ; long division, handle high order first
xchg eax,ebx ; swap low order with quotient
div ecx ; then low order using high order remainder
xchg edx,ebx ; switch high order quotient with remainder
Equivalent to previous, using reciprocals.
; EAX = 32-bit number
; ESI = string buffer for NUL terminated ASCII
; Uses ESI,EDI,EAX,ECX,EDX
push 0
divloop:
mov edx,04189374Ch ; 2^40/1,000
mov ecx,eax ; save the dividend
mul edx ; multiply to divide
shr edx,8 ; adjust the quotient
imul eax,edx,1000 ; find the product
sub ecx,eax ; subtract the difference
; edx = quotient
; ecx = remainder
mov eax,ecx
mov edi,edx
or edi,edi
jnz digit000
cmp eax,10
jb digit0
cmp eax,100
jb digit00
digit000:
mov edx,0CCCCCCCDh ; 2^35/10
mov ecx,eax ; save the dividend
mul edx ; multiply to divide
shr edx,3 ; adjust the quotient
imul eax,edx,10 ; find the product
sub ecx,eax ; subtract the difference
;edx = quotient
;ecx = remainder
mov eax,edx
add ecx,30h
push ecx
digit00:
mov edx,0CCCCCCCDh ; 2^35/10
mov ecx,eax ; save the dividend
mul edx ; multiply to divide
shr edx,3 ; adjust the quotient
imul eax,edx,10 ; find the product
sub ecx,eax ; subtract the difference
;edx = quotient
;ecx = remainder
mov eax,edx
add ecx,30h
push ecx
digit0:
add eax,030h
push eax
mov eax,edi
or eax,eax
jz poploop
push 2Ch
jmp divloop
poploop:
pop eax
mov [esi],al
inc esi
or eax,eax
jnz poploop
As Dave indicates MUL instructions are much faster, they can get pipelined to produce a result in 5-10 cycles, verses 75-100 cycles of divides. This is an order of magnitude comparison, and will obviously vary between CPU architectures.
The divide by 1,000 was to get the comma spacing efficiently, we could divide by 1,000,000 and then subdivide that. There are many ways to do it, and there is a trade off between compactness, clarity, and efficiency.
Divide by 10,000 works well if you don't want commas and can handle 4 digits at a time.
Quote from: dedndave on May 15, 2010, 12:42:13 PM
DIV and IDIV are slow instructions
if you are trying to make a faster routine, there are a few tricks that can be used
Hi Dave, I suspect there are lots of ways to optimize a slow DIV operation.
Actually I'm just trying to understand how DIV/IDIV work. After that, I'll
jump into speeding up the routine with "Ling Long Kai Fang and Multiply-to-Divide techniques"
and the like. :P
Quote from: clive on May 15, 2010, 01:33:19 PM
Dividing a 64-bit number with a 32-bit number, getting at 64-bit quotient, and 32-bit remainder.
; edx:eax = edx:eax / ecx
; ebx = edx:eax % ecx
mov ebx,eax ; save lower bits
mov eax,edx ; move high order bits into lower bits
xor edx,edx ; clear upper bits
div ecx ; long division, handle high order first
xchg eax,ebx ; swap low order with quotient
div ecx ; then low order using high order remainder
xchg edx,ebx ; switch high order quotient with remainder
As I can see for a 64 bit DIV we are using a pair of 32 bit registers ::)
Does it mean the 64 bit MMX registers are slower or not fitted for the task?
::)
Quote from: clive on May 15, 2010, 01:37:17 PM
Equivalent to previous, using reciprocals.
; EAX = 32-bit number
; ESI = string buffer for NUL terminated ASCII
; Uses ESI,EDI,EAX,ECX,EDX
push 0
divloop:
mov edx,04189374Ch ; 2^40/1,000
mov ecx,eax ; save the dividend
mul edx ; multiply to divide
shr edx,8 ; adjust the quotient
imul eax,edx,1000 ; find the product
sub ecx,eax ; subtract the difference
; edx = quotient
; ecx = remainder
mov eax,ecx
mov edi,edx
or edi,edi
jnz digit000
cmp eax,10
jb digit0
cmp eax,100
jb digit00
digit000:
mov edx,0CCCCCCCDh ; 2^35/10
mov ecx,eax ; save the dividend
mul edx ; multiply to divide
shr edx,3 ; adjust the quotient
imul eax,edx,10 ; find the product
sub ecx,eax ; subtract the difference
;edx = quotient
;ecx = remainder
mov eax,edx
add ecx,30h
push ecx
digit00:
mov edx,0CCCCCCCDh ; 2^35/10
mov ecx,eax ; save the dividend
mul edx ; multiply to divide
shr edx,3 ; adjust the quotient
imul eax,edx,10 ; find the product
sub ecx,eax ; subtract the difference
;edx = quotient
;ecx = remainder
mov eax,edx
add ecx,30h
push ecx
digit0:
add eax,030h
push eax
mov eax,edi
or eax,eax
jz poploop
push 2Ch
jmp divloop
poploop:
pop eax
mov [esi],al
inc esi
or eax,eax
jnz poploop
All right, I have enough stuff to meditate upon. :P
Quote
As Dave indicates MUL instructions are much faster, they can get pipelined to produce a result in 5-10 cycles, verses 75-100 cycles of divides. This is an order of magnitude comparison, and will obviously vary between CPU architectures.
The divide by 1,000 was to get the comma spacing efficiently, we could divide by 1,000,000 and then subdivide that. There are many ways to do it, and there is a trade off between compactness, clarity, and efficiency.
Divide by 10,000 works well if you don't want commas and can handle 4 digits at a time
As a matter of fact I think the routines you proposed are exactly what I need
for the time being, the optimization task can wait some days more :U
Thanks to both of you for your help. :bg
Frank
ok - it's good to know how to use DIV, too :bg
as Clive suggested, you can "cascade" or "chain" DIV instructions together to divide larger quotients
the real limiting factor is the size of the divisor
if it is larger than machine-size (32 bits for many of us), you have to find some alternative to DIV/IDIV
Randy Hyde gave a pretty nice treatment on the subject in AoA
http://oopweb.com/Assembly/Documents/ArtOfAssembly/Volume/Chapter_9/CH09-4.html
With the little knowledge of C syntax I learned recently,
I assembled a routine, a function, to format integer numbers
with sign, comma or point or other separators, not only
british/american style, and the possibility to align the
resulting string towards right or left.
Now I'm trying to optimize it, so maybe a better algorithm
or some Assembly could help.
In the attached file there is the source code and the exe
compiled with Pelles' C.
The routine tests 50 million cycles of formatting an integer:
-1234567890
and takes about 6-7 ''.
The strange and impressive thing is that somebody, in Cprogramming
forum where I posted the function, compiled it under Linux with GCC
and got [with the same code] a performace of 50:1 compared with
mine on Win7 and Pelles'C. That was very impressive. Just using
Linux and a different compiler gives the same code an incredible boost. :eek
I'm not able to evaluate these results, but you have far more
experience than me so you could help me to understand better
what happened, and suggest me a better algorithm or some Assembly
to improve the performance.
At least I'd be happy to get the same speed Linux-guys have on
their systems with GCC optimization.
Any idea to improve the performance?
Thanks
Perhaps you should look at the assembler code emitted by GCC to understand what it is doing. (-S option as I recall, generates a .S file at least for my cygwin hosted GCC MIPS compiler)
Unless you are measuring TIME elapsed on the same processor (model, speed), it is not going to be a very effective measure. Your code takes 17 seconds on my system and I didn't even recompile it.
-Clive
Quote from: frktons on July 05, 2010, 04:34:51 PM
The strange and impressive thing is that somebody, in Cprogramming
forum where I posted the function, compiled it under Linux with GCC
and got [with the same code] a performace of 50:1 compared with
mine on Win7 and Pelles'C. That was very impressive. Just using
Linux and a different compiler gives the same code an incredible boost. :eek
Incredible indeed. The snippet below takes 8.9 seconds as compared to 12 seconds for yours. If I take Masm32 str$ instead of MasmBasic Str$, it can be done in just over 4 seconds - Str$() is an allrounder, str$ swallows integers only. Now maybe, if we put Lingo on it, that could be reduced to 2 seconds, but 12/50=0.24 seconds. No way! My advice: Test the output of that miraculous Linux proggie... or rather: run it through Olly. I would not be surprised if the compiler optimised a little bit by putting a fixed string or similar ;-)
include \masm32\MasmBasic\MasmBasic.inc
.data
num dd -1234567890
Init
push Timer()
For_ n=1 To 50000000
void Str$(num)
Next
void Timer()
pop edx
sub eax, edx
Inkey Str$("\nYour puter needed %f seconds for showing TheNum: ", eax/1000), Str$(num)
Exit
end start
Quote from: clive on July 06, 2010, 04:10:35 PM
Perhaps you should look at the assembler code emitted by GCC to understand what it is doing. (-S option as I recall, generates a .S file at least for my cygwin hosted GCC MIPS compiler)
Unless you are measuring TIME elapsed on the same processor (model, speed), it is not going to be a very effective measure. Your code takes 17 seconds on my system and I didn't even recompile it.
-Clive
In order to reveal some tricks used by GCC, I'm now going
to test the function with random generated numbers, to see if
the Linux/GCC is still capable of similar optimization with -O3 flag
as they said.
Quote from: jj2007 on July 06, 2010, 05:37:13 PM
Incredible indeed. The snippet below takes 8.9 seconds as compared to 12 seconds for yours. If I take Masm32 str$ instead of MasmBasic Str$, it can be done in just over 4 seconds - Str$() is an allrounder, str$ swallows integers only. Now maybe, if we put Lingo on it, that could be reduced to 2 seconds, but 12/50=0.24 seconds. No way! My advice: Test the output of that miraculous Linux proggie... or rather: run it through Olly. I would not be surprised if the compiler optimised a little bit by putting a fixed string or similar ;-)
include \masm32\MasmBasic\MasmBasic.inc
.data
num dd -1234567890
Init
push Timer()
For_ n=1 To 50000000
void Str$(num)
Next
void Timer()
pop edx
sub eax, edx
Inkey Str$("\nYour puter needed %f seconds for showing TheNum: ", eax/1000), Str$(num)
Exit
end start
Well, I was really impressed, so let's find out if there is a trick of the tail
somewhere for using always the same number, changing it with random ones. :green
Quote from: frktons on July 06, 2010, 06:29:08 PM
Well, I was really impressed, so let's find out if there is a trick of the tail
somewhere for using always the same number, changing it with random ones. :green
To find the same number you just run a cmp LastNum -> je and skip any other calculations
or use a jump table
Quote from: oex on July 06, 2010, 06:32:38 PM
Quote from: frktons on July 06, 2010, 06:29:08 PM
Well, I was really impressed, so let's find out if there is a trick of the tail
somewhere for using always the same number, changing it with random ones. :green
To find the same number you just run a cmp LastNum -> je and skip any other calculations
or use a jump table
With random numbers, changing them all the time I call the function, this
trick will not work? Jump table should be very big for covering all the
numbers, maybe dividing by 1000, and using the remainder for a smaller
jump table could be doable in order to improve the performance.
Sorry i misunderstood your post.... For the same number it would work for random numbers it wouldnt.... My suggestion was to explain how linux could be 50 times faster for the same number....
I didnt read up to the top only based it on your code you (er jj) just posted
For random numbers it would be difficult to make it any faster however for incrementing numbers there is plenty of ability to increase the speed if it is known the number is simply incrementing
str$ runs dwtoa function however 40000-40009 needs only one byte incremented etc not the full DW to A function run
ie cmp byte 2 (from right), 9 -> jne inc byte
As Clive already wrote, try to get hold of the assembler file generated. Or just use the loop counter to eliminate the "optimisation":
For_ n=1 To 50000000
void Str$(n)
Next
If the compiler can roll the MOD,DIV operations into a single operation it would double the speed.
I don't think your code handles the ZERO/SIGN cases particularly well, or the NUL termination of the string. You could eliminate the 'zero' flag if you are not using it anywhere else, and probably address the initialization of the string better. The 'neg' flag could be held locally.
Here was a quick hack of the code
void int_format(int num)
{
int x = 0;
int y = 0;
int remain = 0; // integer variable to store the remainder
int count = 0; // integer counter for positioning the separator
const int len_str = sizeof(buffer) - 1; // string length less the terminator NULL
const int ten = 10;
x = len_str;
if (num != 0)
{
zero = FALSE;
if (num < 0)
{
neg = TRUE;
num = -num; // transform number to positive if negative
}
else
neg = FALSE;
while(num)
{
if (count == 3)
{
count = 0;
buffer[x--] = sep;
}
buffer[x--] = '0' + (char)(num % ten);
num = num / ten;
count++;
}
if (sign == TRUE)
{
if (neg == TRUE)
buffer[x] = '-';
else
buffer[x] = '+';
}
else
x++;
}
else
{
buffer[x] = '0';
zero = TRUE;
}
if (alignment == 'L')
{
if (x == 0)
return;
for (y = 0; y < sizeof(buffer); y++, x++)
{
buffer[y] = buffer[x];
if (buffer[y] == '\0')
return;
} // end for
}
return;
}
Quote from: clive on July 06, 2010, 09:05:03 PM
If the compiler can roll the MOD,DIV operations into a single operation it would double the speed.
I don't think your code handles the ZERO/SIGN cases particularly well, or the NUL termination of the string. You could eliminate the 'zero' flag if you are not using it anywhere else, and probably address the initialization of the string better. The 'neg' flag could be held locally.
Here was a quick hack of the code
void int_format(int num)
{
int x = 0;
int y = 0;
int remain = 0; // integer variable to store the remainder
int count = 0; // integer counter for positioning the separator
const int len_str = sizeof(buffer) - 1; // string length less the terminator NULL
const int ten = 10;
x = len_str;
if (num != 0)
{
zero = FALSE;
if (num < 0)
{
neg = TRUE;
num = -num; // transform number to positive if negative
}
else
neg = FALSE;
while(num)
{
if (count == 3)
{
count = 0;
buffer[x--] = sep;
}
buffer[x--] = '0' + (char)(num % ten);
num = num / ten;
count++;
}
if (sign == TRUE)
{
if (neg == TRUE)
buffer[x] = '-';
else
buffer[x] = '+';
}
else
x++;
}
else
{
buffer[x] = '0';
zero = TRUE;
}
if (alignment == 'L')
{
if (x == 0)
return;
for (y = 0; y < sizeof(buffer); y++, x++)
{
buffer[y] = buffer[x];
if (buffer[y] == '\0')
return;
} // end for
}
return;
}
Thanks clive, it looks a little bit faster with your hack. :U
Yes something is to refine a little to make it work properly. :P
I have a strange display, with L after the formatted numbers,
do you know what they are? Errors or normal signs for Long numbers?
Testing version : 0.60
----------------------
The value of num is: -1234567890
The formatted value of num is: -1.234.567.890LL
Elapsed ClickTime: 1.404L
to perform 10.000.000L cycles of the formatting function
Pelles' C likes TRUE/FALSE in lower letters: true/false, and
I don't know why the stdbool.h was written this way.
This is faster than the lookup table?
buffer[x--] = '0' + (char)(num % ten);
In a later version I initialize the buffer this way:
char buffer[15] = {'\0'}; // string array for the formatted number
// 10 digits max + 3 separators max + sign + NULL
Is it better this way?
Microsoft C uses BOOL/TRUE/FALSE, I don't use Pelle
The 'L' are probably because you don't terminate the string. Most of the code is predicated on the fact that buffer[14] is NUL, but is never set as such. The "char buffer[15] = {'\0'};" does not achieve this, the first character is a NUL, but you are filling the buffer from the back.
x = len_str;
buffer[x--] = 0; // ADD THIS
if (num != 0)
A simple register/immediate addition should be more efficient than a memory read. A table would help above base 10 due to the non-linearity of the ASCII translation. The overall cost of the table might be hidden. The most time consuming operation is the divide and modulus function. While DIV will provide both, most C compilers aren't going to fold the '/' and '%' pair together.
Microsoft C/C++ 12.00 optimized, 1 IDIV, 1 reciprocal IMUL (12 seconds, 50M iterations)
; 143 : buffer[x--] = '0' + (char)(num % ten);
00232 8b c1 mov eax, ecx
00234 bd 0a 00 00 00 mov ebp, 10 ; 0000000aH
00239 99 cdq
0023a f7 fd idiv ebp
NOTE EAX here is num / ten
; 144 :
; 145 : num = num / ten;
0023c b8 67 66 66 66 mov eax, 1717986919 ; 66666667H
00241 80 c2 30 add dl, 48 ; 00000030H
00244 88 96 00 00 00
00 mov BYTE PTR _buffer[esi], dl
0024a f7 e9 imul ecx
0024c c1 fa 02 sar edx, 2
0024f 8b c2 mov eax, edx
00251 4e dec esi
00252 c1 e8 1f shr eax, 31 ; 0000001fH
00255 03 d0 add edx, eax
; 146 :
; 147 : count++;
00257 47 inc edi
00258 8b ca mov ecx, edx
0025a 85 c9 test ecx, ecx
0025c 75 c6 jne SHORT $L52929
Microsoft C/C++ 12.00 unoptimized, 2 IDIV (25 seconds, 50M iterations)
; 143 : buffer[x--] = '0' + (char)(num % ten);
003b1 8b 45 08 mov eax, DWORD PTR _num$[ebp]
003b4 99 cdq
003b5 f7 7d fc idiv DWORD PTR _ten$[ebp]
003b8 0f be ca movsx ecx, dl
003bb 83 c1 30 add ecx, 48 ; 00000030H
003be 8b 55 f0 mov edx, DWORD PTR _x$[ebp]
003c1 88 8a 00 00 00
00 mov BYTE PTR _buffer[edx], cl
003c7 8b 45 f0 mov eax, DWORD PTR _x$[ebp]
003ca 83 e8 01 sub eax, 1
003cd 89 45 f0 mov DWORD PTR _x$[ebp], eax
; 144 :
; 145 : num = num / ten;
003d0 8b 45 08 mov eax, DWORD PTR _num$[ebp]
003d3 99 cdq
003d4 f7 7d fc idiv DWORD PTR _ten$[ebp]
003d7 89 45 08 mov DWORD PTR _num$[ebp], eax
; 146 :
; 147 : count++;
003da 8b 4d f8 mov ecx, DWORD PTR _count$[ebp]
003dd 83 c1 01 add ecx, 1
003e0 89 4d f8 mov DWORD PTR _count$[ebp], ecx
; 148 : }
003e3 eb a1 jmp SHORT $L52885
Quote from: clive on July 07, 2010, 03:06:15 AM
Microsoft C uses BOOL/TRUE/FALSE, I don't use Pelle
The 'L' are probably because you don't terminate the string. Most of the code is predicated on the fact that buffer[14] is NUL, but is never set as such. The "char buffer[15] = {'\0'};" does not achieve this, the first character is a NUL, but you are filling the buffer from the back.
x = len_str;
buffer[x--] = 0; // ADD THIS
if (num != 0)
Well, according to what I've read so far, I had the naive assumption
that
char buffer[15] = {'\0'} was referring to all the bytes
of the string array, not only the first. ::)
I'll add the code you suggest and see what happens as soon as I get
home and put my hands on my pc. :P
Edit: OK clive, it worked, thanks :U
And the new version, smarter one, posted by malcom on
cprogramming forum:
int iMalcsUltimateFormat(long snum, char s[])
{
char *ps = s;
unsigned long num1 = snum, num2, num3, div;
if (snum < 0) {
*ps++ = '-';
num1 = -snum;
}
if (num1 < 10000) {
if (num1 < 10) goto L1;
if (num1 < 100) goto L2;
if (num1 < 1000) goto L3;
} else {
num2 = num1 / 10000;
num1 -= num2 * 10000;
if (num2 < 10000) {
if (num2 < 10) goto L5;
if (num2 < 100) goto L6;
if (num2 < 1000) goto L7;
} else {
num3 = num2 / 10000;
num2 -= num3 * 10000;
if (num3 >= 10) {
*ps++ = '0' + (char)(div = (num3*6554UL)>>16); num3 -= div*10;
*ps++ = ',';
}
*ps++ = '0' + (char)(num3);
}
*ps++ = '0' + (char)(div = (num2*8389UL)>>23); num2 -= div*1000;
L7:
*ps++ = '0' + (char)(div = (num2*5243UL)>>19); num2 -= div*100;
*ps++ = ',';
L6:
*ps++ = '0' + (char)(div = (num2*6554UL)>>16); num2 -= div*10;
L5:
*ps++ = '0' + (char)(num2);
}
*ps++ = '0' + (char)(div = (num1*8389UL)>>23); num1 -= div*1000;
*ps++ = ',';
L3:
*ps++ = '0' + (char)(div = (num1*5243UL)>>19); num1 -= div*100;
L2:
*ps++ = '0' + (char)(div = (num1*6554UL)>>16); num1 -= div*10;
L1:
*ps++ = '0' + (char)(num1);
*ps = '\0';
return ps - s;
}
is about 3 times faster than the unoptimized, crumpled solution I
posted and clive corrected a little.
The entire pgm for testing pourpose source and exe are attached.
This is the output it gives:
Testing version : 0.60
--------------------------
The value of num is: -1234567890
The formatted value of num is: -1,234,567,890
Elapsed ClickTime: 463
to perform 10,000,000 cycles of the formatting function
It looks like a better algorithm, nearly as fast as assembly code
somebody tested here. :U
Quote from: frktons on July 07, 2010, 04:13:45 PMIt looks like a better algorithm, nearly as fast as assembly code
Here is my fastest int2str routine in assembly (Int3264ToStr.asm in attachment)
http://www.masm32.com/board/index.php?topic=9906.msg93198#msg93198
The funny thing is that my C translation produces almost identical code when compiled. :)
Though, I did carefully cater the compiler.
uint32_t uint2str(uint32_t num, uint8_t *str)
{
uint8_t *p = str;
uint32_t val = num, first5, tmp;
uint64_t w = num;
w *= 0xA7C5AC47UL;
w += 0xA7C5AC47ULL;
first5 = (w >> 48) & 0xFFFFFFFFUL;
tmp = (int32_t)first5 * 100000L;
w = first5;
val -= tmp;
w *= 0x68DB9UL;
if (first5 == 0) goto NEXT5;
if (first5 > 9999) goto DIGIT0;
if (first5 > 999) goto DIGIT1;
w = (uint32_t)(w & 0xFFFFFFFFUL) * 10UL;
if (first5 > 99) goto DIGIT2;
w = (uint32_t)(w & 0xFFFFFFFFUL) * 10UL;
if (first5 > 9) goto DIGIT3;
w = (uint32_t)(w & 0xFFFFFFFFUL) * 10UL;
goto DIGIT4;
NEXT5: w=val;
w*=0x68DB9UL;
if (val > 9999) goto DIGIT5;
if (val > 999) goto DIGIT6;
w = (uint32_t)(w & 0xFFFFFFFFUL) * 10UL;
if (val > 99) goto DIGIT7;
w = (uint32_t)(w & 0xFFFFFFFFUL) * 10UL;
if (val > 9) goto DIGIT8;
*((uint16_t *)p)=(uint16_t)val + '\x30';
return 1;
DIGIT0: *p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT1: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT2: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT3: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT4: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
w=val;
w*=0x68DB9UL;
DIGIT5: *p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT6: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT7: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT8: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT9: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
*p = '\0';
return p-str;
}
PS No formatting!
Quote from: drizz on July 08, 2010, 06:21:10 AM
Quote from: frktons on July 07, 2010, 04:13:45 PMIt looks like a better algorithm, nearly as fast as assembly code
Here is my fastest int2str routine in assembly (Int3264ToStr.asm in attachment)
http://www.masm32.com/board/index.php?topic=9906.msg93198#msg93198
The funny thing is that my C translation produces almost identical code when compiled. :)
Though, I did carefully cater the compiler.
uint32_t uint2str(uint32_t num, uint8_t *str)
{
uint8_t *p = str;
uint32_t val = num, first5, tmp;
uint64_t w = num;
w *= 0xA7C5AC47UL;
w += 0xA7C5AC47ULL;
first5 = (w >> 48) & 0xFFFFFFFFUL;
tmp = (int32_t)first5 * 100000L;
w = first5;
val -= tmp;
w *= 0x68DB9UL;
if (first5 == 0) goto NEXT5;
if (first5 > 9999) goto DIGIT0;
if (first5 > 999) goto DIGIT1;
w = (uint32_t)(w & 0xFFFFFFFFUL) * 10UL;
if (first5 > 99) goto DIGIT2;
w = (uint32_t)(w & 0xFFFFFFFFUL) * 10UL;
if (first5 > 9) goto DIGIT3;
w = (uint32_t)(w & 0xFFFFFFFFUL) * 10UL;
goto DIGIT4;
NEXT5: w=val;
w*=0x68DB9UL;
if (val > 9999) goto DIGIT5;
if (val > 999) goto DIGIT6;
w = (uint32_t)(w & 0xFFFFFFFFUL) * 10UL;
if (val > 99) goto DIGIT7;
w = (uint32_t)(w & 0xFFFFFFFFUL) * 10UL;
if (val > 9) goto DIGIT8;
*((uint16_t *)p)=(uint16_t)val + '\x30';
return 1;
DIGIT0: *p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT1: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT2: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT3: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT4: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
w=val;
w*=0x68DB9UL;
DIGIT5: *p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT6: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT7: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT8: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
DIGIT9: w = (w & 0xFFFFFFFFUL) * 10;
*p++ = (uint8_t) (w >> 32) + 0x30;
*p = '\0';
return p-str;
}
PS No formatting!
Thanks drizz, I'll have a look :U
With random numbers the peformance is now pretty similar
among different compilers, so using the same number over and
over again gave the opportunity to the compiler to play some
nasty trick, and not doing the formatting when not necessary
that means all along the iteration except for the first time.
If some of you are interested in a faster solution to translate
into Assembly for your libraries, have a look Here (http://ridiculousfish.com/blog/archives/2010/02/15/labor-of-division-episode-1/).
See you