I read that GetFileAttributesEx is one of the fastest. My question is: is fpulib the only easy way for me to display, as a string, the size in bytes of a file that's over 4 gigs?
"file size is highSize*2^32+lowSize"
hiya Cube
i seem to recall some 64-bit integer to decimal routines about 6 months ago
is that what you mean - i can probably find them
it was JJ, drizz, lingo, and myself, as well as a few others
this is the basis of one by Drizz that was pretty snappy :bg
macro
; _mul_64x64_top64_2
; A1::A0 = (A1::A0 * B1::B0) >> 64
; Unsigned 64x64 -> 128-bit multiply that keeps only the upper 64 bits.
;   A0, A1 : low / high dword of the multiplicand. Both are overwritten with the
;            low / high dword of the result. They are used as MUL operands and as
;            MOV/ADC destinations, and eax/ecx/edx are used as scratch, so pass
;            registers other than eax, ecx, edx.
;   B0, B1 : low / high dword of the multiplier (read through "dword ptr").
; Clobbers: eax, ecx, edx, flags.
; The four partial products are named d = A0*B0, e = A0*B1, f = A1*B0 and A1*B1;
; a trailing 0/1 selects the low/high dword of a partial product.
; NOTE: instruction order is significant. The MOVs that sit between an ADD/ADC and
; the following ADC rely on MOV leaving the carry flag untouched.
_mul_64x64_top64_2 macro A0:req,A1:req, B0:req,B1:req
mov eax,dword ptr B0
mul A0 ; edx:eax = A0*B0 (d1:d0)
mov ecx,edx; d1 - ecx accumulates bits 32..63 of the product; d0 is dropped
mov eax,dword ptr B1
mul A0 ; edx:eax = A0*B1 (e1:e0)
add ecx,eax;e0 - CF = carry out of bits 32..63
mov A0,0 ; A0 is no longer needed as an input; MOV keeps CF
adc A0,edx;e1 - A0 = e1 + carry (result bits 64..95 so far)
mov eax,dword ptr B0
mul A1 ; edx:eax = A1*B0 (f1:f0)
add ecx,eax;f0 - only the carry out of this sum is used afterwards
mov eax,dword ptr B1 ; preload B1 for the final MUL; CF is preserved
adc A0,edx;f1 - A0 += f1 + carry
mov ecx,0 ; MOV (not XOR) so that CF from the ADC above survives
mov edx,A1 ; copy A1 before it is overwritten below
adc ecx,ecx ; ecx = carry out of bits 64..95 (0 or 1)
mul edx ; edx:eax = A1*B1
mov A1,ecx ; result bits 96..127 start with the saved carry
add A0,eax ; A0 += low dword of A1*B1
adc A1,edx ; A1 += high dword of A1*B1 + carry
endm
routine
;; Fragment: peel a 64-bit unsigned value in edi::esi into groups of 8 decimal
;; digits by dividing by 100000000 with a reciprocal multiply.
;; 0ABCC7711h::8461CEFDh is approximately 2^90 / 100000000, so taking the top 64
;; bits of the 128-bit product (the macro) and shifting right by a further 26
;; bits yields the quotient (64 + 26 = 90).
;; Uses eax, ecx, edx (via the macro) and ebx as scratch.
;; edi::esi == 18446744073709551615
or edi,-1
or esi,-1
mov ebx,esi ; keep the low dword of the dividend for the remainder
_mul_64x64_top64_2 esi, edi, 8461CEFDh, 0ABCC7711h; /100000000
shrd esi,edi,26 ; edi::esi >>= 26  -> quotient
shr edi,26
mov eax,100000000
mul esi ; eax = low dword of quotient * 100000000 (edx is not used)
sub ebx,eax ; remainder = dividend - quotient*100000000 (mod 2^32 is enough, remainder < 10^8)
; ebx == lowest 8 digits ("09551615")
; edi::esi == top 12 digits ("184467440737")
; NOTE: ebx is overwritten by the next instruction; store the digits above first if they are needed.
mov ebx,esi
_mul_64x64_top64_2 esi, edi, 8461CEFDh, 0ABCC7711h; /100000000
shrd esi,edi,26 ; quotient now fits in esi, so edi is not shifted
imul eax,esi,100000000
sub ebx,eax ; remainder of the second division
; ebx == second 8 digits ("67440737")
; esi == top 4 digits ("1844")
Something quick I knocked together to convert a 64-bit number to decimal ASCII in a C string. The output string needs 21 chars for maximal output including NUL.
I64toDecimalASCII PROC lo:DWORD, hi:DWORD, outstr:PTR BYTE
    ; Convert the unsigned 64-bit value hi:lo to a NUL-terminated decimal string.
    ;   lo, hi  - low / high dword of the value
    ;   outstr  - destination buffer, at least 21 bytes (20 digits + NUL)
    ; Digits are produced least-significant first, so each one is pushed on the
    ; stack (above a 0 sentinel) and then popped into outstr in reverse order.
    ; EBX is preserved; EAX, ECX, EDX and flags are modified.
    push    ebx
    mov     eax, lo
    mov     edx, hi
    mov     ecx, 10                 ; divisor: base 10
    push    0                       ; sentinel, becomes the terminating NUL

next_digit:
    ; Long division of edx:eax by 10, high dword first.
    mov     ebx, eax                ; park the low dword
    mov     eax, edx                ; eax = high dword
    xor     edx, edx
    div     ecx                     ; eax = high / 10, edx = high % 10
    xchg    eax, ebx                ; eax = low dword, ebx = new high dword
    div     ecx                     ; eax = new low dword, edx = digit (0..9)
    add     edx, 030h               ; digit -> ASCII
    push    edx
    mov     edx, ebx                ; edx:eax = quotient for the next round
    or      ebx, eax                ; quotient == 0 ?
    jnz     next_digit

    mov     ebx, outstr             ; string, at least 21 chars
copy_out:
    pop     eax                     ; most significant remaining char (or sentinel)
    mov     [ebx], al
    inc     ebx
    test    eax, eax                ; sentinel written -> string is terminated
    jnz     copy_out

    pop     ebx
    ret
I64toDecimalASCII ENDP
I'm pretty sure the edx:eax = edx:eax / ecx, ebx = edx:eax % ecx, will work for all non-zero values of ecx, but I didn't check it exhaustively; I just did the long division from memory.
thanks guys, I went with clives because its exactly what I needed, and it seems to work great :dance:(showed a 8gig file in bytes correctly :clap:)
dedndave, lingos code usually makes my head want to explode heh.
I hope hutch adds this function to the lib, perhaps in a getfilesize func, that way even large files are no problem
it looks quite similar to the one i had in that other thread
the other guys mopped the floor up with me - lol
that was a Drizz routine - which was quite fast
Lingo had one a bit faster, but it was large
if you take the macro above and expand it in the 2 places it is used, you can shrink the Drizz routine down a bit
if i recall, it was about 10 times faster than my multiple precision divide, like Clives
i can't seem to find that thread - it was last August or so
there is a difference between mine and Clive's
i divided by 100000000 and extracted 8 decimal digits per pass
it was loosely based on some old 16-bit code that Paul Dixon had written long ago :bg
Lots of XCHG instructions which are pretty slow, they can be replaced by XOR's, for example:
; Two alternative ways to swap eax and ebx (use one or the other, not both).
xchg eax,ebx
; can be replaced by:
; XOR swap: three dependent XORs exchange the registers without a temporary.
; Unlike XCHG this modifies the flags. NOTE(review): whether it is actually
; faster is CPU dependent - measure before adopting it.
xor eax,ebx
xor ebx,eax
xor eax,ebx
Not sure how much of a speed advantage it will give though....
Also the div ecx can be replaced by reciprocal multiplication though you'll have to work out the registers (replace EDX in the routine with ECX so EDX is free):
; eax = eax / 10 (unsigned, 32-bit) without DIV.
; 3435973837 = 0CCCCCCCDh = ceil(2^35 / 10); the high dword of the product
; shifted right by 3 is the quotient (32 + 3 = 35).
; Clobbers edx. Only the quotient is produced - the remainder has to be
; recomputed separately (dividend - quotient*10) if it is needed.
mov edx, 3435973837
mul edx ; edx:eax = eax * 0CCCCCCCDh
shr edx, 3 ; edx = original eax / 10
mov eax,edx
I generally use the routine by The Svin, not sure how fast it is in comparison to any others though:
; dq2ascii (GoAsm syntax) - convert an unsigned 64-bit integer to a
; NUL-terminated decimal ASCII string.
;   pqwValue : pointer to the 64-bit value (low dword first)
;   lpBuffer : destination buffer; the longest path writes up to lpBuffer+20,
;              so it must hold at least 21 bytes
; Method: FBSTP converts an integer to 18 packed BCD digits, so any part of
; the value at or above 10^18 is peeled off first by comparison/subtraction.
; The 18 BCD digits are then unpacked to ASCII and leading zeros are removed.
; Uses the x87 FPU. ebx, esi, edi are saved by the "uses" line; eax, ecx, edx
; and flags are modified.
dq2ascii FRAME pqwValue,lpBuffer
uses ebx, esi, edi
LOCAL qtemp :Q
; The Svin
mov esi,[pqwValue]
mov edi,[lpBuffer]
mov edx,[esi] ; edx = low dword
mov eax,[esi+4] ; eax = high dword
; 0DE0B6B3h::0A7640000h = 10^18. Below that, FBSTP can take the value directly.
cmp eax,0DE0B6B3h
jc >C1
jne >C2
cmp edx,0A7640000h
jc >C1
C2:
; 8AC72304h::89E80000h = 10^19. At or above that the string has 20 digits
; and the first digit is '1'.
cmp eax,8AC72304h
jc >D1
jne >D2
cmp edx,89E80000h
jc >D1
D2:
mov B[edi],'1'
sub edx,89E80000h ; value -= 10^19
lea edi,[edi+1] ; LEA keeps CF for the SBB below
sbb eax,8AC72304h
D1:
; Next digit: count how many times 10^18 can be subtracted.
; '/' is '0' - 1, so the first INC yields '0'.
mov B[edi],'/'
:
inc B[edi]
sub edx,0A7640000h
sbb eax,0DE0B6B3h
jnc <
; One subtraction too many - add 10^18 back.
add edx,0A7640000h
adc eax,0DE0B6B3h
inc edi
C1:
; eax::edx is now below 10^18. Convert it to 10 bytes of packed BCD on the stack.
mov [qtemp],edx
mov [qtemp+4],eax
sub esp,10
fild Q[qtemp]
fbstp T[esp]
; Unpack 2 x 4 BCD bytes (16 digits). Each pass pops 4 bytes, splits every byte
; into its two nibbles, adds '0' to each and stores 8 characters.
; esi toggles 0 -> 1 -> 0 so the loop body runs exactly twice.
xor esi,esi
:
pop eax ; 4 BCD bytes, least significant byte in al
bswap eax ; most significant of the 4 bytes now in al
mov ebx,eax
mov ecx,eax
mov bl,bh
shr ecx,16
mov ah,al
shr bl,4 ; high nibbles -> even character positions
shr al,4
and bh,0fh ; low nibbles -> odd character positions
and ah,0fh
shl ebx,16
and eax,0FFFFh
mov edx,ecx
mov cl,ch
mov dh,dl
shr cl,4
shr dl,4
and ch,0fh
and dh,0fh
shl ecx,16
lea eax,[eax+ebx+30303030h] ; combine halves and convert to ASCII
lea edx,[edx+ecx+30303030h]
; First pass fills offsets 10..17, second pass offsets 2..9 (edi moves back 8).
mov [edi+10],eax
mov [edi+14],edx
xor esi,1 ; sets ZF on the second pass
lea edi,[edi-8] ; LEA keeps the flags for the JNE
jne <
; Ninth BCD byte holds the two most significant digits; the tenth (sign) byte
; is discarded together with it by the ADD ESP,2.
mov ah,[esp]
add edi,16 ; undo the two 8-byte steps back
mov al,ah
add esp,2
shr al,4
mov esi,[lpBuffer]
and eax,0f0fh ; al = high digit, ah = low digit
or eax,3030h
mov [edi],ax
cmp edi,esi ; did the >= 10^18 path already write leading digits?
mov B[edi+18],0 ; terminate the string (MOV keeps the flags)
jne >P1 ; yes: leading digit is non-zero, nothing to strip
; Strip leading zeros: scan for the first character that is not '0'.
mov ecx,-20
add edi,19 ; edi+ecx addresses the buffer with a negative index
:
inc ecx
cmp B[edi+ecx],30h
je <
mov eax,ecx
; SF from the CMP is set only when the scan hit the terminating NUL
; (0 - 30h is negative), i.e. every digit was '0'.
js >Z0
neg eax ; eax = number of bytes to move, including the NUL
add esi,eax
; Move the remaining digits and the NUL to the start of the buffer.
:
mov al,[edi+ecx]
mov [esi+ecx],al
inc ecx
jne <
P1:
ret
Z0:
; Value was zero: keep the single '0' at the start and terminate after it.
mov B[esi+1],0
jmp P1
endf
Edgar
here we go - found it
.DATA
AscBuf DB '01234567890123456789',0 ;20 ASCII digits + terminating NUL
.CODE
Asc64 PROC
;Convert 64-bit unsigned integer to ASCII decimal string
;
;Call With: EDX:EAX= QWORD value to convert
;
;  Returns: EDI= address within AscBuf of the first digit; the string is
;                terminated by the NUL that follows the 20 digit positions
;
;Preserves: EBX, ESI (callee-saved under the Win32 conventions)
; Clobbers: EAX, ECX, EDX, flags
;
;The value is divided by 100 repeatedly; each remainder gives two digits,
;which are written right-to-left into AscBuf. The result lives in the
;static AscBuf, so it is overwritten by the next call (not reentrant).
;The direction flag is not touched.
        push    ebx
        push    esi
        mov     edi,offset AscBuf+18    ;slot for the two lowest digits
        mov     esi,eax                 ;ESI = low dword
        mov     ecx,edx                 ;ECX = high dword
        mov     ebx,100                 ;two decimal digits per pass

Asc64a: xor     edx,edx
        mov     eax,ecx
        div     ebx                     ;EAX = high / 100, EDX = high % 100
        mov     ecx,eax
        mov     eax,esi
        div     ebx                     ;EAX = low quotient, EDX = value % 100
        mov     esi,eax
        mov     eax,edx                 ;AL = 0..99
        aam                             ;AH = tens, AL = ones
        xchg    al,ah                   ;AL = tens (stored first), AH = ones
        or      ax,3030h                ;both digits to ASCII
        mov     [edi],ax
        sub     edi,2                   ;next pair goes two bytes to the left
        mov     eax,ecx
        or      eax,esi                 ;quotient == 0 ?
        jnz     Asc64a

        add     edi,2                   ;back to the last pair written
        cmp     byte ptr [edi],30h      ;leading 0 ?
        jne     Asc64b                  ;no - done
        inc     edi                     ;yes - suppress it
Asc64b: pop     esi
        pop     ebx
        ret
Asc64 ENDP
as you can see, i used XCHG also - lol
but DIV is what makes this code slow
i could also speed it up by getting rid of STOSW and STD/CLD
EDIT - oh - this is an early version - i was dividing by 100
there is a faster version someplace that divides by 100,000,000
I make no representation that my routine is quick, it's a five minute interview question to see if someone can code on a whiteboard without notes.
i'd hire ya, Clive :P
those other guys were making me look bad
it was about that time that the ling long kai fang idea popped into my head and i quit playing with 64-bit
seeing as they had me beat for smaller integers - i went and played ball in another court - lol
sounds like we might need to do a new speedtest setup, to test for correctness as well. Perhaps I'm being nitpicky, but passing function params in registers seems a bit hackish; using params is nice for other languages like C/C++ :D
well - i was a complete n00b to 32-bit code, then :bg
in the old 16-bit days, passing parms in register was quite common
i don't want anyone to stick that particular routine in a timer - lol - it is sad, i am sure
as for testing for correctness, you can't do a brute force test - it would take forever to test all 18,446,744,073,709,551,616 values
Quote from: clive on April 30, 2010, 03:32:17 AM
I make no representation that my routine is quick, it's a five minute interview question to see if someone can code on a whiteboard without notes.
Hi Clive,
No defamation of your code was intended just spotted a couple of things that I ran into this week while trying to write an instruction size decoder for GoP (not that I'm getting very far, lots of IA32 reference study and not a lot of coding).
Edgar
On quick testing of the XOR optimization, it actually seems to have made it slower. Also, donkey, how do I represent T[] in MASM? You're using it with fbstp T[esp]
Hi E^Cube
Maybe TBYTE PTR ?
thanks i'll have speed test up in a moment
Quote from: E^cube on April 30, 2010, 04:06:13 AM
On quick testing of the XOR optimization, it actually seems to have made it slower...
You're right, I wonder where I tested it that I found it faster ?
AMD Athlon(tm) 64 Processor 3000+
1630 cycles for I64toDecimalASCII 18067432769859648
1636 cycles for I64toDecimalASCII Modified 18067432769859648
988 cycles for Asc64 18067432769859648
998 cycles for Asc64 Modified 18067432769859648
341 cycles for dq2ascii Modified 18067432769859648
Press any key to continue ...
didn't mean to say modified for your alg donkey, oh well
If you don't want such a long number, have a look at http://www.masm32.com/board/index.php?topic=9585.0
With 64-bit windows you can use wsprintf with "%I64" as well.
It seems Str$() is not so competitive :(
; Timing harness fragment: the statement between counter_begin and counter_end
; is the code being measured. NOTE(review): counter_begin/counter_end, Str$,
; LoopCount and My64a are defined outside this snippet - presumably the MASM32
; timing macros and MasmBasic; confirm against the full test source.
counter_begin LoopCount, HIGH_PRIORITY_CLASS
mov edi, Str$(q:My64a) ; convert the qword My64a to a string
counter_end
Intel(R) Pentium(R) 4 CPU 3.40GHz
2917 cycles for I64toDecimalASCII 18067432769859648
2764 cycles for I64toDecimalASCII Modified 18067432769859648
1973 cycles for Asc64 18067432769859648
1991 cycles for Asc64 Modified 18067432769859648
765 cycles for dq2ascii Modified 18067432769859648
1015 cycles for MasmBasic Str$ 18067432769859648
Quote from: donkey
You're right, I wonder where I tested it that I found it faster ?
XOR reg,reg 386:2, 486:1
vs
XCHG reg,reg 386:3, 486:3
It would be heaps faster on memory, but you'd break the atomicity(sp)
i was curious how my ling long kai fang BigNum routine would fare...
results on a prescott
526 clock cycles
528 clock cycles
526 clock cycles
525 clock cycles
526 clock cycles
i think Drizz's routine is under 200 clock cycles and Lingo's modified Dixon SSE/LUT routine is under 100
but, the LLKF9 routines may be used for larger integers - signed and/or unsigned
if you had other requirements elsewhere in the program, it could take care of all of em :P
maybe i will write a special 64-bit LLKF routine someday, just to see how it compares
Wasn't QWord to ASCII done a while ago?
Check out reply #25 in this thread: http://www.masm32.com/board/index.php?topic=3051.msg24570#msg24570
The attachment there does qword to ascii quite quickly.
Paul.
yes Paul - i think that is the one of yours that Lingo later optimized - which is the fastest as far as i know
i don't remember ever seeing timings for your original version, though
64-bit integer to decimal is a fun algo to work on
we will probably see it pop up again and again :P
I use a modified version of drizz' algo (http://www.masm32.com/board/index.php?topic=9857.msg72422#msg72422) in MasmBasic. It is pretty fast, too, although it obviously suffers a bit from the overhead of an all-purpose Str$:
Intel(R) Celeron(R) M CPU 420 @ 1.60GHz
854 cycles for I64toDecimalASCII 18067432769859648
1121 cycles for I64toDecimalASCII2 mod 18067432769859648
562 cycles for Asc64 18067432769859648
688 cycles for Asc642 mod 18067432769859648
294 cycles for dq2ascii Modified 18067432769859648
484 cycles for MasmBasic Str$ 18067432769859648
54 bytes for I64toDecimalASCII
69 bytes for I64toDecimalASCII2
52 bytes for Asc64
82 bytes for Asc642
284 bytes for dq2ascii
1540 bytes for Str$
Quote... although it obviously suffers a bit from the overhead of an all-purpose Str$
what you need is a nice ling long kai fang BigNum routine (signed-unsigned mode selectable)
i know where there is one already written :bg
dedndave,
Quote: i think that is the one of yours that Lingo later optimized - which is the fastest as far as i know
Someone made it faster? I might have to take another look!
Quote: i don't remember ever seeing timings for your original version, though
It originally ran on an Athlon XP in 100 clks for a signed 19-digit quad, faster for fewer digits or unsigned.
The same code now runs on a Phenom II in about 60clks, again faster for less digits.
Paul.
yah - someplace Lingo had optimized it :P
as a side note, i read a post of yours in some other forum from long ago that had a 16-bit multiple-precision divide routine
my first one (posted earlier in this thread) was inspired by that one :bg