News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

qwtoa using xmm registers?

Started by jj2007, September 08, 2008, 08:09:53 AM

Previous topic - Next topic

jj2007

Dear all,
I have had a look at the dwtoa code, which is very simple and compact and reasonably fast. However, it can't deal with 64-bit integers. I wonder whether it would be possible to convert it to SSE, using xmm registers only? I post a test bed below. Any ideas?

P.S.: I am aware of the 2005 qwtoa thread.

include \masm32\include\masm32rt.inc
; .686
; include \masm32\macros\timers.asm
.xmm

; Test bed: converts the unsigned 32-bit value in eax to a decimal ASCII
; string using the reciprocal-multiply core loop of dwtoa, then shows the
; result in a message box.

.data?
MyInt64   dq ?
NumBuffer dd 20 dup(?)                  ; 80-byte output buffer

.data
MyReal10 Real10 12345678901234567890.0
lpBuffer dd NumBuffer

.code
start:

; following 4 lines needed for new xmm loop
; NOTE(review): FISTP stores a *signed* 64-bit integer. The constant above
; is larger than 2^63-1, so if the FPU invalid-operation exception is masked
; the value written to MyInt64 is the "integer indefinite" pattern
; 8000000000000000h, not 12345678901234567890. MyInt64 is not read anywhere
; in this test bed yet.
  mov edx, offset MyReal10
  fld REAL10 ptr [edx]                  ; load the 80-bit real from memory
  mov edx, offset MyInt64
  fistp qword ptr [edx]                 ; store as 64-bit integer and pop

  mov eax, 1234567890                   ; test value we want to convert to ASCII
  mov edi, lpBuffer                     ; edi -> next free byte of output buffer

; This is the core loop of dwtoa:
; -------------- can this loop be translated to SSE2, using only xmm registers?? --------------
; Each pass splits eax into quotient and remainder of a division by 10 and
; emits the remainder as one ASCII digit (least significant digit first).
; The loop body runs at least once, so an input of 0 yields "0" rather than
; an empty string.
  mov ecx, 3435973837                   ; 0CCCCCCCDh = ceil(2^35 / 10), magic multiplier
  .Repeat
    mov ebx, eax                        ; ebx = value before division
    mul ecx                             ; edx:eax = value * magic
    shr edx, 3                          ; edx = (value * magic) >> 35 = value / 10
    mov eax, edx                        ; quotient becomes the next value
    lea edx, [edx*4+edx]                ; edx = quotient * 5
    add edx, edx                        ; edx = quotient * 10
    sub ebx, edx                        ; ebx = value mod 10
    add bl, '0'                         ; convert to ASCII digit
    mov [edi], bl
    add edi, 1
  .Until eax == 0
; -------------------------------------------------------------------------------------------------------------------------------------

  mov byte ptr [edi], 0                 ; terminate the string

; We now have all the digits, but in reverse order:
; swap bytes from both ends until the two pointers meet.
  mov esi, lpBuffer                     ; esi -> first digit, edi -> terminator
  .while esi < edi
    sub edi, 1
    mov al, [esi]
    mov ah, [edi]
    mov [edi], al
    mov [esi], ah
    add esi, 1
  .endw

  invoke MessageBox, 0, lpBuffer, chr$("Our number:"), MB_OK

  invoke ExitProcess, 0                 ; do not run off the end of the code section

end start

qWord

hi,

I've written an SSE2 implementation of dw2a (just for fun  :dance:) and I think it could be adapted to convert 64-bit values. Hope this gives you a hint.

regards, qWord


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Convert unsigned DWORD to ASC-string by using      ;
;  SSE2 extension                                     ;
;                                                     ;
; IN:  eax = unsigned dword-value                     ;
;      edx = pointer to buffer, must be aligned to 16 ;
;            and at least 16 bytes long (the non-zero ;
;            path stores a full 16-byte vector)       ;
;                                                     ;
; OUT: eax = pointer to string in buffer: address of  ;
;            the first significant digit of a         ;
;            zero-terminated decimal string           ;
;                                                     ;
; Modifies: eax, flags, xmm0-xmm7 (non-zero path)     ;
;           edx is only read                          ;
;                                                     ;
;idea and implementation                              ;
;by qWord, September 2008                             ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
d2a proc
   ;eax == dwNum
   ;edx == lpBuffer

    .data
        align 16
        ; Reciprocal multipliers: ceil(2^45/10000) and ceil(2^58/100000000).
        ; A 32x32->64 multiply followed by a right shift of 13+32 resp.
        ; 26+32 bits gives the quotient by 10000 resp. 100000000.
        d2a_msk_0   QWORD 3518437209, 2882303762 ;magic-number
        d2a_msk_4   QWORD 10000     , 100000000
        ; Per-lane scale factors 1e-4, 1e-3, 1e-2, 1e-1, each enlarged by a
        ; factor of 1.000001 (presumably so that single-precision rounding
        ; never leaves a product just below a digit boundary).
        d2a_msk_1   REAL4 1.000001E-4, 1.000001E-3, 1.000001E-2, 1.000001E-1
        d2a_msk_2   REAL4 4 dup(10.0000000)
        d2a_msk_3   BYTE 16 dup(030h)                ; sixteen ASCII '0'
    .code

    test eax,eax
    .if !ZERO?

        ;--- split the value x into three groups of decimal digits -------
        ;    e.g. 1234567890 ->  12 | 3456 | 7890
        movdqa xmm4,OWORD ptr d2a_msk_0              ; xmm4 = magic multipliers
        movdqa xmm2,OWORD ptr d2a_msk_4              ; xmm2 = 10000 | 100000000

        movd xmm0,eax                                ; dword0 = x, rest 0
        pshufd xmm0,xmm0,011001100y                  ; xmm0 = x | x   (as qwords)
        pshufd xmm3,xmm0,011001100y                  ; xmm3 = x | x   (as qwords)
        pmuludq xmm0,xmm4                            ; x*magic(10^4) | x*magic(10^8)
        movdqa xmm1,xmm0
        psrlq xmm0,13+32                             ; low  qword = x / 10000
        psrlq xmm1,26+32                             ; high qword = x / 100000000

        pshufd xmm0,xmm0,01001110y                   ; swap qwords: x/10000 now high
        punpckhqdq xmm0,xmm1                         ; take the high qword of each
        ;xmm0 = x/10000 | x/100000000

        pmuludq xmm2,xmm0                            ; quotient * divisor
        psubd xmm3,xmm2                              ; x - quotient*divisor
        ;xmm3 = x mod 10000 | x mod 100000000

        pshufd xmm2,xmm3,01001110y                   ; swap qwords
        pmuludq xmm2,xmm4
        psrlq xmm2,13+32                             ; low qword = (x mod 10^8) / 10000

        ; broadcast each digit group to all four dword lanes
        pshufd xmm0,xmm0,010101010y                  ; xmm0 = x / 10^8          (top digits)
        pshufd xmm1,xmm2,000000000y                  ; xmm1 = (x mod 10^8)/10^4 (middle four)
        pshufd xmm2,xmm3,000000000y                  ; xmm2 = x mod 10^4        (low four)

        ;--- extract four decimal digits from every group ----------------
        ; For a group value v the lanes compute v*1e-4, v*1e-3, v*1e-2 and
        ; v*1e-1; the first fractional digit of each product is the
        ; thousands, hundreds, tens and units digit of v, respectively.
        ;   e.g. v = 2345:  0.2345   2.345   23.45   234.5
        ;        fraction:  0.2345   0.345    0.45     0.5
        ;        *10, trunc:  2        3        4        5
        cvtdq2ps xmm0,xmm0
        cvtdq2ps xmm1,xmm1
        cvtdq2ps xmm2,xmm2

        movdqa xmm3,OWORD ptr d2a_msk_1              ; scale factors
        movdqa xmm6,OWORD ptr d2a_msk_2              ; 10.0 in every lane
        movdqa xmm7,OWORD ptr d2a_msk_3              ; '0' in every byte

        movdqa xmm4,xmm3
        movdqa xmm5,xmm3

        mulps xmm3,xmm2                              ; scaled low group
        mulps xmm4,xmm1                              ; scaled middle group
        mulps xmm5,xmm0                              ; scaled top group

        cvttps2dq xmm2,xmm3                          ; truncate to integer part ...
        cvttps2dq xmm1,xmm4
        cvttps2dq xmm0,xmm5

        cvtdq2ps xmm2,xmm2                           ; ... back to float ...
        cvtdq2ps xmm1,xmm1
        cvtdq2ps xmm0,xmm0

        subps xmm3,xmm2                              ; ... and keep only the fraction
        subps xmm4,xmm1
        subps xmm5,xmm0

        mulps xmm3,xmm6                              ; fraction * 10.0
        mulps xmm4,xmm6
        mulps xmm5,xmm6

        cvttps2dq xmm3,xmm3                          ; truncate -> one digit per lane
        cvttps2dq xmm4,xmm4
        cvttps2dq xmm5,xmm5

        ;--- pack the digits into bytes and convert to ASCII -------------
        PACKSSDW xmm5,xmm4                           ; words: top(4), middle(4)
        PACKSSDW xmm3,xmm3                           ; words: low(4), low(4)
        PACKSSWB xmm5,xmm3                           ; bytes: top, middle, low, low

        paddb xmm5,xmm7                              ; add '0' to every byte

        ;--- terminate, store, and skip leading zeros --------------------
        pcmpeqb xmm7,xmm5                            ; 0FFh where a byte is '0'
        xor eax,eax
        pinsrw xmm5,eax,6                            ; bytes 12,13 = 0: string terminator
        pmovmskb eax,xmm7                            ; bit n set <=> byte n was '0'
        movdqa OWORD ptr [edx],xmm5                  ; store all 16 bytes
        not eax                                      ; bit n set <=> byte n was not '0'
        bsf eax,eax                                  ; index of first significant digit
        ; The input is non-zero, so at least one of the 12 digit bytes is
        ; not '0'; the index found is therefore in the range 0..11.
        add eax,edx                                  ; return its address
        ret
        align 16
    .else
        ; value is zero: the string is just "0"
        mov eax,030h
        mov DWORD ptr [edx],eax                      ; '0' followed by three zero bytes
        mov eax,edx
        ret
    .endif

d2a endp
FPU in a trice: SmplMath
It's that simple!

hutch--

The trick is instead of converting an existing procedure that is "known", simply write a new one that has the extra capacity.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

jj2007

Quote from: qWord on September 08, 2008, 12:35:47 PM
I've written an SSE2-implementation of dw2a (just for fun  :dance:) and I think it could be addapt to convert 64bit values. Hope this give you a hint.

Thanks, qWord. I hope I'll understand it :dazzled:
Did you time it against good ol' dwtoa?

qWord

#4
Quote from: jj2007 on September 08, 2008, 12:47:24 PM
Did you time it against good ol' dwtoa?

yes, on my core2duo it takes ca. 48 clocks.
it's slower than dwtoa(masmlib) on values < 10000,but faster on greater values
FPU in a trice: SmplMath
It's that simple!

BlackVortex

Quote from: qWord
it's slower than dwtoa(masmlib) on values > 10000, but faster on greater values
that didn't really make sense  :-D

hutch--

He probably inverted to > instead of < .
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

BlackVortex

Quote from: hutch-- on September 08, 2008, 02:33:37 PM
He probably inverted to > instead of < .
Just teasing him   :cheekygreen:

Mark_Larson

#8
This runs in under 1 cycle on my Core 2 Duo.  It uses a 64k lookup table to do two 16-bit values at the same time.  You could use an 8-bit lookup table instead, which needs less memory, and it would PROBABLY still run in under 1 cycle, or close to it.  The lookup table is made of all dword values.  It returns a 16-bit value, which is equivalent to the ASCII value for that word.  So offset 0 in the table would have '00'.  Make sense?


; Table-driven conversion of one 32-bit value, handled as two 16-bit halves.
; Each half indexes lookup_table (4-byte entries); the low 16 bits of the
; fetched entry are written to the output buffer at dst.
; lookup_table and dst are defined outside this fragment.
; NOTE(review): the `word [edi]` operand form is NASM-style; MASM normally
; spells a sized memory operand `word ptr [edi]` - confirm which assembler
; this fragment was written for.
        mov edi,offset dst                  ; edi -> output buffer
        mov eax,10000                       ; value to convert

        mov ebx,eax
        shr ebx,16                          ; ebx = upper 16 bits of the value
        movzx eax,ax                        ; eax = lower 16 bits of the value

        mov edx,[lookup_table+ebx*4]        ; table entry for the upper half
        mov ecx,[lookup_table+eax*4]        ; table entry for the lower half

mov word [edi+2],dx
mov word [edi],cx


EDIT: my bad, that's actually my HEX to ascii routine.  I cut and pasted the wrong one.
BIOS programmers do it fastest, hehe.  ;)

My Optimization webpage
http://www.website.masmforum.com/mark/index.htm

drizz

here are my 63bit (signed) and 64bit (unsigned) conversion routines ( to and from ascii ).
edit: no mmx sse!

[attachment deleted by admin]
The truth cannot be learned ... it can only be recognized.

drizz

Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0

Edit: I'm referring to the twice edited & deleted post  ::)
The truth cannot be learned ... it can only be recognized.

Mark_Larson

Quote from: drizz on September 08, 2008, 09:22:14 PM
Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0

Edit: i'm reffering to the twice edited & deleted post  ::)

thanks!

BIOS programmers do it fastest, hehe.  ;)

My Optimization webpage
http://www.website.masmforum.com/mark/index.htm

jj2007

Quote from: drizz on September 08, 2008, 06:36:28 PM
here are my 63bit (signed) and 64bit (unsigned) conversion routines ( to and from ascii ).
edit: no mmx sse!

Looks promising, thanks. It seems that SSEx is not an option, despite the simplicity of the algo.

Mark_Larson

#13
Quote from: drizz on September 08, 2008, 09:22:14 PM
Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0

Edit: i'm reffering to the twice edited & deleted post  ::)

what kind of processor do you have?

EDIT:  I am guessing an athlon.

EDIT2: my timings for your unsigned 64-bit to ASCII code are 158.685983 cycles for a FULL 64-bit number, so none of the loops are skipped.  I have a Core 2 Duo.
BIOS programmers do it fastest, hehe.  ;)

My Optimization webpage
http://www.website.masmforum.com/mark/index.htm

Mark_Larson

I ran qWord's code.  It runs in 48.504735 cycles on my Core 2 Duo.  But it's only doing a dword.
BIOS programmers do it fastest, hehe.  ;)

My Optimization webpage
http://www.website.masmforum.com/mark/index.htm