qwtoa using xmm registers?

jj2007 · September 08, 2008, 08:09:53 AM

Dear all,
I have had a look at the dwtoa code, which is very simple and compact and reasonably fast. However, it can't deal with 64-bit integers. I wonder whether it would be possible to convert it to SSE, using xmm registers only? I post a test bed below. Any ideas?

P.S.: I am aware of the 2005 qwtoa thread.

Code Select

include \masm32\include\masm32rt.inc
; .686
; include \masm32\macros\timers.asm
.xmm

; Test bed: converts a 32-bit unsigned integer to a decimal ASCII string with
; the dwtoa core loop (reciprocal multiplication instead of DIV), reverses the
; digits in place and shows the result in a message box.

.data?
MyInt64	dq	?			; receives the integer stored by fistp below
NumBuffer		dd 20 dup(?)	; 80-byte output buffer for the digit string

.data
MyReal10	Real10	12345678901234567890.0
lpBuffer	dd	NumBuffer	; pointer to the output buffer

.code
start:

; following 4 lines needed for new xmm loop
; NOTE(review): 12345678901234567890 is larger than the biggest signed 64-bit
; integer (9223372036854775807). fistp converts to a SIGNED integer, so with
; the invalid-operation exception masked it stores the "integer indefinite"
; value 8000000000000000h instead of the number - confirm the test value.
  mov edx, offset MyReal10
  fld REAL10 ptr [edx]		; load from mem, 6 cycles
  mov edx, offset MyInt64
  fistp qword ptr [edx]		; store as 64-bit integer in MyInt64 (see note above)

  mov eax, 1234567890	; test value we want to convert to ASCII
  mov edi, lpBuffer	; pointer to output buffer

; This is the core loop of dwtoa: 
; -------------- can this loop be translated to SSE2, using only xmm registers?? --------------
; Each pass peels off the lowest decimal digit:
;   quotient  = (eax * 0CCCCCCCDh) >> 35   (= eax / 10)
;   remainder = eax - quotient * 10
; Digits are therefore written least significant first.
; Note: for eax = 0 the loop body never runs and the string stays empty.
  mov ecx, 3435973837	; 0CCCCCCCDh, magic multiplier
  .While eax > 0
	mov ebx, eax		; keep the current value for the remainder
	mul ecx			; edx:eax = value * magic
	shr edx, 3		; edx = value / 10
	mov eax, edx		; quotient becomes the next value
	lea edx, [edx*4+edx]	; edx = quotient * 5
	add edx, edx		; edx = quotient * 10
	sub ebx, edx		; ebx = value mod 10
	add bl, '0'		; digit -> ASCII
	mov [edi], bl
	add edi, 1
  .Endw
; -------------------------------------------------------------------------------------------------------------------------------------

  mov byte ptr [edi], 0       ; terminate the string
; We now have all the digits, but in reverse order
; Swap bytes from both ends, moving towards the middle
    mov esi, lpBuffer	; pointer to output buffer
    .while esi < edi
      sub edi, 1
      mov al, [esi]
      mov ah, [edi]
      mov [edi], al
      mov [esi], ah
      add esi, 1
    .endw

  invoke MessageBox, 0, lpBuffer, chr$("Our number:"), MB_OK

; Terminate explicitly: without this, execution runs past the end of the code
; after MessageBox returns.
  invoke ExitProcess, 0

end start

qWord · September 08, 2008, 12:35:47 PM

hi,

I've written an SSE2 implementation of dw2a (just for fun :dance:) and I think it could be adapted to convert 64-bit values. Hope this gives you a hint.

regard, qWord

Code Select


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Convert unsigned DWORD to ASCII string by using    ;
;  the SSE2 extension                                 ;
;                                                     ;
; IN:  eax = unsigned dword value                     ;
;      edx = pointer to buffer, must be aligned to 16 ;
;            (a full 16 bytes are stored with movdqa; ;
;             only 4 bytes are written when eax = 0)  ;
;                                                     ;
; OUT: eax = pointer to the first significant digit   ;
;            of the zero-terminated string in buffer  ;
;                                                     ;
; Modifies: eax, flags, xmm0-xmm7 (edx is unchanged)  ;
;                                                     ;
;idea and implementation                              ;
;by qWord, September 2008                             ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
d2a proc 
   ;eax == dwNum
   ;edx == lpBuffer

    .data
        align 16          ; constants are loaded with movdqa, so keep them 16-byte aligned
        ; reciprocal multipliers: ceil(2^45/10000) (used with shr 13+32)
        ;                     and ceil(2^58/100000000) (used with shr 26+32)
        d2a_msk_0   QWORD 3518437209, 2882303762 ;magic numbers
        ; the divisors themselves, used to compute the remainders
        d2a_msk_4   QWORD 10000     , 100000000
        ; per-lane scale factors ~ 1/10000, 1/1000, 1/100, 1/10
        ; NOTE(review): the extra 0.000001 presumably biases the product upwards so that
        ; truncation does not land just below an integer - confirm
        d2a_msk_1   REAL4 1.000001E-4, 1.000001E-3, 1.000001E-2, 1.000001E-1
        d2a_msk_2   REAL4 4 dup(10.0000000)
        ; sixteen ASCII '0' characters: added to the digits and used to find leading zeros
        d2a_msk_3   BYTE 16 dup(030h)
    .code

    test eax,eax
    .if !ZERO?

        ; --- step 1: split the value into three decimal groups using integer SSE2 ---
        movdqa xmm4,OWORD ptr d2a_msk_0              ;split dword into 3 parts
        movdqa xmm2,OWORD ptr d2a_msk_4              ; e.g.:  1234567890
                                                     ;    ________|________
        movd xmm0,eax                                ;   |        |        |
        pshufd xmm0,xmm0,011001100y                  ;  xmm0     xmm1    xmm2   (after the broadcasts below)
        pshufd xmm3,xmm0,011001100y                  ;
        pmuludq xmm0,xmm4                            ;   12      3456    7890
        movdqa xmm1,xmm0                             ;
        psrlq xmm0,13+32                             ; low qword  = x / 10000
        psrlq xmm1,26+32                             ; high qword = x / 100000000
                                                     ;
        pshufd xmm0,xmm0,01001110y                   ; swap qwords
        punpckhqdq xmm0,xmm1                         ; combine both quotients
        ;xmm0 = x/10000 | x/100000000   (low qword | high qword)
        ;xmm3 = x       | x                          ;
                                                     ;
                                                     ;
        pmuludq xmm2,xmm0                            ; quotient * divisor
        psubd xmm3,xmm2                              ; value - quotient * divisor
        ; xmm3 = x mod 10000 | x mod 100000000       ;
                                                     ;
        pshufd xmm2,xmm3,01001110y                   ; bring (x mod 100000000) to the low qword
        pmuludq xmm2,xmm4                            ;
        psrlq xmm2,13+32                             ; (x mod 100000000) / 10000 = middle group
                                                     ;
        pshufd xmm0,xmm0,010101010y                  ; xmm0 = top group    in all 4 lanes
        pshufd xmm1,xmm2,000000000y                  ; xmm1 = middle group in all 4 lanes
        pshufd xmm2,xmm3,000000000y                  ; xmm2 = low group    in all 4 lanes

        ; --- step 2: extract four decimal digits per group with packed floats ---
        cvtdq2ps xmm0,xmm0                    ;;  extract the digits
        cvtdq2ps xmm1,xmm1                    ;;
        cvtdq2ps xmm2,xmm2                    ;;  xmm-reg:
                                              ;;
        movdqa xmm3,OWORD ptr d2a_msk_1       ;;   2345.0     2345.0     2345.0     2345.0
        movdqa xmm6,OWORD ptr d2a_msk_2       ;;     |___________|_________|___________|
        movdqa xmm7,OWORD ptr d2a_msk_3       ;;                      |
                                              ;;                   div. by
        movdqa xmm4,xmm3                      ;;                      |
        movdqa xmm5,xmm3                      ;;     |-----------|----|----|-----------|
                                              ;;   10000.0    1000.0     100.0       10.0
        mulps xmm3,xmm2                       ;;
        mulps xmm4,xmm1                       ;;                      |
        mulps xmm5,xmm0                       ;;                      |
                                              ;;                      v
        cvttps2dq xmm2,xmm3                   ;;
        cvttps2dq xmm1,xmm4                   ;;   0.2345     2.345      23.45       234.5
        cvttps2dq xmm0,xmm5                   ;;                      |
                                              ;;                      |
        cvtdq2ps xmm2,xmm2                    ;;             truncate each value
        cvtdq2ps xmm1,xmm1                    ;;    and subtract from non-truncated values
        cvtdq2ps xmm0,xmm0                    ;;                      |
                                              ;;                      V
        subps xmm3,xmm2                       ;;   0.2345     0.345       0.45       0.5
        subps xmm4,xmm1                       ;;                      |
        subps xmm5,xmm0                       ;;                      |
                                              ;;                 mul. by 10.0
        mulps xmm3,xmm6                       ;;                and truncate
        mulps xmm4,xmm6                       ;;                      |
        mulps xmm5,xmm6                       ;;                      V
                                              ;;    2         3           4          5
        cvttps2dq xmm3,xmm3                   ;;
        cvttps2dq xmm4,xmm4                   ;;
        cvttps2dq xmm5,xmm5                   ;;
                                              ;;
        ; --- step 3: pack the 12 digits into bytes (top, middle, low group) ---
        PACKSSDW xmm5,xmm4                    ;;  words: top group, middle group
        PACKSSDW xmm3,xmm3                    ;;  words: low group (duplicated)
        PACKSSWB xmm5,xmm3                    ;;  bytes 0-11 = digits, bytes 12-15 = copy of low group
                                              ;;
        paddb xmm5,xmm7                       ;;  digit -> ASCII

        ; --- step 4: terminate, store, and locate the first non-'0' character ---
        pcmpeqb xmm7,xmm5                     ;;;  skip leading zeros: mark bytes equal to '0'
        xor eax,eax                           ;;;  and return addr. in eax
        pinsrw xmm5,eax,6                     ;;;  bytes 12-13 = 0 -> string terminator
        pmovmskb eax,xmm7                     ;;;  bit n set <=> byte n is '0'
        movdqa OWORD ptr [edx],xmm5           ;;;  store all 16 bytes (needs aligned buffer)
        not eax                               ;;;  bit n set <=> byte n is not '0'
        bsf eax,eax                           ;;;  index of first significant digit
                                              ;;;
        ; NOTE(review): after "not" bits 16-31 are all set, so bsf returns at most 16
        ; and bit 5 (32) can never be set - this branch looks unreachable; confirm intent
        test eax,32                           ;;;
        .if !ZERO?                            ;;;
            dec eax                           ;;;
        .endif                                ;;;
        add eax,edx                           ;;;  eax = address of first significant digit
        ret                                   ;;;
        align 16                              ;;;
    .else
        ; value is zero: write "0" followed by three zero bytes and return the buffer start
        mov eax,030h
        mov DWORD ptr [edx],eax
        mov eax,edx
        ret
    .endif

d2a endp

hutch-- · September 08, 2008, 12:43:18 PM

The trick is instead of converting an existing procedure that is "known", simply write a new one that has the extra capacity.

jj2007 · September 08, 2008, 12:47:24 PM

Quote from: qWord on September 08, 2008, 12:35:47 PM
I've written an SSE2-implementation of dw2a (just for fun :dance:) and I think it could be addapt to convert 64bit values. Hope this give you a hint.

Thanks, qWord. I hope I'll understand it :dazzled:
Did you time it against good ol' dwtoa?

qWord · September 08, 2008, 01:00:38 PM

Quote from: jj2007 on September 08, 2008, 12:47:24 PM
Did you time it against good ol' dwtoa?

yes, on my core2duo it takes ca. 48 clocks.
it's slower than dwtoa(masmlib) on values < 10000,but faster on greater values

BlackVortex · September 08, 2008, 02:31:45 PM

Quote from: qWord
it's slower than dwtoa(masmlib) on values > 10000, but faster on greater values

that didn't really make sense :-D

hutch-- · September 08, 2008, 02:33:37 PM

He probably inverted to > instead of < .

BlackVortex · September 08, 2008, 02:41:41 PM

Quote from: hutch-- on September 08, 2008, 02:33:37 PM
He probably inverted to > instead of < .

Just teasing him :cheekygreen:

Mark_Larson · September 08, 2008, 03:52:29 PM

This runs in under 1 cycle on my Core 2 Duo. It uses a 64k lookup table to do two 16-bit values at the same time. You could use an 8-bit lookup table, use less memory, and still PROBABLY have it run in under 1 cycle, or close to it. The lookup table is made of all dword values. It returns a 16-bit value, which is equivalent to the ASCII value for that word, so offset 0 in the table would have '00'. Make sense?

Code Select


	; edi -> output buffer, eax = value to convert (demo constant)
	mov	edi,offset dst
	mov	eax,10000

	; split the value into its two 16-bit halves:
	; ebx = high word, eax = low word
	mov	ebx,eax
	shr	ebx,16
	movzx	eax,ax

	; one dword table entry per half, indexed by the 16-bit value
	mov	edx,[lookup_table+ebx*4]
	mov	ecx,[lookup_table+eax*4]

	; only the low 16 bits of each entry are stored:
	; low half's entry at [edi], high half's entry at [edi+2]
	mov	word [edi+2],dx
	mov	word [edi],cx

EDIT: my bad, that's actually my HEX to ascii routine. I cut and pasted the wrong one.

drizz · September 08, 2008, 06:36:28 PM

here are my 63bit (signed) and 64bit (unsigned) conversion routines ( to and from ascii ).
edit: no mmx sse!

[attachment deleted by admin]

drizz · September 08, 2008, 09:22:14 PM

Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0

Edit: I'm referring to the twice edited & deleted post ::)

Mark_Larson · September 08, 2008, 09:48:06 PM

Quote from: drizz on September 08, 2008, 09:22:14 PM
Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0

Edit: i'm reffering to the twice edited & deleted post ::)

thanks!

jj2007 · September 09, 2008, 09:16:49 AM

Quote from: drizz on September 08, 2008, 06:36:28 PM
here are my 63bit (signed) and 64bit (unsigned) conversion routines ( to and from ascii ).
edit: no mmx sse!

Looks promising, thanks. It seems that SSEx is not an option, despite the simplicity of the algo.

Mark_Larson · September 09, 2008, 01:03:40 PM

Quote from: drizz on September 08, 2008, 09:22:14 PM
Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0

Edit: i'm reffering to the twice edited & deleted post ::)

what kind of processor do you have?

EDIT: I am guessing an athlon.

EDIT2: my timing for your unsigned 64-bit to ASCII code is 158.685983 cycles for a FULL 64-bit number, so there is no skipping of the loops. I have a Core 2 Duo.

Mark_Larson · September 09, 2008, 02:27:04 PM

I ran qwords code. It runs in 48.504735 cycles on my core 2 duo. But it's only doing a dword.

News:

qwtoa using xmm registers?