The MASM Forum Archive 2004 to 2012

General Forums => The Laboratory => Topic started by: Seb on August 01, 2005, 02:42:53 PM

Title: MMX to SSE/SSE2?
Post by: Seb on August 01, 2005, 02:42:53 PM
Hello!

I got a piece of MMX optimized code, and I would like to adapt it to SSE/SSE2. The question is, what to think of when doing it? Except for the "mm0" and "xmm0" registers, I would like to know if there are any other things that I need to think about.

Regards,
Seb
Title: Re: MMX to SSE/SSE2?
Post by: hutch-- on August 01, 2005, 02:51:11 PM
Post the code Seb, someone can probably help you with it.
Title: Re: MMX to SSE/SSE2?
Post by: Seb on August 01, 2005, 03:09:05 PM
Thanks hutch. :) I've pasted the code below, however, it's written for NASM.



%include "Tools.inc"

segment_code

;
; void  Adapt ( short* pM, const short* pAdapt, int nDirection, int nOrder )
;
;   [esp+16]    nOrder
;   [esp+12]    nDirection
;   [esp+ 8]    pAdapt
;   [esp+ 4]    pM
;   [esp+ 0]    Return Address

            align 16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
proc        Adapt

            mov  eax, [esp +  4]                ; pM
            mov  ecx, [esp +  8]                ; pAdapt
            mov  edx, [esp + 16]                ; nOrder
            shr  edx, 4

            cmp  dword [esp + 12], byte 0       ; nDirection
            jle  short AdaptSub

AdaptAddLoop:
            movq  mm0, [eax]
            paddw mm0, [ecx]
            movq  [eax], mm0
            movq  mm1, [eax + 8]
            paddw mm1, [ecx + 8]
            movq  [eax + 8], mm1
            movq  mm2, [eax + 16]
            paddw mm2, [ecx + 16]
            movq  [eax + 16], mm2
            movq  mm3, [eax + 24]
            paddw mm3, [ecx + 24]
            movq  [eax + 24], mm3
            add   eax, byte 32
            add   ecx, byte 32
            dec   edx
            jnz   AdaptAddLoop

            emms
            ret

            align 16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop

AdaptSub:   je    short AdaptDone

AdaptSubLoop:
            movq  mm0, [eax]
            psubw mm0, [ecx]
            movq  [eax], mm0
            movq  mm1, [eax + 8]
            psubw mm1, [ecx + 8]
            movq  [eax + 8], mm1
            movq  mm2, [eax + 16]
            psubw mm2, [ecx + 16]
            movq  [eax + 16], mm2
            movq  mm3, [eax + 24]
            psubw mm3, [ecx + 24]
            movq  [eax + 24], mm3
            add   eax, byte 32
            add   ecx, byte 32
            dec   edx
            jnz   AdaptSubLoop

            emms
AdaptDone:

endproc

;
; int  CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
;
;   [esp+12]    nOrder
;   [esp+ 8]    pB
;   [esp+ 4]    pA
;   [esp+ 0]    Return Address

            align   16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop

proc        CalculateDotProduct

            mov     eax, [esp +  4]             ; pA
            mov     ecx, [esp +  8]             ; pB
            mov     edx, [esp + 12]             ; nOrder
            shr     edx, 4
            pxor    mm7, mm7

loopDot:    movq    mm0, [eax]
            pmaddwd mm0, [ecx]
            paddd   mm7, mm0
            movq    mm1, [eax +  8]
            pmaddwd mm1, [ecx +  8]
            paddd   mm7, mm1
            movq    mm2, [eax + 16]
            pmaddwd mm2, [ecx + 16]
            paddd   mm7, mm2
            movq    mm3, [eax + 24]
            pmaddwd mm3, [ecx + 24]
            add     eax, byte 32
            add     ecx, byte 32
            paddd   mm7, mm3
            dec     edx
            jnz     loopDot

            movq    mm6, mm7
            psrlq   mm7, 32
            paddd   mm6, mm7
            movd    [esp + 4], mm6
            emms
            mov     eax, [esp + 4]
endproc


;
; BOOL GetMMXAvailable ( void );
;

proc        GetMMXAvailable
            pushad
            pushfd
            pop     eax
            mov     ecx, eax
            xor     eax, 0x200000
            push    eax
            popfd
            pushfd
            pop     eax
            cmp     eax, ecx
            jz      short return        ; no CPUID command, so no MMX

            mov     eax,1
            CPUID
            test    edx,0x800000
return:     popad
            setnz   al
            and     eax, byte 1
endproc

            end


I want it to be as fast as possible, because the current code is painfully slow sometimes. And no, I am not the author of the code above (it's from the Monkey's Audio core). The development of MAC has sort of stopped, so if you want to speed up things, you have to do it yourself (which I am trying to now).
Title: Re: MMX to SSE/SSE2?
Post by: manhattan on August 01, 2005, 04:03:10 PM
Try to use a negative index to save instructions in the loop. Reads and writes can be grouped. For example, AdaptAddLoop can be rewritten :


add eax, edx
add  ecx, edx
neg  edx
AdaptAddLoop:
movdqa xmm0, [eax][edx][16*0]
movdqa xmm1, [eax][edx][16*1]
movdqa xmm2, [eax][edx][16*2]
movdqa xmm3, [eax][edx][16*3]
paddw xmm0, [ecx][edx][16*0]
paddw xmm1, [ecx][edx][16*1]
paddw xmm2, [ecx][edx][16*2]
paddw xmm3, [ecx][edx][16*3]
movdqa [eax][edx][16*0], xmm0
movdqa [eax][edx][16*1], xmm1
movdqa [eax][edx][16*2], xmm2
movdqa [eax][edx][16*2], xmm3
add edx, 4*16
jnz AdaptAddLoop
Title: Re: MMX to SSE/SSE2?
Post by: Seb on August 01, 2005, 04:16:58 PM
Thanks. However, the code you posted will throw some errors at me in both NASM and MASM. In MASM, it whines about "xmm0", and yes - I've included ".686" and ".xmm".
Title: Re: MMX to SSE/SSE2?
Post by: Randall Hyde on August 01, 2005, 04:39:01 PM
Quote from: Seb on August 01, 2005, 04:16:58 PM
Thanks. However, the code you posted will throw some errors at me in both NASM and MASM. In MASM, it whines about "xmm0", and yes - I've included ".686" and ".xmm".

What version of MASM are you using? You probably have to be using v7.0 or later for the SSE* stuff.
Cheers,
Randy Hyde
Title: Re: MMX to SSE/SSE2?
Post by: Seb on August 01, 2005, 04:47:59 PM
I am using the latest one found at www.masm32.com, and I am also using the latest service pack. However, "ML.EXE" shows "Microsoft (R) Macro Assembler Version 6.14.8444" - 6.14. :eek
Title: Re: MMX to SSE/SSE2?
Post by: roticv on August 01, 2005, 04:49:42 PM
You would have to get a newer version of ml.exe
Title: Re: MMX to SSE/SSE2?
Post by: Seb on August 01, 2005, 04:55:01 PM
I just did some Googling, but the latest one I could find was MASM 6.15. Can anyone link me any later version with SSE/SSE2 support?

Edit: My bad. MASM 6.15 was able to assemble the SSE stuff. :U
Title: Re: MMX to SSE/SSE2?
Post by: Seb on August 01, 2005, 05:01:12 PM
Anyways, to go back to topic; is there anything else (if yes, tip?) to do with the code I posted?
Title: Re: MMX to SSE/SSE2?
Post by: Mark Jones on August 01, 2005, 05:13:03 PM
Maybe check out http://www.mark.masmcode.com/
Title: Re: MMX to SSE/SSE2?
Post by: Seb on August 01, 2005, 05:35:01 PM
Yep, I've read that Mark, great article. But I am still uncertain on how to proceed with the MMX --> SSE2 conversion. I tried the code that manhattan posted, but it results in a GPF.

Edit: *sgrumf* tutorial is not the right word (replaced tutorial with article).
Title: Re: MMX to SSE/SSE2?
Post by: manhattan on August 01, 2005, 09:58:13 PM
The SSE code assumes that the pointers are 16-byte aligned and that the size is a multiple of 64. Bad alignment could explain why the MMX code is slow sometimes, it always uses movq to access the buffers.
Title: Re: MMX to SSE/SSE2?
Post by: Seb on August 02, 2005, 12:41:30 AM
I just found out that if I compile the code I posted above in MASM, it will result in a GPF - no matter what. But if I include the piece you posted in NASM, it'll generate some errors ("expected end of line" or something).
Title: Re: MMX to SSE/SSE2?
Post by: Seb on August 02, 2005, 12:44:11 AM
Also, I think I'll skip the MMX --> SSE/SSE2 code conversion, and simply concentrate on optimizing the MMX code. Since the code is relatively small, I wonder if it really is possible to optimize it any further?
Title: Re: MMX to SSE/SSE2?
Post by: hutch-- on August 02, 2005, 01:53:19 AM
Seb,

Its probably worth the effort to learn the SSE code as it is the future where MMX is on its way out. Later MMX requires SSE support anyway as it was introduced with late PIVs so you won't be wasting your work here.
Title: Re: MMX to SSE/SSE2?
Post by: Seb on August 02, 2005, 03:47:21 AM
Quote from: hutch-- on August 02, 2005, 01:53:19 AM
Seb,

Its probably worth the effort to learn the SSE code as it is the future where MMX is on its way out. Later MMX requires SSE support anyway as it was introduced with late PIVs so you won't be wasting your work here.

Thanks hutch, always good to know. :U Regarding converting the MMX code to SSE; I've tried a few things, but nothing will really work. :( I either get no sound output or a GPF. What I tried was basically the stuff manhattan posted, and changing the instructions/registers to the SSE ones.
Title: Re: MMX to SSE/SSE2?
Post by: Farabi on August 02, 2005, 08:09:30 AM
Based on the tutorial emms can made your function slow. I think if you have use SSE2 you will not need FPU anymore so you can remove that emms instruction and use it rarely.
Title: Re: MMX to SSE/SSE2?
Post by: Codewarp on August 02, 2005, 10:46:23 PM
Seb,

Things to think about when using mmx/sse/sse2

  (1)  Know what cpus have the capabilities you are using.
  (2)  Know how you want to degrade on earlier cpus.
  (3)  Think 16 bytes at a time, not 8-bytes.
  (4)  SSE/SSE2 can process 64-bit floating point, MMX cannot.
  (5)  Don't forget data alignment issues, otherwise it's exception city for your code.
  (7)  Know the richer instruction set in SSE/SSE2 that MMX does not possess.
  (8)  Write in pure SSE/SSE2 (w/o MMX) to promote portability to later cpus.
  (9)  Always test new asm code by single stepping to verify correct operation

Is that enough to think about?
Title: Re: MMX to SSE/SSE2?
Post by: Seb on August 03, 2005, 12:56:45 AM
Quote from: Codewarp on August 02, 2005, 10:46:23 PM
Is that enough to think about?

Yeah, and the code below is what I've done so far. It doesn't crash, nor does it output any sound (well, there is a small "click" between each second).



%include "Tools.inc"

segment_code

;
; void  Adapt ( short* pM, const short* pAdapt, int nDirection, int nOrder )
;
;   [esp+16]    nOrder
;   [esp+12]    nDirection
;   [esp+ 8]    pAdapt
;   [esp+ 4]    pM
;   [esp+ 0]    Return Address

            align 16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
proc        Adapt

            mov  eax, [esp +  4]                ; pM
            mov  ecx, [esp +  8]                ; pAdapt
            mov  edx, [esp + 16]                ; nOrder
            shr  edx, 4

            cmp  dword [esp + 12], byte 0       ; nDirection
            jle  short AdaptSub

AdaptAddLoop:
            movdqa  xmm0, [eax]
            movdqa  xmm1, [eax+16]
            movdqa  xmm2, [eax+32]
            movdqa  xmm3, [eax+48]
            paddq   xmm0, [ecx]
            paddq   xmm1, [ecx+16]
            paddq   xmm2, [ecx+32]
            paddq   xmm3, [ecx+48]
            movntdq  [eax], xmm0
            movntdq  [eax+16], xmm1
            movntdq  [eax+32], xmm2
            movntdq  [eax+48], xmm3
            add     eax, byte 64
            add     ecx, byte 64
            dec     edx
            jnz     AdaptAddLoop

            ;emms
            ret

            align 16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop

AdaptSub:   je    short AdaptDone

AdaptSubLoop:
            movdqa  xmm0, [eax]
            movdqa  xmm1, [eax+16]
            movdqa  xmm2, [eax+32]
            movdqa  xmm3, [eax+48]
            psubq   xmm0, [ecx]
            psubq   xmm1, [ecx+16]
            psubq   xmm2, [ecx+32]
            psubq   xmm3, [ecx+48]
            movntdq  [eax], xmm0
            movntdq  [eax+16], xmm1
            movntdq  [eax+32], xmm2
            movntdq  [eax+48], xmm3
            add     eax, byte 64
            add     ecx, byte 64
            dec     edx
            jnz   AdaptSubLoop

            ;emms
AdaptDone:

endproc

;
; int  CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
;
;   [esp+12]    nOrder
;   [esp+ 8]    pB
;   [esp+ 4]    pA
;   [esp+ 0]    Return Address

            align   16

proc        CalculateDotProduct

            mov     eax, [esp +  4]             ; pA
            mov     ecx, [esp +  8]             ; pB
            mov     edx, [esp + 12]             ; nOrder
            shr     edx, 4
            pxor    mm7, mm7

loopDot:    movq    mm0, [eax]
            movq    mm1, [eax +  8]
            movq    mm2, [eax + 16]
            movq    mm3, [eax + 24]
            pmaddwd mm0, [ecx]
            pmaddwd mm1, [ecx +  8]
            pmaddwd mm2, [ecx + 16]
            pmaddwd mm3, [ecx + 24]
            paddd   mm7, mm0
            paddd   mm7, mm1
            paddd   mm7, mm2
            add     eax, byte 32
            add     ecx, byte 32
            paddd   mm7, mm3
            dec     edx
            jnz     loopDot

            movq    mm6, mm7
            psrlq   mm7, 32
            paddd   mm6, mm7
            movd    [esp + 4], mm6
            emms
            mov     eax, [esp + 4]
endproc


;
; BOOL GetSSE2Available ( void );
;

; just for now
proc        GetSSE2Available
mov eax, 1
ret
endproc

            end


Any ideas?
Title: Re: MMX to SSE/SSE2?
Post by: Farabi on August 03, 2005, 11:41:53 AM
Quote from: Seb on August 03, 2005, 12:56:45 AM
Quote from: Codewarp on August 02, 2005, 10:46:23 PM
Is that enough to think about?

Yeah, and the code below is what I've done so far. It doesn't crash, nor does it output any sound (well, there is a small "click" between each second).



%include "Tools.inc"

segment_code

;
; void  Adapt ( short* pM, const short* pAdapt, int nDirection, int nOrder )
;
;   [esp+16]    nOrder
;   [esp+12]    nDirection
;   [esp+ 8]    pAdapt
;   [esp+ 4]    pM
;   [esp+ 0]    Return Address

            align 16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
proc        Adapt

            mov  eax, [esp +  4]                ; pM
            mov  ecx, [esp +  8]                ; pAdapt
            mov  edx, [esp + 16]                ; nOrder
            shr  edx, 4

            cmp  dword [esp + 12], byte 0       ; nDirection
            jle  short AdaptSub

AdaptAddLoop:
            movdqa  xmm0, [eax]
            movdqa  xmm1, [eax+16]
            movdqa  xmm2, [eax+32]
            movdqa  xmm3, [eax+48]
            paddq   xmm0, [ecx]
            paddq   xmm1, [ecx+16]
            paddq   xmm2, [ecx+32]
            paddq   xmm3, [ecx+48]
            movntdq  [eax], xmm0
            movntdq  [eax+16], xmm1
            movntdq  [eax+32], xmm2
            movntdq  [eax+48], xmm3
            add     eax, byte 64
            add     ecx, byte 64
            dec     edx
            jnz     AdaptAddLoop

            ;emms
            ret

            align 16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop

AdaptSub:   je    short AdaptDone

AdaptSubLoop:
            movdqa  xmm0, [eax]
            movdqa  xmm1, [eax+16]
            movdqa  xmm2, [eax+32]
            movdqa  xmm3, [eax+48]
            psubq   xmm0, [ecx]
            psubq   xmm1, [ecx+16]
            psubq   xmm2, [ecx+32]
            psubq   xmm3, [ecx+48]
            movntdq  [eax], xmm0
            movntdq  [eax+16], xmm1
            movntdq  [eax+32], xmm2
            movntdq  [eax+48], xmm3
            add     eax, byte 64
            add     ecx, byte 64
            dec     edx
            jnz   AdaptSubLoop

            ;emms
AdaptDone:

endproc

;
; int  CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
;
;   [esp+12]    nOrder
;   [esp+ 8]    pB
;   [esp+ 4]    pA
;   [esp+ 0]    Return Address

            align   16

proc        CalculateDotProduct

            mov     eax, [esp +  4]             ; pA
            mov     ecx, [esp +  8]             ; pB
            mov     edx, [esp + 12]             ; nOrder
            shr     edx, 4
            pxor    mm7, mm7

loopDot:    movq    mm0, [eax]
            movq    mm1, [eax +  8]
            movq    mm2, [eax + 16]
            movq    mm3, [eax + 24]
            pmaddwd mm0, [ecx]
            pmaddwd mm1, [ecx +  8]
            pmaddwd mm2, [ecx + 16]
            pmaddwd mm3, [ecx + 24]
            paddd   mm7, mm0
            paddd   mm7, mm1
            paddd   mm7, mm2
            add     eax, byte 32
            add     ecx, byte 32
            paddd   mm7, mm3
            dec     edx
            jnz     loopDot

            movq    mm6, mm7
            psrlq   mm7, 32
            paddd   mm6, mm7
            movd    [esp + 4], mm6
            emms
            mov     eax, [esp + 4]
endproc


;
; BOOL GetSSE2Available ( void );
;

; just for now
proc        GetSSE2Available
mov eax, 1
ret
endproc

            end


Any ideas?

Try to execute windows explorer if you use MMX code. In my laptop it is crashed.
Title: Re: MMX to SSE/SSE2?
Post by: Seb on August 03, 2005, 01:40:54 PM
Quote from: Farabi on August 03, 2005, 11:41:53 AM
Try to execute windows explorer if you use MMX code. In my laptop it is crashed.

What do you mean?
Title: Re: MMX to SSE/SSE2?
Post by: Farabi on August 14, 2005, 07:10:44 AM
Quote from: Seb on August 03, 2005, 01:40:54 PM
Quote from: Farabi on August 03, 2005, 11:41:53 AM
Try to execute windows explorer if you use MMX code. In my laptop it is crashed.

What do you mean?

So it is not crash on your laptop? I think I made mistake on my code.


IsSupportMMX proc

   push   ebx                 
    push   ecx
    push   edx

    ; Check feature flag 23 in EDX for MMX support
    mov    eax, 1               
    cpuid                       
    mov    eax, edx           
    shr    eax, 23             
    and    eax, 1                   

    ; Restore registers
    pop    edx                 
    pop    ecx
    pop    ebx
    ret

ret
IsSupportMMX endp

EndMMX proc

    emms     ; Allow CPU to use floating point
    ret

ret
EndMMX endp


Use above code if you like.