MMX to SSE/SSE2?

Seb · August 01, 2005, 02:42:53 PM

Hello!

I have a piece of MMX-optimized code that I would like to adapt to SSE/SSE2. What do I need to keep in mind when doing so? Apart from replacing the "mm0" registers with "xmm0", I would like to know if there is anything else I need to think about.

Regards,
Seb

hutch-- · August 01, 2005, 02:51:11 PM

Post the code Seb, someone can probably help you with it.

Seb · August 01, 2005, 03:09:05 PM

Thanks hutch. :) I've pasted the code below, however, it's written for NASM.

Code Select



%include "Tools.inc"

segment_code

;-----------------------------------------------------------------------
; void  Adapt ( short* pM, const short* pAdapt, int nDirection, int nOrder )
;
; In-place update of the 16-bit words at pM with the words at pAdapt:
;   nDirection  > 0 :  pM[i] += pAdapt[i]   (paddw, wrap-around)
;   nDirection  < 0 :  pM[i] -= pAdapt[i]   (psubw, wrap-around)
;   nDirection == 0 :  pM is left untouched
;
; Each loop iteration handles 16 words (32 bytes), so only nOrder / 16
; whole groups are processed; a remainder of nOrder % 16 words is ignored.
; If nOrder < 16 there is nothing to do and the routine returns at once
; (previously the zero count wrapped around in "dec edx / jnz").
;
; Stack on entry (arguments are addressed relative to esp, no frame):
;   [esp+16]    nOrder
;   [esp+12]    nDirection
;   [esp+ 8]    pAdapt
;   [esp+ 4]    pM
;   [esp+ 0]    Return Address
;
; Uses: eax, ecx, edx, mm0-mm3, flags.  emms is executed before returning
; on every path that touched the MMX registers.
;
; Alignment: the nop padding in front of the procedure is hand-counted so
; that the loop labels land on 16-byte boundaries without any padding on
; the executed path.  This assumes the "proc" macro (Tools.inc, not
; visible here) emits no code bytes - the original padding relied on the
; same assumption.
;-----------------------------------------------------------------------

            align 16
            times 8 nop                         ; 8 pad + 24-byte prologue = 32 -> AdaptAddLoop aligned
proc        Adapt

            mov  eax, [esp +  4]                ; pM
            mov  ecx, [esp +  8]                ; pAdapt
            mov  edx, [esp + 16]                ; nOrder
            shr  edx, 4                         ; edx = number of 16-word groups
            jz   short AdaptNothing             ; no whole group: return, MMX state untouched

            cmp  dword [esp + 12], byte 0       ; nDirection
            jle  short AdaptSub                 ; <= 0: AdaptSub sorts out zero vs. negative

AdaptAddLoop:
            movq  mm0, [eax]
            paddw mm0, [ecx]
            movq  [eax], mm0
            movq  mm1, [eax + 8]
            paddw mm1, [ecx + 8]
            movq  [eax + 8], mm1
            movq  mm2, [eax + 16]
            paddw mm2, [ecx + 16]
            movq  [eax + 16], mm2
            movq  mm3, [eax + 24]
            paddw mm3, [ecx + 24]
            movq  [eax + 24], mm3
            add   eax, byte 32                  ; advance both pointers by one group
            add   ecx, byte 32
            dec   edx
            jnz   short AdaptAddLoop

            emms
AdaptNothing:
            ret

            align 16
            times 14 nop                        ; 14 pad + 2-byte je = 16 -> AdaptSubLoop aligned

AdaptSub:   je    short AdaptDone               ; flags still from the nDirection compare: 0 -> nothing to do

AdaptSubLoop:
            movq  mm0, [eax]
            psubw mm0, [ecx]
            movq  [eax], mm0
            movq  mm1, [eax + 8]
            psubw mm1, [ecx + 8]
            movq  [eax + 8], mm1
            movq  mm2, [eax + 16]
            psubw mm2, [ecx + 16]
            movq  [eax + 16], mm2
            movq  mm3, [eax + 24]
            psubw mm3, [ecx + 24]
            movq  [eax + 24], mm3
            add   eax, byte 32                  ; advance both pointers by one group
            add   ecx, byte 32
            dec   edx
            jnz   short AdaptSubLoop

            emms
AdaptDone:                                      ; falls into endproc, which is expected to supply the return

endproc

;-----------------------------------------------------------------------
; int  CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
;
; Returns in eax the sum of pA[i] * pB[i] over the first
; (nOrder / 16) * 16 signed 16-bit elements.  Products are accumulated
; in 32-bit lanes (pmaddwd / paddd), so the result wraps on overflow.
; A remainder of nOrder % 16 elements is ignored; if nOrder < 16 the
; result is 0 (previously the zero count wrapped around in
; "dec edx / jnz").
;
; Stack on entry (arguments are addressed relative to esp, no frame):
;   [esp+12]    nOrder
;   [esp+ 8]    pB
;   [esp+ 4]    pA
;   [esp+ 0]    Return Address
;
; Uses: eax, ecx, edx, mm0-mm3, mm6, mm7, flags.  emms is executed
; before returning.
;
; Alignment: the nop padding in front of the procedure is hand-counted so
; that loopDot lands on a 16-byte boundary.  This assumes the "proc"
; macro (Tools.inc, not visible here) emits no code bytes - the original
; padding relied on the same assumption.
;-----------------------------------------------------------------------

            align   16
            times   12 nop                      ; 12 pad + 20-byte prologue = 32 -> loopDot aligned

proc        CalculateDotProduct

            mov     eax, [esp +  4]             ; pA
            mov     ecx, [esp +  8]             ; pB
            mov     edx, [esp + 12]             ; nOrder
            pxor    mm7, mm7                    ; mm7 = two 32-bit partial sums, both 0
            shr     edx, 4                      ; edx = number of 16-element groups
            jz      short DotReduce             ; no whole group: reduce the zero accumulator

loopDot:    movq    mm0, [eax]
            pmaddwd mm0, [ecx]                  ; 4 word products, adjacent pairs summed into 2 dwords
            paddd   mm7, mm0
            movq    mm1, [eax +  8]
            pmaddwd mm1, [ecx +  8]
            paddd   mm7, mm1
            movq    mm2, [eax + 16]
            pmaddwd mm2, [ecx + 16]
            paddd   mm7, mm2
            movq    mm3, [eax + 24]
            pmaddwd mm3, [ecx + 24]
            add     eax, byte 32                ; advance both pointers by one group
            add     ecx, byte 32
            paddd   mm7, mm3
            dec     edx
            jnz     short loopDot

DotReduce:
            movq    mm6, mm7                    ; horizontal add: low dword + high dword
            psrlq   mm7, 32
            paddd   mm6, mm7
            movd    eax, mm6                    ; return value straight from the low dword
            emms
endproc


;
; BOOL GetMMXAvailable ( void );
;
; Returns eax = 1 if the CPU supports CPUID and CPUID function 1 reports
; the MMX feature bit (edx bit 23), otherwise eax = 0.
; All general-purpose registers except eax are preserved (pushad/popad).
;

proc        GetMMXAvailable
            pushad                              ; CPUID overwrites eax, ebx, ecx and edx

            ; CPUID is available only if EFLAGS bit 21 (ID) can be toggled.
            pushfd
            pop     eax
            mov     ecx, eax                    ; ecx = EFLAGS as found
            xor     eax, 1 << 21                ; flip the ID bit
            push    eax
            popfd
            pushfd
            pop     eax                         ; eax = EFLAGS after the attempted flip
            cmp     eax, ecx
            je      short MMXCheckDone          ; unchanged -> no CPUID, so no MMX (ZF = 1)

            mov     eax, 1                      ; CPUID function 1: feature flags in edx
            cpuid
            test    edx, 1 << 23                ; MMX feature bit; ZF = 0 when present

MMXCheckDone:
            popad                               ; restores the registers, leaves the flags alone
            setnz   al                          ; al = 1 only if the MMX bit was set
            and     eax, byte 1                 ; clear the stale upper bits of eax
endproc

            end

I want it to be as fast as possible, because the current code is painfully slow sometimes. And no, I am not the author of the code above (it's from the Monkey's Audio core). The development of MAC has more or less stopped, so if you want to speed things up, you have to do it yourself (which I am trying to do now).

manhattan · August 01, 2005, 04:03:10 PM

Try to use a negative index to save instructions in the loop. Reads and writes can be grouped. For example, AdaptAddLoop can be rewritten :

Code Select


				; edx is expected to hold the buffer length in BYTES here, a non-zero
				; multiple of 64, and both pointers must be 16-byte aligned (movdqa
				; faults otherwise).  NOTE: if this is dropped in after the original
				; prologue, edx is nOrder >> 4 (32-byte groups) and would need to be
				; scaled to bytes first, e.g. shl edx, 5.
				add		eax, edx				; point both bases at the end of the buffers
				add		ecx, edx
				neg		edx						; edx = -size, counts up towards zero
AdaptAddLoop:
				movdqa	xmm0, [eax][edx][16*0]
				movdqa	xmm1, [eax][edx][16*1]
				movdqa	xmm2, [eax][edx][16*2]
				movdqa	xmm3, [eax][edx][16*3]
				paddw	xmm0, [ecx][edx][16*0]
				paddw	xmm1, [ecx][edx][16*1]
				paddw	xmm2, [ecx][edx][16*2]
				paddw	xmm3, [ecx][edx][16*3]
				movdqa	[eax][edx][16*0], xmm0
				movdqa	[eax][edx][16*1], xmm1
				movdqa	[eax][edx][16*2], xmm2
				movdqa	[eax][edx][16*3], xmm3	; fourth slot (was 16*2, which overwrote the xmm2 result)
				add		edx, 4*16				; 64 bytes per iteration; ZF set when the index reaches 0
				jnz		AdaptAddLoop

Seb · August 01, 2005, 04:16:58 PM

Thanks. However, the code you posted will throw some errors at me in both NASM and MASM. In MASM, it whines about "xmm0", and yes - I've included ".686" and ".xmm".

Randall Hyde · August 01, 2005, 04:39:01 PM

Quote from: Seb on August 01, 2005, 04:16:58 PM
Thanks. However, the code you posted will throw some errors at me in both NASM and MASM. In MASM, it whines about "xmm0", and yes - I've included ".686" and ".xmm".

What version of MASM are you using? You probably have to be using v7.0 or later for the SSE* stuff.
Cheers,
Randy Hyde

Seb · August 01, 2005, 04:47:59 PM

I am using the latest one found at www.masm32.com, and I am also using the latest service pack. However, "ML.EXE" shows "Microsoft (R) Macro Assembler Version 6.14.8444" - 6.14. :eek

roticv · August 01, 2005, 04:49:42 PM

You would have to get a newer version of ml.exe

Seb · August 01, 2005, 04:55:01 PM

I just did some Googling, but the latest one I could find was MASM 6.15. Can anyone link me to a later version with SSE/SSE2 support?

Edit: My bad. MASM 6.15 was able to assemble the SSE stuff. :U

Seb · August 01, 2005, 05:01:12 PM

Anyways, to go back to topic; is there anything else (if yes, tip?) to do with the code I posted?

Mark Jones · August 01, 2005, 05:13:03 PM

Maybe check out http://www.mark.masmcode.com/

Seb · August 01, 2005, 05:35:01 PM

Yep, I've read that Mark, great article. But I am still uncertain on how to proceed with the MMX --> SSE2 conversion. I tried the code that manhattan posted, but it results in a GPF.

Edit: *sgrumf* tutorial is not the right word (replaced tutorial with article).

manhattan · August 01, 2005, 09:58:13 PM

The SSE code assumes that the pointers are 16-byte aligned and that the size is a multiple of 64. Bad alignment could also explain why the MMX code is slow sometimes; it always uses movq to access the buffers.

Seb · August 02, 2005, 12:41:30 AM

I just found out that if I compile the code I posted above in MASM, it will result in a GPF - no matter what. But if I include the piece you posted in NASM, it'll generate some errors ("expected end of line" or something).

Seb · August 02, 2005, 12:44:11 AM

Also, I think I'll skip the MMX --> SSE/SSE2 code conversion, and simply concentrate on optimizing the MMX code. Since the code is relatively small, I wonder if it really is possible to optimize it any further?

News:

MMX to SSE/SSE2?

Seb

hutch--

Seb

manhattan

Seb

Randall Hyde

Seb

roticv

Seb

Seb

Mark Jones

Seb

manhattan

Seb

Seb