News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

MMX to SSE/SSE2?

Started by Seb, August 01, 2005, 02:42:53 PM

Previous topic - Next topic

Seb

Hello!

I have a piece of MMX-optimized code that I would like to adapt to SSE/SSE2. The question is: what do I need to keep in mind when doing so? Apart from replacing the "mm0" registers with "xmm0", I would like to know if there is anything else I need to think about.

Regards,
Seb

hutch--

Post the code Seb, someone can probably help you with it.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

Seb

Thanks hutch. :) I've pasted the code below, however, it's written for NASM.



%include "Tools.inc"

segment_code

;
; void  Adapt ( short* pM, const short* pAdapt, int nDirection, int nOrder )
;
;   Updates pM[] in place from pAdapt[], element by element, as packed
;   16-bit words with wrap-around arithmetic (paddw / psubw):
;       nDirection >  0 :  pM[i] += pAdapt[i]
;       nDirection <  0 :  pM[i] -= pAdapt[i]
;       nDirection == 0 :  pM[] is left untouched
;
;   Work is done in groups of 16 shorts (32 bytes); nOrder >> 4 groups are
;   processed, so a remainder of nOrder modulo 16 is ignored and
;   nOrder < 16 is a no-op.
;
;   [esp+16]    nOrder
;   [esp+12]    nDirection
;   [esp+ 8]    pAdapt
;   [esp+ 4]    pM
;   [esp+ 0]    Return Address
;
;   Registers: eax = pM cursor, ecx = pAdapt cursor, edx = groups remaining.
;   Clobbers eax, ecx, edx, flags and mm0-mm3; emms is issued on every path
;   that touched the MMX registers.
;
;   The nops that follow each "align 16" are not on any executed path. They
;   appear to be sized so that the loop labels land on 16-byte boundaries:
;   4 nops + 28 bytes of prologue = 32 before AdaptAddLoop, and 14 nops +
;   the 2-byte "je short" = 16 before AdaptSubLoop. The byte counts assume
;   the usual 32-bit encodings -- confirm against an assembler listing.
;
;   endproc (macro from Tools.inc, not shown here) is relied on to emit the
;   function return for the paths that fall into it -- presumably "ret".

            align 16
            nop
            nop
            nop
            nop
proc        Adapt

            mov  eax, [esp +  4]                ; pM
            mov  ecx, [esp +  8]                ; pAdapt
            mov  edx, [esp + 16]                ; nOrder
            shr  edx, 4                         ; edx = number of 32-byte groups
            jz   near AdaptDone                 ; zero groups: without this the dec/jnz
                                                ; loops would wrap and run ~2^32 passes

            cmp  dword [esp + 12], byte 0       ; nDirection
            jle  short AdaptSub                 ; <= 0: subtract path (0 is sorted out there)

AdaptAddLoop:                                   ; pM[i] += pAdapt[i], 16 shorts per pass
            movq  mm0, [eax]
            paddw mm0, [ecx]
            movq  [eax], mm0
            movq  mm1, [eax + 8]
            paddw mm1, [ecx + 8]
            movq  [eax + 8], mm1
            movq  mm2, [eax + 16]
            paddw mm2, [ecx + 16]
            movq  [eax + 16], mm2
            movq  mm3, [eax + 24]
            paddw mm3, [ecx + 24]
            movq  [eax + 24], mm3
            add   eax, byte 32
            add   ecx, byte 32
            dec   edx
            jnz   AdaptAddLoop

            emms                                ; leave MMX state before returning
            ret

            align 16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop

AdaptSub:   je    short AdaptDone               ; flags still from the cmp: nDirection == 0

AdaptSubLoop:                                   ; pM[i] -= pAdapt[i], 16 shorts per pass
            movq  mm0, [eax]
            psubw mm0, [ecx]
            movq  [eax], mm0
            movq  mm1, [eax + 8]
            psubw mm1, [ecx + 8]
            movq  [eax + 8], mm1
            movq  mm2, [eax + 16]
            psubw mm2, [ecx + 16]
            movq  [eax + 16], mm2
            movq  mm3, [eax + 24]
            psubw mm3, [ecx + 24]
            movq  [eax + 24], mm3
            add   eax, byte 32
            add   ecx, byte 32
            dec   edx
            jnz   AdaptSubLoop

            emms                                ; leave MMX state before returning
AdaptDone:                                      ; no MMX register was touched on the
                                                ; direct jumps here, so no emms needed

endproc

;
; int  CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
;
;   Returns in eax the sum of pA[i] * pB[i] over the processed elements.
;   pmaddwd multiplies signed 16-bit words and adds adjacent products into
;   32-bit lanes; the two lanes of mm7 accumulate with paddd (32-bit
;   wrap-around) and are added together at the end.
;
;   Work is done in groups of 16 shorts (32 bytes); nOrder >> 4 groups are
;   processed, so a remainder of nOrder modulo 16 is ignored and
;   nOrder < 16 returns 0.
;
;   [esp+12]    nOrder
;   [esp+ 8]    pB
;   [esp+ 4]    pA
;   [esp+ 0]    Return Address
;
;   Registers: eax = pA cursor, ecx = pB cursor, edx = groups remaining,
;   mm7 = running sum. Clobbers eax, ecx, edx, flags, mm0-mm3, mm6, mm7.
;   The pA argument slot at [esp+4] is overwritten as scratch for the result.
;
;   The nops after "align 16" are not on any executed path. They appear to
;   be sized so that loopDot lands on a 16-byte boundary: 12 nops + 20
;   bytes of prologue = 32. The byte counts assume the usual 32-bit
;   encodings -- confirm against an assembler listing.
;
;   endproc (macro from Tools.inc, not shown here) is relied on to emit the
;   function return -- presumably "ret".

            align   16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop

proc        CalculateDotProduct

            mov     eax, [esp +  4]             ; pA
            mov     ecx, [esp +  8]             ; pB
            mov     edx, [esp + 12]             ; nOrder
            pxor    mm7, mm7                    ; running sum = 0
            shr     edx, 4                      ; edx = number of 32-byte groups
            jz      short dotSum                ; zero groups: result is 0; without this
                                                ; dec/jnz would wrap and run ~2^32 passes

loopDot:    movq    mm0, [eax]
            pmaddwd mm0, [ecx]
            paddd   mm7, mm0
            movq    mm1, [eax +  8]
            pmaddwd mm1, [ecx +  8]
            paddd   mm7, mm1
            movq    mm2, [eax + 16]
            pmaddwd mm2, [ecx + 16]
            paddd   mm7, mm2
            movq    mm3, [eax + 24]
            pmaddwd mm3, [ecx + 24]
            add     eax, byte 32
            add     ecx, byte 32
            paddd   mm7, mm3
            dec     edx
            jnz     loopDot

dotSum:     movq    mm6, mm7                    ; fold the two 32-bit lanes:
            psrlq   mm7, 32                     ;   mm7.lo = high lane
            paddd   mm6, mm7                    ;   mm6.lo = low lane + high lane
            movd    [esp + 4], mm6              ; park the result in the pA slot
            emms                                ; leave MMX state before returning
            mov     eax, [esp + 4]              ; return value
endproc


;
; BOOL GetMMXAvailable ( void );
;
;   Returns eax = 1 when the CPUID instruction exists and CPUID function 1
;   reports the MMX feature flag in edx, otherwise eax = 0.
;
;   CPUID presence is probed by trying to toggle the ID bit of EFLAGS: if
;   the bit reads back unchanged, CPUID is not available. All general
;   registers are saved with pushad and restored with popad, so only eax
;   (the result) and the flags differ on return. The toggled ID bit is not
;   written back to its original value.
;
;   endproc (macro from Tools.inc, not shown here) is relied on to emit the
;   function return -- presumably "ret".
;

EFLAGS_ID_BIT   equ 0x200000                    ; EFLAGS bit 21
CPUID1_EDX_MMX  equ 0x800000                    ; CPUID.1:EDX bit 23

proc        GetMMXAvailable
            pushad
            pushfd
            pop     ecx                         ; ecx = EFLAGS as found
            mov     eax, ecx
            xor     eax, EFLAGS_ID_BIT          ; flip the ID bit
            push    eax
            popfd                               ; try to install the flipped value
            pushfd
            pop     eax                         ; eax = EFLAGS as actually stored
            xor     eax, ecx                    ; ZF=1 when nothing changed
            jz      short GetMMXDone            ; no CPUID instruction, so no MMX

            mov     eax, 1
            cpuid
            test    edx, CPUID1_EDX_MMX         ; ZF=0 when MMX is reported
GetMMXDone: popad                               ; restores registers, leaves flags alone
            setnz   al
            and     eax, byte 1                 ; drop the restored upper bits of eax
endproc

            end


I want it to be as fast as possible, because the current code is painfully slow sometimes. And no, I am not the author of the code above (it's from the Monkey's Audio core). The development of MAC has more or less stopped, so if you want to speed things up, you have to do it yourself (which is what I am trying to do now).

manhattan

Try to use a negative index to save instructions in the loop. Reads and writes can be grouped. For example, AdaptAddLoop can be rewritten as:


; SSE2 version of the add loop, 64 bytes (32 shorts) per pass.
; On entry: eax = destination, ecx = source, edx = length in BYTES.
; The loop only terminates when the length is a non-zero multiple of 64,
; and movdqa / the paddw memory operands fault unless both buffers are
; 16-byte aligned.
add eax, edx                    ; point both cursors at the end of the data
add  ecx, edx
neg  edx                        ; edx = negative offset, counts up to zero
AdaptAddLoop:
movdqa xmm0, [eax][edx][16*0]
movdqa xmm1, [eax][edx][16*1]
movdqa xmm2, [eax][edx][16*2]
movdqa xmm3, [eax][edx][16*3]
paddw xmm0, [ecx][edx][16*0]
paddw xmm1, [ecx][edx][16*1]
paddw xmm2, [ecx][edx][16*2]
paddw xmm3, [ecx][edx][16*3]
movdqa [eax][edx][16*0], xmm0
movdqa [eax][edx][16*1], xmm1
movdqa [eax][edx][16*2], xmm2
movdqa [eax][edx][16*3], xmm3   ; fourth block goes to offset 48, not 32 again
add edx, 4*16                   ; the offset doubles as the loop counter
jnz AdaptAddLoop

Seb

Thanks. However, the code you posted will throw some errors at me in both NASM and MASM. In MASM, it whines about "xmm0", and yes - I've included ".686" and ".xmm".

Randall Hyde

Quote from: Seb on August 01, 2005, 04:16:58 PM
Thanks. However, the code you posted will throw some errors at me in both NASM and MASM. In MASM, it whines about "xmm0", and yes - I've included ".686" and ".xmm".

What version of MASM are you using? You probably have to be using v7.0 or later for the SSE* stuff.
Cheers,
Randy Hyde

Seb

I am using the latest one found at www.masm32.com, and I am also using the latest service pack. However, "ML.EXE" shows "Microsoft (R) Macro Assembler Version 6.14.8444" - 6.14. :eek

roticv

You would have to get a newer version of ml.exe

Seb

I just did some Googling, but the latest one I could find was MASM 6.15. Can anyone link me to a later version with SSE/SSE2 support?

Edit: My bad. MASM 6.15 was able to assemble the SSE stuff. :U

Seb

Anyway, to get back on topic: is there anything else I could do with the code I posted (and if so, any tips)?

Mark Jones

"To deny our impulses... foolish; to revel in them, chaos." MCJ 2003.08

Seb

#11
Yep, I've read that Mark, great article. But I am still uncertain on how to proceed with the MMX --> SSE2 conversion. I tried the code that manhattan posted, but it results in a GPF.

Edit: *sgrumf* tutorial is not the right word (replaced tutorial with article).

manhattan

The SSE code assumes that the pointers are 16-byte aligned and that the size is a multiple of 64. Bad alignment could also explain why the MMX code is slow sometimes; it always uses movq to access the buffers.

Seb

I just found out that if I compile the code I posted above in MASM, it will result in a GPF - no matter what. But if I include the piece you posted in NASM, it'll generate some errors ("expected end of line" or something).

Seb

Also, I think I'll skip the MMX --> SSE/SSE2 code conversion, and simply concentrate on optimizing the MMX code. Since the code is relatively small, I wonder if it really is possible to optimize it any further?