News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

MMX to SSE/SSE2?

Started by Seb, August 01, 2005, 02:42:53 PM

Previous topic - Next topic

hutch--

Seb,

Its probably worth the effort to learn the SSE code as it is the future where MMX is on its way out. Later MMX requires SSE support anyway as it was introduced with late PIVs so you won't be wasting your work here.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

Seb

Quote from: hutch-- on August 02, 2005, 01:53:19 AM
Seb,

Its probably worth the effort to learn the SSE code as it is the future where MMX is on its way out. Later MMX requires SSE support anyway as it was introduced with late PIVs so you won't be wasting your work here.

Thanks hutch, always good to know. :U Regarding converting the MMX code to SSE; I've tried a few things, but nothing will really work. :( I either get no sound output or a GPF. What I tried was basically the stuff manhattan posted, and changing the instructions/registers to the SSE ones.

Farabi

Based on the tutorial emms can made your function slow. I think if you have use SSE2 you will not need FPU anymore so you can remove that emms instruction and use it rarely.
Those who had universe knowledges can control the world by a micro processor.
http://www.wix.com/farabio/firstpage

"Etos siperi elegi"

Codewarp

Seb,

Things to think about when using mmx/sse/sse2

  (1)  Know what cpus have the capabilities you are using.
  (2)  Know how you want to degrade on earlier cpus.
  (3)  Think 16 bytes at a time, not 8-bytes.
  (4)  SSE/SSE2 can process 64-bit floating point, MMX cannot.
  (5)  Don't forget data alignment issues, otherwise it's exception city for your code.
  (7)  Know the richer instruction set in SSE/SSE2 that MMX does not possess.
  (8)  Write in pure SSE/SSE2 (w/o MMX) to promote portability to later cpus.
  (9)  Always test new asm code by single stepping to verify correct operation

Is that enough to think about?

Seb

Quote from: Codewarp on August 02, 2005, 10:46:23 PM
Is that enough to think about?

Yeah, and the code below is what I've done so far. It doesn't crash, nor does it output any sound (well, there is a small "click" between each second).



%include "Tools.inc"

segment_code

;
; void  Adapt ( short* pM, const short* pAdapt, int nDirection, int nOrder )
;
;   [esp+16]    nOrder
;   [esp+12]    nDirection
;   [esp+ 8]    pAdapt
;   [esp+ 4]    pM
;   [esp+ 0]    Return Address

            align 16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
proc        Adapt

            mov  eax, [esp +  4]                ; pM
            mov  ecx, [esp +  8]                ; pAdapt
            mov  edx, [esp + 16]                ; nOrder
            shr  edx, 4

            cmp  dword [esp + 12], byte 0       ; nDirection
            jle  short AdaptSub

AdaptAddLoop:
            movdqa  xmm0, [eax]
            movdqa  xmm1, [eax+16]
            movdqa  xmm2, [eax+32]
            movdqa  xmm3, [eax+48]
            paddq   xmm0, [ecx]
            paddq   xmm1, [ecx+16]
            paddq   xmm2, [ecx+32]
            paddq   xmm3, [ecx+48]
            movntdq  [eax], xmm0
            movntdq  [eax+16], xmm1
            movntdq  [eax+32], xmm2
            movntdq  [eax+48], xmm3
            add     eax, byte 64
            add     ecx, byte 64
            dec     edx
            jnz     AdaptAddLoop

            ;emms
            ret

            align 16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop

AdaptSub:   je    short AdaptDone

AdaptSubLoop:
            movdqa  xmm0, [eax]
            movdqa  xmm1, [eax+16]
            movdqa  xmm2, [eax+32]
            movdqa  xmm3, [eax+48]
            psubq   xmm0, [ecx]
            psubq   xmm1, [ecx+16]
            psubq   xmm2, [ecx+32]
            psubq   xmm3, [ecx+48]
            movntdq  [eax], xmm0
            movntdq  [eax+16], xmm1
            movntdq  [eax+32], xmm2
            movntdq  [eax+48], xmm3
            add     eax, byte 64
            add     ecx, byte 64
            dec     edx
            jnz   AdaptSubLoop

            ;emms
AdaptDone:

endproc

;
; int  CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
;
;   [esp+12]    nOrder
;   [esp+ 8]    pB
;   [esp+ 4]    pA
;   [esp+ 0]    Return Address

            align   16

proc        CalculateDotProduct

            mov     eax, [esp +  4]             ; pA
            mov     ecx, [esp +  8]             ; pB
            mov     edx, [esp + 12]             ; nOrder
            shr     edx, 4
            pxor    mm7, mm7

loopDot:    movq    mm0, [eax]
            movq    mm1, [eax +  8]
            movq    mm2, [eax + 16]
            movq    mm3, [eax + 24]
            pmaddwd mm0, [ecx]
            pmaddwd mm1, [ecx +  8]
            pmaddwd mm2, [ecx + 16]
            pmaddwd mm3, [ecx + 24]
            paddd   mm7, mm0
            paddd   mm7, mm1
            paddd   mm7, mm2
            add     eax, byte 32
            add     ecx, byte 32
            paddd   mm7, mm3
            dec     edx
            jnz     loopDot

            movq    mm6, mm7
            psrlq   mm7, 32
            paddd   mm6, mm7
            movd    [esp + 4], mm6
            emms
            mov     eax, [esp + 4]
endproc


;
; BOOL GetSSE2Available ( void );
;

; just for now
proc        GetSSE2Available
mov eax, 1
ret
endproc

            end


Any ideas?

Farabi

Quote from: Seb on August 03, 2005, 12:56:45 AM
Quote from: Codewarp on August 02, 2005, 10:46:23 PM
Is that enough to think about?

Yeah, and the code below is what I've done so far. It doesn't crash, nor does it output any sound (well, there is a small "click" between each second).



%include "Tools.inc"

segment_code

;
; void  Adapt ( short* pM, const short* pAdapt, int nDirection, int nOrder )
;
;   [esp+16]    nOrder
;   [esp+12]    nDirection
;   [esp+ 8]    pAdapt
;   [esp+ 4]    pM
;   [esp+ 0]    Return Address

            align 16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
proc        Adapt

            mov  eax, [esp +  4]                ; pM
            mov  ecx, [esp +  8]                ; pAdapt
            mov  edx, [esp + 16]                ; nOrder
            shr  edx, 4

            cmp  dword [esp + 12], byte 0       ; nDirection
            jle  short AdaptSub

AdaptAddLoop:
            movdqa  xmm0, [eax]
            movdqa  xmm1, [eax+16]
            movdqa  xmm2, [eax+32]
            movdqa  xmm3, [eax+48]
            paddq   xmm0, [ecx]
            paddq   xmm1, [ecx+16]
            paddq   xmm2, [ecx+32]
            paddq   xmm3, [ecx+48]
            movntdq  [eax], xmm0
            movntdq  [eax+16], xmm1
            movntdq  [eax+32], xmm2
            movntdq  [eax+48], xmm3
            add     eax, byte 64
            add     ecx, byte 64
            dec     edx
            jnz     AdaptAddLoop

            ;emms
            ret

            align 16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop

AdaptSub:   je    short AdaptDone

AdaptSubLoop:
            movdqa  xmm0, [eax]
            movdqa  xmm1, [eax+16]
            movdqa  xmm2, [eax+32]
            movdqa  xmm3, [eax+48]
            psubq   xmm0, [ecx]
            psubq   xmm1, [ecx+16]
            psubq   xmm2, [ecx+32]
            psubq   xmm3, [ecx+48]
            movntdq  [eax], xmm0
            movntdq  [eax+16], xmm1
            movntdq  [eax+32], xmm2
            movntdq  [eax+48], xmm3
            add     eax, byte 64
            add     ecx, byte 64
            dec     edx
            jnz   AdaptSubLoop

            ;emms
AdaptDone:

endproc

;
; int  CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
;
;   [esp+12]    nOrder
;   [esp+ 8]    pB
;   [esp+ 4]    pA
;   [esp+ 0]    Return Address

            align   16

proc        CalculateDotProduct

            mov     eax, [esp +  4]             ; pA
            mov     ecx, [esp +  8]             ; pB
            mov     edx, [esp + 12]             ; nOrder
            shr     edx, 4
            pxor    mm7, mm7

loopDot:    movq    mm0, [eax]
            movq    mm1, [eax +  8]
            movq    mm2, [eax + 16]
            movq    mm3, [eax + 24]
            pmaddwd mm0, [ecx]
            pmaddwd mm1, [ecx +  8]
            pmaddwd mm2, [ecx + 16]
            pmaddwd mm3, [ecx + 24]
            paddd   mm7, mm0
            paddd   mm7, mm1
            paddd   mm7, mm2
            add     eax, byte 32
            add     ecx, byte 32
            paddd   mm7, mm3
            dec     edx
            jnz     loopDot

            movq    mm6, mm7
            psrlq   mm7, 32
            paddd   mm6, mm7
            movd    [esp + 4], mm6
            emms
            mov     eax, [esp + 4]
endproc


;
; BOOL GetSSE2Available ( void );
;

; just for now
proc        GetSSE2Available
mov eax, 1
ret
endproc

            end


Any ideas?

Try to execute windows explorer if you use MMX code. In my laptop it is crashed.
Those who had universe knowledges can control the world by a micro processor.
http://www.wix.com/farabio/firstpage

"Etos siperi elegi"

Seb

Quote from: Farabi on August 03, 2005, 11:41:53 AM
Try to execute windows explorer if you use MMX code. In my laptop it is crashed.

What do you mean?

Farabi

Quote from: Seb on August 03, 2005, 01:40:54 PM
Quote from: Farabi on August 03, 2005, 11:41:53 AM
Try to execute windows explorer if you use MMX code. In my laptop it is crashed.

What do you mean?

So it is not crash on your laptop? I think I made mistake on my code.


IsSupportMMX proc

   push   ebx                 
    push   ecx
    push   edx

    ; Check feature flag 23 in EDX for MMX support
    mov    eax, 1               
    cpuid                       
    mov    eax, edx           
    shr    eax, 23             
    and    eax, 1                   

    ; Restore registers
    pop    edx                 
    pop    ecx
    pop    ebx
    ret

ret
IsSupportMMX endp

EndMMX proc

    emms     ; Allow CPU to use floating point
    ret

ret
EndMMX endp


Use above code if you like.
Those who had universe knowledges can control the world by a micro processor.
http://www.wix.com/farabio/firstpage

"Etos siperi elegi"