News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

SSE Matrix Transpose/Swizzle

Started by johnsa, February 15, 2011, 12:08:35 PM

Previous topic - Next topic

johnsa

Hey all, based on the C++ intrinsics header, the _MM_TRANSPOSE4_PS macro seems to boil down to the following:


; 4x4 single-precision transpose built from shufps only.
; row0..row3 name the four matrix rows; they are defined outside this snippet
; and are assumed to be 16-byte aligned (movaps, and shufps with a memory
; operand, fault on unaligned addresses) - TODO confirm at the definition site.
; Lane comments list elements low -> high; a,b,c,d = row0,row1,row2,row3.
; Uses xmm0..xmm7.
movaps xmm0,row0                ; a0 a1 a2 a3
movaps xmm2,xmm0
shufps xmm0,row1,01000100b      ; a0 a1 b0 b1
shufps xmm2,row1,11101110b      ; a2 a3 b2 b3
movaps xmm1,row2                ; c0 c1 c2 c3
movaps xmm3,xmm1
shufps xmm1,row3,01000100b      ; c0 c1 d0 d1
shufps xmm3,row3,11101110b      ; c2 c3 d2 d3
movaps xmm4,xmm0
movaps xmm6,xmm4
shufps xmm4,xmm1,10001000b      ; a0 b0 c0 d0 = column 0
shufps xmm6,xmm1,11011101b      ; a1 b1 c1 d1 = column 1
movaps xmm5,xmm2
movaps xmm7,xmm5
shufps xmm5,xmm3,10001000b      ; a2 b2 c2 d2 = column 2
shufps xmm7,xmm3,11011101b      ; a3 b3 c3 d3 = column 3
; Store column k into row k. Column 1 lives in xmm6 and column 2 in xmm5,
; so the register numbers are deliberately not in sequence here.
movaps row0,xmm4
movaps row1,xmm6
movaps row2,xmm5
movaps row3,xmm7


I tried a variation as follows:


; 4x4 transpose using dword unpacks for the 2x2 interleave and shufps to
; combine the halves.
; row0..row3 are defined outside this snippet and are assumed to be 16-byte
; aligned (movaps requires it) - TODO confirm at the definition site.
; Lane comments list elements low -> high; a,b,c,d = row0,row1,row2,row3.
; Operand order matters: the earlier row must be the unpack destination and
; the (a,b) pair must be the shufps destination, otherwise every output row
; comes out element-reversed (d c b a) instead of transposed.
; Uses xmm0..xmm7.
movaps xmm0,row0                ; a0 a1 a2 a3
movaps xmm1,row1                ; b0 b1 b2 b3
movaps xmm2,row2                ; c0 c1 c2 c3
movaps xmm3,row3                ; d0 d1 d2 d3
movaps xmm4,xmm0
punpckldq xmm4,xmm1             ; a0 b0 a1 b1
movaps xmm5,xmm2
punpckldq xmm5,xmm3             ; c0 d0 c1 d1
movaps xmm6,xmm4
shufps xmm4,xmm5,01000100b      ; a0 b0 c0 d0 = column 0
shufps xmm6,xmm5,11101110b      ; a1 b1 c1 d1 = column 1
punpckhdq xmm0,xmm1             ; a2 b2 a3 b3 (row a no longer needed)
punpckhdq xmm2,xmm3             ; c2 d2 c3 d3 (row c no longer needed)
movaps xmm7,xmm0
shufps xmm0,xmm2,01000100b      ; a2 b2 c2 d2 = column 2
shufps xmm7,xmm2,11101110b      ; a3 b3 c3 d3 = column 3
movaps row0,xmm4
movaps row1,xmm6
movaps row2,xmm0
movaps row3,xmm7


This seems to be about 10% faster on my machines (a 2.3 GHz Core 2 Duo and a Core i7). I'm not sure whether this is interesting to anyone, or whether anyone else has further suggestions or improvements.

drizz

See if my code works for you.
; Matrix4x4TransposeInline - transpose a 4x4 matrix of dwords held in registers.
;   in:      xmm0..xmm3 = rows 0..3
;   out:     xmm0..xmm3 = rows 0..3 of the transposed matrix
;   scratch: xmm4, xmm5
; Lane comments list dwords low -> high for the sample input
;   xmm0 = 1 2 3 4   xmm1 = 5 6 7 8   xmm2 = 9 10 11 12   xmm3 = 13 14 15 16
Matrix4x4TransposeInline macro
    ; Row 2 is needed twice, and xmm2 is about to be reused for row 0.
    movdqa      xmm4,xmm2           ; 9 10 11 12
    movdqa      xmm5,xmm2           ; 9 10 11 12
    movdqa      xmm2,xmm0           ; 1 2 3 4

    ; Step 1: interleave dwords of each row pair (0,1) and (2,3).
    punpckldq   xmm0,xmm1           ; 1 5 2 6
    punpckhdq   xmm2,xmm1           ; 3 7 4 8
    punpckldq   xmm4,xmm3           ; 9 13 10 14
    punpckhdq   xmm5,xmm3           ; 11 15 12 16

    ; Step 2: interleave qwords to assemble the four output rows.
    movdqa      xmm1,xmm0           ; 1 5 2 6
    punpcklqdq  xmm0,xmm4           ; 1 5 9 13
    punpckhqdq  xmm1,xmm4           ; 2 6 10 14
    movdqa      xmm3,xmm2           ; 3 7 4 8
    punpcklqdq  xmm2,xmm5           ; 3 7 11 15
    punpckhqdq  xmm3,xmm5           ; 4 8 12 16
endm

option prologue:none
option epilogue:none
;-----------------------------------------------------------------------
; extern "C" Matrix4x4Transpose(float *pM);
;
; Transposes in place the 4x4 matrix of 32-bit elements at pM: loads the
; four 16-byte rows, runs Matrix4x4TransposeInline, stores them back.
;
; In:     pM - pointer to 64 bytes; must be 16-byte aligned because every
;         access uses movdqa.
; Clobb:  xmm0..xmm5 (and eax in the 32-bit build).
;
; No prologue/epilogue is generated, so the argument is read straight from
; where the calling convention puts it:
;   x86-64, System V (Linux):  rdi
;   x86-64, otherwise (Win64): rcx
;   32-bit cdecl:              [esp+4]
; __x86_64__ and __linux__ are expected to be defined by the build.
;-----------------------------------------------------------------------
Matrix4x4Transpose proc c pM

ifdef __x86_64__
ifdef __linux__
M equ <[rdi]>   ; System V AMD64: first integer/pointer argument is in rdi
else
M equ <[rcx]>   ; Microsoft x64: first integer/pointer argument is in rcx
endif
else
mov eax,[esp+4] ; pM; no frame is set up, so [esp] is the return address
M equ <[eax]>
endif


movdqa xmm0,M[0*16]
movdqa xmm1,M[1*16]
movdqa xmm2,M[2*16]
movdqa xmm3,M[3*16]
Matrix4x4TransposeInline
movdqa M[0*16],xmm0
movdqa M[1*16],xmm1
movdqa M[2*16],xmm2
movdqa M[3*16],xmm3

ret
Matrix4x4Transpose endp

; compile with JWASM
The truth cannot be learned ... it can only be recognized.

johnsa

That works nicely! And it's another 15% faster; overall that's 30% faster than the default _mm_transpose intrinsic and the other derivatives of it floating around. Nice one :)

chrisw

I myself use this one for matrix transpose:

//
//   TRANSPOSE_4x4(r1, r2, r3, r4, tmp)
//
//   Transposes the 4x4 matrix of 32 bit data elements in r1 - r4 with each register containing a row
//   of 4 data elements. After transposing each of r1 - r4 contains a column of 4 data elements.
//   Arguments are XMM register names; all five must be distinct registers.
//   Elements are listed from the highest lane down to the lowest (a1 is the low dword).
//
//   IN
//      r1     first row of 4 data elements         e.g.   a4 a3 a2 a1
//      r2     second row of 4 data elements        e.g.   b4 b3 b2 b1
//      r3     third row of 4 data elements         e.g.   c4 c3 c2 c1
//      r4     fourth row of 4 data elements        e.g.   d4 d3 d2 d1
//
//   OUT
//      r1     first column of 4 data elements      e.g.   d1 c1 b1 a1
//      r2     second column of 4 data elements     e.g.   d2 c2 b2 a2
//      r3     third column of 4 data elements      e.g.   d3 c3 b3 a3
//      r4     fourth column of 4 data elements     e.g.   d4 c4 b4 a4
//
//   USES
//      tmp    scratch register, overwritten
//
//   The expansion is a single braced __asm block, so the macro behaves as one
//   statement (e.g. as the body of an unbraced if). Each instruction carries its
//   own __asm keyword because the whole expansion ends up on one logical line;
//   for the same reason only /* */ comments are used inside the macro.
//
#define TRANSPOSE_4x4(r1, r2, r3, r4, tmp)                                   \
__asm {                                                                      \
__asm      movaps      tmp,    r3        /* tmp:  c4 c3 c2 c1 */             \
__asm      punpckldq   r3,     r4        /* r3:   d2 c2 d1 c1 */             \
__asm      punpckhdq   tmp,    r4        /* tmp:  d4 c4 d3 c3 */             \
                                                                             \
__asm      movaps      r4,     r1        /* r4:   a4 a3 a2 a1 */             \
__asm      punpckldq   r1,     r2        /* r1:   b2 a2 b1 a1 */             \
__asm      punpckhdq   r4,     r2        /* r4:   b4 a4 b3 a3 */             \
                                                                             \
__asm      movaps      r2,     r1        /* r2:   b2 a2 b1 a1 */             \
__asm      punpcklqdq  r1,     r3        /* r1:   d1 c1 b1 a1 */             \
__asm      punpckhqdq  r2,     r3        /* r2:   d2 c2 b2 a2 */             \
                                                                             \
__asm      movaps      r3,     r4        /* r3:   b4 a4 b3 a3 */             \
__asm      punpcklqdq  r3,     tmp       /* r3:   d3 c3 b3 a3 */             \
__asm      punpckhqdq  r4,     tmp       /* r4:   d4 c4 b4 a4 */             \
}



(Sorry for the notation; I use this as an assembly macro within my C program.)

This code keeps the data in the original registers, needs only one additional register, and takes 12 instructions, which is the lowest count I was able to find. Maybe it is useful, too :-).

regards,
Christian