SSE Matrix Transpose/Swizzle

johnsa · February 15, 2011, 12:08:35 PM

Hey all, based on the C++ intrinsics header, it seems the transpose macro (`_MM_TRANSPOSE4_PS`) does the following:


	; In-place 4x4 transpose of 32-bit float rows row0..row3 using the
	; same shufps network as the _MM_TRANSPOSE4_PS macro.
	; Lane lists below run from lane 0 (low) to lane 3 (high).
	; If row0..row3 are memory operands they must be 16-byte aligned
	; (movaps / shufps with a memory source fault otherwise).
	; Clobbers xmm0-xmm7.
	movaps xmm0,row0
	movaps xmm2,xmm0
	shufps xmm0,row1,01000100b	; xmm0 = row0[0] row0[1] row1[0] row1[1]
	shufps xmm2,row1,11101110b	; xmm2 = row0[2] row0[3] row1[2] row1[3]
	movaps xmm1,row2
	movaps xmm3,xmm1
	shufps xmm1,row3,01000100b	; xmm1 = row2[0] row2[1] row3[0] row3[1]
	shufps xmm3,row3,11101110b	; xmm3 = row2[2] row2[3] row3[2] row3[3]
	movaps xmm4,xmm0
	movaps xmm6,xmm4
	shufps xmm4,xmm1,10001000b	; xmm4 = column 0: row0[0] row1[0] row2[0] row3[0]
	shufps xmm6,xmm1,11011101b	; xmm6 = column 1: row0[1] row1[1] row2[1] row3[1]
	movaps xmm5,xmm2
	movaps xmm7,xmm5
	shufps xmm5,xmm3,10001000b	; xmm5 = column 2: row0[2] row1[2] row2[2] row3[2]
	shufps xmm7,xmm3,11011101b	; xmm7 = column 3: row0[3] row1[3] row2[3] row3[3]
	movaps row0,xmm4
	movaps row1,xmm6			; column 1 lives in xmm6, not xmm5
	movaps row2,xmm5			; column 2 lives in xmm5, not xmm6
	movaps row3,xmm7

I tried a variation as follows:

Code Select


; Variation: interleave row pairs with punpckhdq/punpckldq, then merge the
; 64-bit halves with shufps. Clobbers xmm0-xmm7.
; The lane diagrams list the most-significant lane first, i.e. x1 is lane 3
; (the highest dword) of row0 and x4 is lane 0.
; NOTE(review): from lane 0 upward the value stored to row0 is
; row3[0] row2[0] row1[0] row0[0]; a plain transpose would store
; row0[0] row1[0] row2[0] row3[0]. Confirm the reversed lane order is the
; layout the caller expects.
movaps xmm0,row0			; Load Vertex as | x1 | x2 | x3 | x4 |
	movaps xmm1,row1			; Load Vertex as | y1 | y2 | y3 | y4 |
	movaps xmm2,row2			; Load Vertex as | z1 | z2 | z3 | z4 |
	movaps xmm3,row3			; Load Vertex as | w1 | w2 | w3 | w4 |
	movaps xmm4,xmm1			; xmm4 = | y1 | y2 | y3 | y4 | = xmm1
	punpckhdq xmm4,xmm0			; xmm4 = | x1 | y1 | x2 | y2 |
	movaps xmm5,xmm3			; xmm5 = | w1 | w2 | w3 | w4 | = xmm3
	punpckhdq xmm5,xmm2			; xmm5 = | z1 | w1 | z2 | w2 |
	movaps xmm6,xmm5			; xmm6 = | z1 | w1 | z2 | w2 | = xmm5
	shufps xmm5,xmm4,11101110b	; xmm5 = | x1 | y1 | z1 | w1 |
	shufps xmm6,xmm4,01000100b	; xmm6 = | x2 | y2 | z2 | w2 |
	movaps xmm4,xmm1			; xmm4 = | y1 | y2 | y3 | y4 | = xmm1
	punpckldq xmm4,xmm0			; xmm4 = | x3 | y3 | x4 | y4 |
	movaps xmm7,xmm3			; xmm7 = | w1 | w2 | w3 | w4 | = xmm3
	punpckldq xmm7,xmm2			; xmm7 = | z3 | w3 | z4 | w4 |
	movaps xmm0,xmm7			; xmm0 = | z3 | w3 | z4 | w4 | = xmm7
	shufps xmm7,xmm4,11101110b	; xmm7 = | x3 | y3 | z3 | w3 |
	shufps xmm0,xmm4,01000100b	; xmm0 = | x4 | y4 | z4 | w4 |
	movaps row3,xmm5			; row3 = | x1 | y1 | z1 | w1 |
	movaps row2,xmm6			; row2 = | x2 | y2 | z2 | w2 |
	movaps row1,xmm7			; row1 = | x3 | y3 | z3 | w3 |
	movaps row0,xmm0			; row0 = | x4 | y4 | z4 | w4 |

This seems to be about 10% faster on my machines (a 2.3 GHz Core 2 Duo and a Core i7). Not sure if this is interesting for anyone, or if anyone else has any further suggestions/improvements.

drizz · February 15, 2011, 03:21:00 PM

See if my code works for you.

Code Select

; Transposes a 4x4 matrix of 32-bit elements held in registers.
; input rows in xmm0..xmm3	
; output rows in xmm0..xmm3	
; uses xmm0..xmm5
; Lane lists in the comments run from lane 0 (low) to lane 3 (high), for an
; input of xmm0 = 1 2 3 4, xmm1 = 5 6 7 8, xmm2 = 9 10 11 12, xmm3 = 13 14 15 16.
; Stage 1 interleaves dwords of row pairs; stage 2 interleaves qwords.
Matrix4x4TransposeInline macro
					;xmm1	; 5 6 7 8
	movdqa xmm4,xmm2		; 9 10 11 12
	movdqa xmm5,xmm2		; 9 10 11 12
					;xmm3	; 13 14 15 16
	punpckldq xmm4,xmm3		; 9 13 10 14
	punpckhdq xmm5,xmm3		; 11 15 12 16
	movdqa xmm2,xmm0		; 1 2 3 4
	punpckldq xmm0,xmm1		; 1 5 2 6
	punpckhdq xmm2,xmm1		; 3 7 4 8
	
	movdqa xmm1,xmm0		; 1 5 2 6
	movdqa xmm3,xmm2		; 3 7 4 8
	
	punpcklqdq xmm0,xmm4	; 1 5 9 13
	punpckhqdq xmm1,xmm4	; 2 6 10 14
	punpcklqdq xmm2,xmm5	; 3 7 11 15
	punpckhqdq xmm3,xmm5	; 4 8 12 16
endm

option prologue:none
option epilogue:none
;-----------------------------------------------------------------------
; extern "C" Matrix4x4Transpose(float *pM);
;
; Transposes, in place, the 4x4 matrix of 32-bit elements at pM.
;
; In:    pM - pointer to 16 consecutive 32-bit elements (4 rows of 16
;             bytes). Must be 16-byte aligned: all loads and stores are
;             movdqa.
;          64-bit, __linux__ defined : rdi (System V AMD64, 1st argument)
;          64-bit, otherwise         : rcx (Microsoft x64, 1st argument)
;          32-bit                    : [esp+4] (no frame is set up)
; Clobb: xmm0-xmm5 (via Matrix4x4TransposeInline); eax on the 32-bit path.
; Note:  prologue/epilogue generation is switched off above and is not
;        switched back on in this block.
;-----------------------------------------------------------------------
Matrix4x4Transpose proc c pM

	ifdef __x86_64__
		ifdef __linux__; ABIs just had to be different didn't it...
		M equ <[rdi]>		; System V passes the first pointer in rdi (not rsi)
		else
		M equ <[rcx]>
		endif
	else
		mov eax,[esp+4];pM
		M equ <[eax]>
	endif

	; load the four rows
	movdqa xmm0,M[0*16]
	movdqa xmm1,M[1*16]
	movdqa xmm2,M[2*16]
	movdqa xmm3,M[3*16]
	Matrix4x4TransposeInline
	; write the transposed rows back over the input
	movdqa M[0*16],xmm0
	movdqa M[1*16],xmm1
	movdqa M[2*16],xmm2
	movdqa M[3*16],xmm3
	
	ret
Matrix4x4Transpose endp

; compile with JWASM

johnsa · February 15, 2011, 03:45:10 PM

That works nicely, and it's another 15% faster! Overall that's 30% faster than the default _mm_transpose intrinsic and the other derivatives of it floating around. Nice one :)

chrisw · March 25, 2011, 10:40:47 AM

I myself use this one for matrix transpose:

//
//   TRANSPOSE_4x4(xmm1, xmm2, xmm3, xmm4, xmm0)
//
//   Transposes the 4x4 matrix of 32 bit data elements in xmm1 - xmm4 with each register containing a row
//   of 4 data elements. After transposing, each of xmm1-xmm4 contains a column of 4 data elements.
//
//   The macro arguments are pasted textually into the __asm statements as instruction operands, so each
//   argument must be the name of an XMM register. The parameter names only indicate the role of each
//   argument.
//
//   Element lists below are written from the highest lane down to the lowest (a1 is lane 0).
//
//   IN
//      xmm1   first row of 4 data elements      e.g.   a4 a3 a2 a1
//      xmm2   second row of 4 data elements      e.g.   b4 b3 b2 b1
//      xmm3   third row of 4 data elements      e.g.   c4 c3 c2 c1
//      xmm4   fourth row of 4 data elements      e.g.   d4 d3 d2 d1
//
//   OUT
//      xmm1   first column of 4 data elements      e.g.   d1 c1 b1 a1
//      xmm2   second column of 4 data elements   e.g.   d2 c2 b2 a2
//      xmm3   third column of 4 data elements      e.g.   d3 c3 b3 a3
//      xmm4   fourth column of 4 data elements   e.g.   d4 c4 b4 a4
//
//   USES
//      xmm0   scratch register; its previous contents are overwritten
//
//   NOTE(review): the last line of the macro body also ends with a line-continuation backslash, so the
//   line that follows the macro must remain blank or it will be absorbed into the macro.
//
#define TRANSPOSE_4x4(xmm1, xmm2, xmm3, xmm4, xmm0)                  \
__asm                                                   \
__asm      movaps      xmm0,   xmm3      /* xmm0:   c4 c3 c2 c1 */   \
__asm      punpckldq xmm3,    xmm4      /* xmm3:   d2 c2 d1 c1 */   \
__asm      punpckhdq xmm0,    xmm4      /* xmm0:   d4 c4 d3 c3 */   \
__asm                                                   \
__asm      movaps xmm4,    xmm1      /* xmm4:   a4 a3 a2 a1 */   \
__asm      punpckldq xmm1,    xmm2      /* xmm1:   b2 a2 b1 a1 */   \
__asm      punpckhdq xmm4,    xmm2      /* xmm4:   b4 a4 b3 a3 */   \
__asm                                                   \
__asm      movaps xmm2,    xmm1      /* xmm2:   b2 a2 b1 a1 */   \
__asm      punpcklqdq xmm1,    xmm3      /* xmm1:   d1 c1 b1 a1 */   \
__asm      punpckhqdq xmm2,    xmm3      /* xmm2:   d2 c2 b2 a2 */   \
__asm                                                   \
__asm      movaps xmm3,    xmm4      /* xmm3:   b4 a4 b3 a3 */   \
__asm      punpcklqdq xmm3,    xmm0      /* xmm3:   d3 c3 b3 a3 */   \
__asm      punpckhqdq xmm4,    xmm0      /* xmm4:   d4 c4 b4 a4 */   \

(Sorry for the notation; I use this as an assembly macro within my C program.)

This code keeps the data in the original registers, needs only one additional register, and takes 12 instructions, which is the lowest number I was able to find. Maybe it is useful, too :-).

regards,
Christian

News:

SSE Matrix Transpose/Swizzle

johnsa

drizz

johnsa

chrisw