Hybrid SoA vs AoS and Vertex Transforms with SSE

johnsa · August 26, 2009, 01:48:45 PM

I've been re-looking at my 3d engine pipeline again (fully software based) and timing all the various stages to optimize them. While doing this I've run
into a couple of things that I need some clarity on.

The first issue is that up till now I've stored all vertices/vectors as independent objects (ie: AOS). I was debating moving all the geometry data into
a hybrid SOA and replacing the the transforms etc with SSE versions.
So I started writing some SSE2+ code down to see what the transform would like for normal xyzw as opposed to xxxxyyyyzzzzwwww. To be honest I can't see how the
hybrid SOA is going to gain me anything (I think I might be having a dumb moment).

Lets take for example:
[xyzw] vs [x1x2x3x4 y1y2y3y4 z1z2z3z4 w1w2w3w4]

To transform a vertex/vector by a 4x4 matrix we'd have:

Code Select


| a b c d |
| e f g h |
| i j k l |
| m n o p |

x1' = x1*a + y1*b + z1*c + w1*d
y1' = x1*e + y1*f + z1*g + w1*h
z1' = x1*i + y1*j + z1*k + w1*l
w1' = x1*m + y1*n + z1*o + w1*p

(One possible optimization would be to assume that w' = m+n+o+p for a vertex and 0 for a vector).

so if we did this with SSE (AoS) we'd land up with :
a) not having to swizzle the matrix into column major form
b) 4 loads outside the loop to load the matrix rows
IE: xmm4 = (a,b,c,d) ... xmm5 = (e,f,g,h) etc..
c) then each vertex(xyzw) would be one load.. movaps xmm0,[edi] or similar..
d) we'd have four mulps (or 3 if we use the above optimization).
e) then a few haddps/shufps/blendps
f) a single store of the new vertex back

Now in theory, using Hybrid SoA we should be able to do 4 vertices/vectors at a time.. but then loading the matrix components out and broadcasting would land up being more expensive than the AoS version?

Or am I being dumb? :)

Here was a quick AoS draft function I put down assuming it loads the matrix every time and processes as opposed to a batch process which would load the matrix outside the loop:

Code Select



movaps xmm0,[esi]  ; load the vertex
movaps xmm1,[edi]      ;abcd (row centric)
movaps xmm2,[edi+16]
movaps xmm3,[edi+32]
movaps xmm4,[edi+48]

mulps xmm1,xmm0
mulps xmm2,xmm0
mulps xmm3,xmm0
mulps xmm4,xmm0

haddps xmm1,xmm1
haddps xmm1,xmm1

haddps xmm2,xmm2
haddps xmm2,xmm2

haddps xmm3,xmm3
haddps xmm3,xmm3

haddps xmm4,xmm4
haddps xmm4,xmm4

shufps xmm1,xmm2,00 00 00 00b  ; xmm1 = | y' | y' | x' | x' |
shufps xmm1,xmm3,00 00 10 00b  ; xmm1 = | z' | z' | y' | x' |
blendps xmm1,xmm4,00001000b    ; xmm1 = | w' | z' | y' | x' |	

movaps [esi],xmm1

dedndave · August 26, 2009, 02:04:33 PM

i haven't had a chance to play with this yet, but here are a few docs that look very promising for matrices using SSE

https://www.ccs.uky.edu/~thorne/cs521_sm4/Docs/General/fulltext.pdf
http://cache-www.intel.com/cd/00/00/04/06/40624_w_3d_transform.pdf
http://www-cs.ccny.cuny.edu/~gertner/Cs342Fall2007/20071107SIMD/Matrix-vector%20multiplication_4.pdf
http://www.sandpile.org/docs/intel/isa-ext.htm

near the end of that first document is 17 lines of well-commented SSE code for a matrix multiply

dedndave · August 26, 2009, 02:18:50 PM

this also looks interesting
from: http://www.cortstratton.org/articles/OptimizingForSSE.php

// BatchMultiply5 -- A modified version of BatchMultiply4 which loads
// vector components individually from memory, thereby allowing us
// to work on TWO VECTORS SIMULTANEOUSLY!
//
// Performance: 20 cycles/vector
void BatchMultiply5(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
// initializations in C++ land
Matrix4f mt(m, Matrix4f::TRANSPOSE); // work from a
float *row0 = mt.Ref();
static const int vecSize = 2 * sizeof(Vector4f);

// if there are an odd number of vectors, process the first one
// separately and advance the pointers
if (len & 0x1) {
MatrixMultiply3(mt, vin, vout);
++vin;
++vout;
}
len >>= 1; // we process two vectors at a time

__asm {
mov esi, vin
mov edi, vout
mov ecx, len

// load columns of matrix into xmm4-7
mov edx, row0
movaps xmm4, [edx]
movaps xmm5, [edx+0x10]
movaps xmm6, [edx+0x20]
movaps xmm7, [edx+0x30]

BM5_START:

// process x
movss xmm1, [esi+0x00]
movss xmm3, [esi+0x10]
shufps xmm1, xmm1, 0x00
prefetchnta [esi+0x30]
shufps xmm3, xmm3, 0x00
mulps xmm1, xmm4
prefetchnta [edi+0x30]
mulps xmm3, xmm4

// process y
movss xmm0, [esi+0x04]
movss xmm2, [esi+0x14]
shufps xmm0, xmm0, 0x00
shufps xmm2, xmm2, 0x00
mulps xmm0, xmm5
mulps xmm2, xmm5
addps xmm1, xmm0
addps xmm3, xmm2

// process z
movss xmm0, [esi+0x08]
movss xmm2, [esi+0x18]
shufps xmm0, xmm0, 0x00
shufps xmm2, xmm2, 0x00
mulps xmm0, xmm6
mulps xmm2, xmm6
addps xmm1, xmm0
addps xmm3, xmm2

// process w (hiding some pointer increments between the
// multiplies)
movss xmm0, [esi+0x0C]
movss xmm2, [esi+0x1C]
shufps xmm0, xmm0, 0x00
shufps xmm2, xmm2, 0x00
mulps xmm0, xmm7
add esi, vecSize
mulps xmm2, xmm7
add edi, vecSize
addps xmm1, xmm0
addps xmm3, xmm2

// write output vectors to memory, and loop
movaps [edi-0x20], xmm1
movaps [edi-0x10], xmm3
dec ecx
jnz BM5_START
}
}

// BatchTransform1 -- A modified version of BatchMultiply4 which makes
// an additional assumption about the vectors in vin: if each vector's
// 4th element (the homogenous coordinate w) is assumed to be 1.0 (as is
// the case for 3D vertices), we can eliminate a move, a shuffle and a
// multiply instruction.
//
// Performance: 17 cycles/vector
void BatchTransform1(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
// initializations in C++ land
Matrix4f mt(m, Matrix4f::TRANSPOSE); // work from a
float *row0 = mt.Ref();
static const int vecSize = 2 * sizeof(Vector4f);

// if there are an odd number of vectors, process the first one
// separately and advance the pointers
if (len & 0x1) {
MatrixMultiply3(mt, vin, vout);
++vin;
++vout;
}
len >>= 1; // we process two vectors at a time

__asm {
mov esi, vin
mov edi, vout
mov ecx, len

// load columns of matrix into xmm4-7
mov edx, row0
movaps xmm4, [edx]
movaps xmm5, [edx+0x10]
movaps xmm6, [edx+0x20]
movaps xmm7, [edx+0x30]

BT2_START:
// process x (hiding the prefetches in the delays)
movss xmm1, [esi+0x00]
movss xmm3, [esi+0x10]
shufps xmm1, xmm1, 0x00
prefetchnta [edi+0x30]
shufps xmm3, xmm3, 0x00
mulps xmm1, xmm4
prefetchnta [esi+0x30]
mulps xmm3, xmm4

// process y
movss xmm0, [esi+0x04]
movss xmm2, [esi+0x14]
shufps xmm0, xmm0, 0x00
shufps xmm2, xmm2, 0x00
mulps xmm0, xmm5
mulps xmm2, xmm5
addps xmm1, xmm0
addps xmm3, xmm2

// process z (hiding some pointer arithmetic between
// the multiplies)
movss xmm0, [esi+0x08]
movss xmm2, [esi+0x18]
shufps xmm0, xmm0, 0x00
shufps xmm2, xmm2, 0x00
mulps xmm0, xmm6
add esi, vecSize
mulps xmm2, xmm6
add edi, vecSize
addps xmm1, xmm0
addps xmm3, xmm2

// process w
addps xmm1, xmm7
addps xmm3, xmm7

// write output vectors to memory and loop
movaps [edi-0x20], xmm1
movaps [edi-0x10], xmm3
dec ecx
jnz BT2_START
}
}

johnsa · August 26, 2009, 02:46:15 PM

I had a quick look at those, they seem to use AoS as well.. but i see so much hype about swizzle and SoA.. but I'm honestly at a loss to see how it helps ...

NightWare · August 26, 2009, 10:57:33 PM

johnsa,
i don't know how you've made your 3D engine (imho it must be scary :toothy)
BUT, with SSE+, you're in the industrial world ! you must have a global point of view, and not see task after task...
example, IF you transpose your matrix (an extra job made only once), you will be able to use :
(or something similar, depending of your needs, here for 4 points of a 3d objetc)

Code Select

; load 4 points
		movaps XMM1,OWORD PTR [edx]						; XMM1 = X2,Z1,Y1,X1
		movaps XMM2,OWORD PTR [edx+16]					; XMM2 = Y3,X3,Z2,Y2
		movaps XMM3,OWORD PTR [edx+32]					; XMM3 = Z4,Y4,X4,Z3
; substract position of the point of view
		subps XMM1,OWORD PTR (_Workd3D_ PTR [esi])._Position_Vue_Plus_01_	; subtract pov position povX,povZ,povY,povX to X2,Z1,Y1,X1
		subps XMM2,OWORD PTR (_Workd3D_ PTR [esi])._Position_Vue_Plus_05_	; subtract pov position povY,povX,povZ,povY to Y3,X3,Z2,Y2
		subps XMM3,OWORD PTR (_Workd3D_ PTR [esi])._Position_Vue_Plus_09_	; subtract pov position povZ,povY,povX,povZ to Z4,Y4,X4,Z3
; calc point 1...
		pshufd XMM5,XMM1,0AAh							; XMM5 = Z1,Z1,Z1,Z1
		pshufd XMM4,XMM1,055h							; XMM4 = Y1,Y1,Y1,Y1
		pshufd XMM0,XMM1,000h							; XMM0 = X1,X1,X1,X1
		mulps XMM0,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line1_	; )multiply by our transposed matrix
		mulps XMM4,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line2_	; )
		mulps XMM5,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line3_	; )
		addps XMM0,XMM4								;	) add the results
		addps XMM0,XMM5								;	)
; here XMM0 = _,Z1,Y1,X1
; calc point 2...
		pshufd XMM5,XMM2,055h							; XMM3 = Z2,Z2,Z2,Z2
		pshufd XMM4,XMM2,000h							; XMM1 = Y2,Y2,Y2,Y2
		pshufd XMM1,XMM1,0FFh							; XMM1 = X2,X2,X2,X2
		mulps XMM1,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line1_	; )multiply by our transposed matrix
		mulps XMM4,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line2_	; )
		mulps XMM5,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line3_	; )
		addps XMM1,XMM4								;	) add the results
		addps XMM1,XMM5								;	)
; here XMM1 = _,Z2,Y2,X2
; calc point 3...
		pshufd XMM5,XMM3,000h							; XMM2 = Z3,Z3,Z3,Z3
		pshufd XMM4,XMM2,0AAh							; XMM4 = X3,X3,X3,X3
		pshufd XMM2,XMM2,0FFh							; XMM6 = Y3,Y3,Y3,Y3
		mulps XMM2,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line2_	; )
		mulps XMM4,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line1_	; )multiply by our transposed matrix
		mulps XMM5,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line3_	; )
		addps XMM2,XMM4								;	) add the results
		addps XMM2,XMM5								;	)
; here XMM2 = _,Z3,Y3,X3
; calc point 4...
		pshufd XMM5,XMM3,0AAh							; XMM5 = Y4,Y4,Y4,Y4
		pshufd XMM4,XMM3,055h							; XMM3 = X4,X4,X4,X4
		pshufd XMM3,XMM3,0FFh							; XMM7 = Z4,Z4,Z4,Z4
		mulps XMM3,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line3_	; )
		mulps XMM4,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line1_	; )multiply by our transposed matrix
		mulps XMM5,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line2_	; )
		addps XMM3,XMM4								;	) add the results
		addps XMM3,XMM5								;	)
; here XMM3 = _,Z4,Y4,X4

johnsa & dedndave, IF you look how the net idiots do their stuff, you will be influenced and will not go very far... just my 2 cts....

dedndave · August 27, 2009, 02:07:54 AM

we have to start someplace, NightWare - lol
but, i have always been a big fan of Hank Ford

News:

Hybrid SoA vs AoS and Vertex Transforms with SSE

johnsa

dedndave

dedndave

johnsa

NightWare

dedndave