I've been taking another look at my 3D engine pipeline (fully software based) and timing all the various stages to optimize them. While doing this I've run into a couple of things that I need some clarity on.
The first issue is that up till now I've stored all vertices/vectors as independent objects (i.e. AoS). I was debating moving all the geometry data into a hybrid SoA layout and replacing the transforms etc. with SSE versions.
So I started writing some SSE2+ code to see what the transform would look like for normal xyzw as opposed to xxxxyyyyzzzzwwww. To be honest I can't see how the hybrid SoA is going to gain me anything (I think I might be having a dumb moment).
Let's take for example:
[xyzw] vs [x1x2x3x4 y1y2y3y4 z1z2z3z4 w1w2w3w4]
To transform a vertex/vector by a 4x4 matrix we'd have:
| a b c d |
| e f g h |
| i j k l |
| m n o p |
x1' = x1*a + y1*b + z1*c + w1*d
y1' = x1*e + y1*f + z1*g + w1*h
z1' = x1*i + y1*j + z1*k + w1*l
w1' = x1*m + y1*n + z1*o + w1*p
(One possible optimization: since w = 1 for a vertex and 0 for a vector, the multiply by w can be dropped; and if the matrix is affine, with bottom row (0,0,0,1), then w' is simply 1 for a vertex and 0 for a vector, so the fourth dot product can be skipped entirely.)
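For reference, here is a plain scalar version of the transform above with the w shortcut written out as an optional flag (this is just my own sketch, not code from the engine); the SSE variants further down should produce the same results:

// Row-major 4x4 matrix: m[0] = (a b c d), m[1] = (e f g h), m[2] = (i j k l), m[3] = (m n o p).
// If 'affine' is true the bottom row is assumed to be (0,0,0,1), so w' is just the
// incoming w (1 for a vertex, 0 for a direction vector) and the last dot product is skipped.
void transform_scalar(const float m[4][4], const float in[4], float out[4], bool affine)
{
    out[0] = in[0]*m[0][0] + in[1]*m[0][1] + in[2]*m[0][2] + in[3]*m[0][3]; // x'
    out[1] = in[0]*m[1][0] + in[1]*m[1][1] + in[2]*m[1][2] + in[3]*m[1][3]; // y'
    out[2] = in[0]*m[2][0] + in[1]*m[2][1] + in[2]*m[2][2] + in[3]*m[2][3]; // z'
    out[3] = affine ? in[3]
                    : in[0]*m[3][0] + in[1]*m[3][1] + in[2]*m[3][2] + in[3]*m[3][3]; // w'
}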
So if we did this with SSE (AoS) we'd land up with:
a) not having to swizzle the matrix into column major form
b) 4 loads outside the loop to load the matrix rows
IE: xmm4 = (a,b,c,d) ... xmm5 = (e,f,g,h) etc..
c) then each vertex(xyzw) would be one load.. movaps xmm0,[edi] or similar..
d) we'd have four mulps (or 3 if we use the above optimization).
e) then a few haddps/shufps/blendps
f) a single store of the new vertex back
Now in theory, using hybrid SoA we should be able to do 4 vertices/vectors at a time... but then pulling the matrix components out and broadcasting them would land up being more expensive than the AoS version?
Or am I being dumb? :)
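For comparison, here is a minimal sketch of how the hybrid SoA (xxxxyyyyzzzzwwww) loop usually ends up looking, written with SSE intrinsics and a made-up SoA4 block type (my own assumptions, not your engine's layout). The key point is that the sixteen matrix broadcasts are hoisted outside the loop, so they are paid once per matrix rather than once per vertex; inside the loop each block of four vertices costs only vertical mul/adds, with no shuffles or horizontal adds at all:

#include <xmmintrin.h>

// Hypothetical block of 4 vertices: x1x2x3x4 y1y2y3y4 z1z2z3z4 w1w2w3w4, 16-byte aligned.
struct SoA4 { float x[4], y[4], z[4], w[4]; };

// m is the row-major matrix from the example above: m[0] = (a b c d), etc.
void transform_soa(const float m[4][4], const SoA4* in, SoA4* out, int blocks)
{
    // Broadcast all 16 matrix elements once, outside the loop.
    __m128 m00 = _mm_set1_ps(m[0][0]), m01 = _mm_set1_ps(m[0][1]), m02 = _mm_set1_ps(m[0][2]), m03 = _mm_set1_ps(m[0][3]);
    __m128 m10 = _mm_set1_ps(m[1][0]), m11 = _mm_set1_ps(m[1][1]), m12 = _mm_set1_ps(m[1][2]), m13 = _mm_set1_ps(m[1][3]);
    __m128 m20 = _mm_set1_ps(m[2][0]), m21 = _mm_set1_ps(m[2][1]), m22 = _mm_set1_ps(m[2][2]), m23 = _mm_set1_ps(m[2][3]);
    __m128 m30 = _mm_set1_ps(m[3][0]), m31 = _mm_set1_ps(m[3][1]), m32 = _mm_set1_ps(m[3][2]), m33 = _mm_set1_ps(m[3][3]);

    for (int idx = 0; idx < blocks; ++idx) {
        __m128 x = _mm_load_ps(in[idx].x);   // x1 x2 x3 x4
        __m128 y = _mm_load_ps(in[idx].y);   // y1 y2 y3 y4
        __m128 z = _mm_load_ps(in[idx].z);   // z1 z2 z3 z4
        __m128 w = _mm_load_ps(in[idx].w);   // w1 w2 w3 w4

        // Four vertices per iteration: x' = x*a + y*b + z*c + w*d, etc., all vertical.
        _mm_store_ps(out[idx].x, _mm_add_ps(_mm_add_ps(_mm_mul_ps(x, m00), _mm_mul_ps(y, m01)),
                                            _mm_add_ps(_mm_mul_ps(z, m02), _mm_mul_ps(w, m03))));
        _mm_store_ps(out[idx].y, _mm_add_ps(_mm_add_ps(_mm_mul_ps(x, m10), _mm_mul_ps(y, m11)),
                                            _mm_add_ps(_mm_mul_ps(z, m12), _mm_mul_ps(w, m13))));
        _mm_store_ps(out[idx].z, _mm_add_ps(_mm_add_ps(_mm_mul_ps(x, m20), _mm_mul_ps(y, m21)),
                                            _mm_add_ps(_mm_mul_ps(z, m22), _mm_mul_ps(w, m23))));
        _mm_store_ps(out[idx].w, _mm_add_ps(_mm_add_ps(_mm_mul_ps(x, m30), _mm_mul_ps(y, m31)),
                                            _mm_add_ps(_mm_mul_ps(z, m32), _mm_mul_ps(w, m33))));
    }
}

Whether this actually beats the AoS version then comes down to how much the per-vertex haddps/shuffle work costs versus the bookkeeping of the SoA layout; the broadcasts themselves are no longer in the per-vertex path (and on 32-bit x86, with only eight XMM registers, some of them will spill).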
Here's a quick AoS draft I put down; it assumes the matrix is loaded for every vertex processed, as opposed to a batch version which would load the matrix once outside the loop:
movaps xmm0,[esi] ; load the vertex
movaps xmm1,[edi] ;abcd (row centric)
movaps xmm2,[edi+16]
movaps xmm3,[edi+32]
movaps xmm4,[edi+48]
mulps xmm1,xmm0 ; row1 * vertex
mulps xmm2,xmm0 ; row2 * vertex
mulps xmm3,xmm0 ; row3 * vertex
mulps xmm4,xmm0 ; row4 * vertex
haddps xmm1,xmm1 ; two horizontal adds: every lane of xmm1 = x'
haddps xmm1,xmm1
haddps xmm2,xmm2 ; every lane of xmm2 = y'
haddps xmm2,xmm2
haddps xmm3,xmm3 ; every lane of xmm3 = z'
haddps xmm3,xmm3
haddps xmm4,xmm4 ; every lane of xmm4 = w'
haddps xmm4,xmm4
shufps xmm1,xmm2,00000000b ; xmm1 = | y' | y' | x' | x' |
shufps xmm1,xmm3,00001000b ; xmm1 = | z' | z' | y' | x' |
blendps xmm1,xmm4,00001000b ; xmm1 = | w' | z' | y' | x' |
movaps [esi],xmm1 ; store the transformed vertex
I haven't had a chance to play with this yet, but here are a few docs that look very promising for matrices using SSE:
https://www.ccs.uky.edu/~thorne/cs521_sm4/Docs/General/fulltext.pdf
http://cache-www.intel.com/cd/00/00/04/06/40624_w_3d_transform.pdf
http://www-cs.ccny.cuny.edu/~gertner/Cs342Fall2007/20071107SIMD/Matrix-vector%20multiplication_4.pdf
http://www.sandpile.org/docs/intel/isa-ext.htm
Near the end of that first document there are 17 lines of well-commented SSE code for a matrix multiply.
This also looks interesting,
from: http://www.cortstratton.org/articles/OptimizingForSSE.php
// BatchMultiply5 -- A modified version of BatchMultiply4 which loads
// vector components individually from memory, thereby allowing us
// to work on TWO VECTORS SIMULTANEOUSLY!
//
// Performance: 20 cycles/vector
void BatchMultiply5(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
// initializations in C++ land
Matrix4f mt(m, Matrix4f::TRANSPOSE); // work from a transposed copy of m
float *row0 = mt.Ref();
static const int vecSize = 2 * sizeof(Vector4f);
// if there are an odd number of vectors, process the first one
// separately and advance the pointers
if (len & 0x1) {
MatrixMultiply3(mt, vin, vout);
++vin;
++vout;
}
len >>= 1; // we process two vectors at a time
__asm {
mov esi, vin
mov edi, vout
mov ecx, len
// load columns of matrix into xmm4-7
mov edx, row0
movaps xmm4, [edx]
movaps xmm5, [edx+0x10]
movaps xmm6, [edx+0x20]
movaps xmm7, [edx+0x30]
BM5_START:
// process x
movss xmm1, [esi+0x00]
movss xmm3, [esi+0x10]
shufps xmm1, xmm1, 0x00
prefetchnta [esi+0x30]
shufps xmm3, xmm3, 0x00
mulps xmm1, xmm4
prefetchnta [edi+0x30]
mulps xmm3, xmm4
// process y
movss xmm0, [esi+0x04]
movss xmm2, [esi+0x14]
shufps xmm0, xmm0, 0x00
shufps xmm2, xmm2, 0x00
mulps xmm0, xmm5
mulps xmm2, xmm5
addps xmm1, xmm0
addps xmm3, xmm2
// process z
movss xmm0, [esi+0x08]
movss xmm2, [esi+0x18]
shufps xmm0, xmm0, 0x00
shufps xmm2, xmm2, 0x00
mulps xmm0, xmm6
mulps xmm2, xmm6
addps xmm1, xmm0
addps xmm3, xmm2
// process w (hiding some pointer increments between the
// multiplies)
movss xmm0, [esi+0x0C]
movss xmm2, [esi+0x1C]
shufps xmm0, xmm0, 0x00
shufps xmm2, xmm2, 0x00
mulps xmm0, xmm7
add esi, vecSize
mulps xmm2, xmm7
add edi, vecSize
addps xmm1, xmm0
addps xmm3, xmm2
// write output vectors to memory, and loop
movaps [edi-0x20], xmm1
movaps [edi-0x10], xmm3
dec ecx
jnz BM5_START
}
}
// BatchTransform1 -- A modified version of BatchMultiply4 which makes
// an additional assumption about the vectors in vin: if each vector's
// 4th element (the homogenous coordinate w) is assumed to be 1.0 (as is
// the case for 3D vertices), we can eliminate a move, a shuffle and a
// multiply instruction.
//
// Performance: 17 cycles/vector
void BatchTransform1(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
// initializations in C++ land
Matrix4f mt(m, Matrix4f::TRANSPOSE); // work from a transposed copy of m
float *row0 = mt.Ref();
static const int vecSize = 2 * sizeof(Vector4f);
// if there are an odd number of vectors, process the first one
// separately and advance the pointers
if (len & 0x1) {
MatrixMultiply3(mt, vin, vout);
++vin;
++vout;
}
len >>= 1; // we process two vectors at a time
__asm {
mov esi, vin
mov edi, vout
mov ecx, len
// load columns of matrix into xmm4-7
mov edx, row0
movaps xmm4, [edx]
movaps xmm5, [edx+0x10]
movaps xmm6, [edx+0x20]
movaps xmm7, [edx+0x30]
BT2_START:
// process x (hiding the prefetches in the delays)
movss xmm1, [esi+0x00]
movss xmm3, [esi+0x10]
shufps xmm1, xmm1, 0x00
prefetchnta [edi+0x30]
shufps xmm3, xmm3, 0x00
mulps xmm1, xmm4
prefetchnta [esi+0x30]
mulps xmm3, xmm4
// process y
movss xmm0, [esi+0x04]
movss xmm2, [esi+0x14]
shufps xmm0, xmm0, 0x00
shufps xmm2, xmm2, 0x00
mulps xmm0, xmm5
mulps xmm2, xmm5
addps xmm1, xmm0
addps xmm3, xmm2
// process z (hiding some pointer arithmetic between
// the multiplies)
movss xmm0, [esi+0x08]
movss xmm2, [esi+0x18]
shufps xmm0, xmm0, 0x00
shufps xmm2, xmm2, 0x00
mulps xmm0, xmm6
add esi, vecSize
mulps xmm2, xmm6
add edi, vecSize
addps xmm1, xmm0
addps xmm3, xmm2
// process w
addps xmm1, xmm7
addps xmm3, xmm7
// write output vectors to memory and loop
movaps [edi-0x20], xmm1
movaps [edi-0x10], xmm3
dec ecx
jnz BT2_START
}
}
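Worth noting: both of those batch routines avoid haddps entirely by working from the transposed matrix. Each vertex component is splatted across a register and multiplied by a whole matrix column, and the partial results are summed vertically. A minimal intrinsics rendering of that per-vertex pattern (my own sketch; the column layout and names are assumptions, not from the article):

#include <xmmintrin.h>

// col[i] holds column i of the original row-major matrix (i.e. row i of the
// transposed matrix), 16-byte aligned; v points at one xyzw vertex.
static inline __m128 transform_broadcast(const __m128 col[4], const float* v)
{
    __m128 x = _mm_set1_ps(v[0]);             // splat x
    __m128 y = _mm_set1_ps(v[1]);             // splat y
    __m128 z = _mm_set1_ps(v[2]);             // splat z
    __m128 w = _mm_set1_ps(v[3]);             // splat w
    __m128 r = _mm_mul_ps(x, col[0]);         // x * (a,e,i,m)
    r = _mm_add_ps(r, _mm_mul_ps(y, col[1])); // + y * (b,f,j,n)
    r = _mm_add_ps(r, _mm_mul_ps(z, col[2])); // + z * (c,g,k,o)
    r = _mm_add_ps(r, _mm_mul_ps(w, col[3])); // + w * (d,h,l,p)
    return r;                                 // = (x', y', z', w')
}

With w known to be 1 (the BatchTransform1 case) the last multiply becomes a plain add of col[3], which is exactly the addps xmm7 trick in the code above.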
I had a quick look at those; they seem to use AoS as well. I see so much hype about swizzling and SoA, but I'm honestly at a loss to see how it helps...
johnsa,
I don't know how you've made your 3D engine (imho it must be scary :toothy)
BUT, with SSE+, you're in the industrial world! You have to take a global view of the whole pipeline rather than looking at one task at a time...
For example, IF you transpose your matrix (an extra job done only once), you will be able to use this
(or something similar, depending on your needs; here it's for 4 points of a 3D object):
; load 4 points
movaps XMM1,OWORD PTR [edx] ; XMM1 = X2,Z1,Y1,X1
movaps XMM2,OWORD PTR [edx+16] ; XMM2 = Y3,X3,Z2,Y2
movaps XMM3,OWORD PTR [edx+32] ; XMM3 = Z4,Y4,X4,Z3
; subtract the position of the point of view
subps XMM1,OWORD PTR (_Workd3D_ PTR [esi])._Position_Vue_Plus_01_ ; subtract pov position povX,povZ,povY,povX to X2,Z1,Y1,X1
subps XMM2,OWORD PTR (_Workd3D_ PTR [esi])._Position_Vue_Plus_05_ ; subtract pov position povY,povX,povZ,povY to Y3,X3,Z2,Y2
subps XMM3,OWORD PTR (_Workd3D_ PTR [esi])._Position_Vue_Plus_09_ ; subtract pov position povZ,povY,povX,povZ to Z4,Y4,X4,Z3
; calc point 1...
pshufd XMM5,XMM1,0AAh ; XMM5 = Z1,Z1,Z1,Z1
pshufd XMM4,XMM1,055h ; XMM4 = Y1,Y1,Y1,Y1
pshufd XMM0,XMM1,000h ; XMM0 = X1,X1,X1,X1
mulps XMM0,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line1_ ; )multiply by our transposed matrix
mulps XMM4,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line2_ ; )
mulps XMM5,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line3_ ; )
addps XMM0,XMM4 ; ) add the results
addps XMM0,XMM5 ; )
; here XMM0 = _,Z1,Y1,X1
; calc point 2...
pshufd XMM5,XMM2,055h ; XMM5 = Z2,Z2,Z2,Z2
pshufd XMM4,XMM2,000h ; XMM4 = Y2,Y2,Y2,Y2
pshufd XMM1,XMM1,0FFh ; XMM1 = X2,X2,X2,X2
mulps XMM1,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line1_ ; )multiply by our transposed matrix
mulps XMM4,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line2_ ; )
mulps XMM5,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line3_ ; )
addps XMM1,XMM4 ; ) add the results
addps XMM1,XMM5 ; )
; here XMM1 = _,Z2,Y2,X2
; calc point 3...
pshufd XMM5,XMM3,000h ; XMM5 = Z3,Z3,Z3,Z3
pshufd XMM4,XMM2,0AAh ; XMM4 = X3,X3,X3,X3
pshufd XMM2,XMM2,0FFh ; XMM2 = Y3,Y3,Y3,Y3
mulps XMM2,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line2_ ; )
mulps XMM4,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line1_ ; )multiply by our transposed matrix
mulps XMM5,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line3_ ; )
addps XMM2,XMM4 ; ) add the results
addps XMM2,XMM5 ; )
; here XMM2 = _,Z3,Y3,X3
; calc point 4...
pshufd XMM5,XMM3,0AAh ; XMM5 = Y4,Y4,Y4,Y4
pshufd XMM4,XMM3,055h ; XMM4 = X4,X4,X4,X4
pshufd XMM3,XMM3,0FFh ; XMM3 = Z4,Z4,Z4,Z4
mulps XMM3,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line3_ ; )
mulps XMM4,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line1_ ; )multiply by our transposed matrix
mulps XMM5,OWORD PTR (_Object3D_ PTR [edi])._Matrix_._Line2_ ; )
addps XMM3,XMM4 ; ) add the results
addps XMM3,XMM5 ; )
; here XMM3 = _,Z4,Y4,X4
johnsa & dedndave, IF you look at how the net idiots do their stuff, you will be influenced and will not go very far... just my 2 cents...
we have to start someplace, NightWare - lol
but I have always been a big fan of Hank Ford