So in my quest for optimizations, comparison timings etc and while working on a vector math library I decided to throw in
a vector-normalize routine using FPU. Below is the outcome of that after a few variations with which this seemed to perform the
best.
align 16
Vector3D_Normalize_FPU PROC ptrVR:DWORD, ptrV1:DWORD
mov esi,ptrV1
mov edi,ptrVR
fld dword ptr [esi]
fmul st,st(0)
fld dword ptr [esi+4]
fmul st,st(0)
faddp st(1),st
fld dword ptr [esi+8]
fmul st,st(0)
faddp st(1),st
fsqrt
fld dword ptr [esi]
fdiv st,st(1)
fstp dword ptr [edi]
fld dword ptr [esi+4]
fdiv st,st(1)
fstp dword ptr [edi+4]
fld dword ptr [esi+8]
fdiv st,st(1)
fstp dword ptr [edi+8]
ret
Vector3D_Normalize_FPU ENDP
So a testpiece with masm using both GetTickCount and Timers.asm as supplied on this forum running 1,000,000 iterations
provide a result of 650ms.
So I decide to whip up a quick benchmark in Visual Studio 2008 C# which runs 1,000,000 iterations as well and timed
with Environment.TickCount (corresponds to GetTickCount) / StopWatch Object(more precise).
It's code is very simple and looks like:
public static vector Normalize(vector in)
{
vector result = new vector();
float veclength = (float)Math.Sqrt( (in.x*in.x) + (in.y*in.y) + (in.z*in.z) );
result.x = in.x / veclength;
result.y = in.y / veclength;
result.z = in.z / veclength;
return(result);
}
.
.
.
for (int i=0;i<1000000;i++)
{
vecB = Normalize(vecA);
}
I checked the disassembly and the fpu/asm code is very similar to my function, with a bit of extra waffle as expected...
Timing result for 1 million iterations: 150ms.
What the hell??
How can the debug C# code be 5 times faster than the asm function??
I checked and all loops, proc and variables are aligned.. I also double checked to make sure the C# method wasnt
automatically being inlined..(which it's not)..
What gives?
The only possible explanation I have is that C#'s timers are lying... or else my asm code is honestly 5 times slower...
Ok... so to make sure the timers aren't lying I increased the iterations to 100 million... so that it was in a range that i could verify by manually counting.. and true enough.. the C# ran in 14.8seconds and the ASM testpiece ran in 67 seconds..... what on earth? I also confirmed by looking at the disassembly that C#.NET uses a straight fsqrt and doesn't have any custom (faster) sqrt implementation.
John,
You should be able to use the direct API in C++ and I would be inclined to use an identical timing method on both to make sure you are benchmarking the same thing. A comment from another member recently was if you don't need the higher level of precision that you may get a substantial speed increase using SSE2/3/4 instead.
johnsa,
i've just recoded this one 2 days ago so it must be tested, but it seems fast :
movaps XMM0,OWORD PTR [esi] ;; XMM0 = W,Z,Y,X
movaps XMM2,XMM0 ;; XMM2 = W,Z,Y,X
mulps XMM0,XMM0 ;; XMM0 = W^2,Z^2,Y^2,X^2
pshufd XMM1,XMM0,04Eh ;; XMM1 = Y^2,X^2,W^2,Z^2
addps XMM1,XMM0 ;; XMM1 = W^2+Y^2,Z^2+X^2,Y^2+W^2,X^2+Z^2
pshufd XMM0,XMM1,0B1h ;; XMM0 = Z^2+X^2,W^2+Y^2,X^2+Z^2,Y^2+W^2
addps XMM0,XMM1 ;; XMM0 = Z^2+X^2+W^2+Y^2,W^2+Y^2+Z^2+X^2,X^2+Z^2+Y^2+W^2,Y^2+W^2+X^2+Z^2
rsqrtps XMM0,XMM0 ;; XMM0 = 1/(X^2+Y^2+Z^2+W^2),1/(X^2+Y^2+Z^2+W^2),1/(X^2+Y^2+Z^2+W^2),1/(X^2+Y^2+Z^2+W^2)
mulps XMM0,XMM2 ;; XMM0 = W*1/(X^2+Y^2+Z^2+W^2),Z*1/(X^2+Y^2+Z^2+W^2),Y*1/(X^2+Y^2+Z^2+W^2),X*1/(X^2+Y^2+Z^2+W^2)
movaps OWORD PTR [edi],XMM0 ;; XMM0 = W/(X^2+Y^2+Z^2+W^2),Z/(X^2+Y^2+Z^2+W^2),Y/(X^2+Y^2+Z^2+W^2),X/(X^2+Y^2+Z^2+W^2)
Using shufps should be faster than pshufd, because the CPU doesn't need to switch SSE register modes. Also, using the following instead of shufps,addps,shufps,addps might be even faster, even though haddps has higher latency and throughput than most SSE instructions:
haddps xmm0,xmm0
haddps xmm0,xmm0
John,
I don't have Visual Studio 2008 C# so I did my comparisons with the Microsoft Visual C++ Toolkit 2003, using this code:
#include <windows.h>
// The pragma optimize statements are to prevent the compiler
// from optimizing the loop, eliminating the redundant calls by
// placing a copy of the code inline and executing it once.
typedef struct _vector {
float x,y,z;
}vector;
#pragma optimize( "t", on )
vector normalize( vector in )
{
vector result;
float veclength = (float)sqrt( (in.x*in.x) + (in.y*in.y) + (in.z*in.z) );
result.x = in.x / veclength;
result.y = in.y / veclength;
result.z = in.z / veclength;
return(result);
}
#pragma optimize( "", off )
int main(void)
{
vector r,v={1,2,3};
int i,t1,t2;
t1=GetTickCount();
for (i=0 ; i<1000000 ; i++){
r = normalize(v);
}
t2=GetTickCount();
printf("%d\n",t2-t1);
printf("%f\t%f\t%f\n",r.x,r.y,r.z);
getch();
return 0;
}
And this command line:
cl /O2 /G6 /FA normalize.c
Running on my P3 the time varied somewhat, but a typical result would be:
281
0.267261 0.534522 0.801784
This is the asm code that the compiler generated:
PUBLIC _normalize
PUBLIC __real@3f800000
EXTRN __fltused:NEAR
; COMDAT __real@3f800000
; File c:\program files\microsoft visual c++ toolkit 2003\my\normalize.c
CONST SEGMENT
__real@3f800000 DD 03f800000r ; 1
; Function compile flags: /Ogty
CONST ENDS
; COMDAT _normalize
_TEXT SEGMENT
_result$ = -12 ; size = 12
$T74080 = 8 ; size = 4
_in$ = 12 ; size = 12
_normalize PROC NEAR ; COMDAT
; Line 13
sub esp, 12 ; 0000000cH
; Line 15
fld DWORD PTR _in$[esp+16]
; Line 19
mov eax, DWORD PTR $T74080[esp+8]
fmul DWORD PTR _in$[esp+16]
mov ecx, eax
fld DWORD PTR _in$[esp+12]
fmul DWORD PTR _in$[esp+12]
faddp ST(1), ST(0)
fld DWORD PTR _in$[esp+8]
fmul DWORD PTR _in$[esp+8]
faddp ST(1), ST(0)
fsqrt
fdivr DWORD PTR __real@3f800000
fld DWORD PTR _in$[esp+8]
fmul ST(0), ST(1)
fstp DWORD PTR _result$[esp+12]
mov edx, DWORD PTR _result$[esp+12]
fld DWORD PTR _in$[esp+12]
mov DWORD PTR [ecx], edx
fmul ST(0), ST(1)
fstp DWORD PTR _result$[esp+16]
mov edx, DWORD PTR _result$[esp+16]
mov DWORD PTR [ecx+4], edx
fmul DWORD PTR _in$[esp+16]
fstp DWORD PTR _result$[esp+20]
mov edx, DWORD PTR _result$[esp+20]
mov DWORD PTR [ecx+8], edx
; Line 20
add esp, 12 ; 0000000cH
ret 0
_normalize ENDP
In my asm version I basically duplicated the compiler-generated code, without the integer instructions.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
include \masm32\include\masm32rt.inc
vector struct
x REAL4 ?
y REAL4 ?
z REAL4 ?
vector ends
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.data
v1 vector <1.0,2.0,3.0>
vr vector <>
dblx REAL8 0.0
dbly REAL8 0.0
dblz REAL8 0.0
.code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 16
Vector3D_Normalize_FPU PROC ptrVR:DWORD, ptrV1:DWORD
mov esi,ptrV1
mov edi,ptrVR
fld dword ptr [esi]
fmul st,st(0)
fld dword ptr [esi+4]
fmul st,st(0)
faddp st(1),st
fld dword ptr [esi+8]
fmul st,st(0)
faddp st(1),st
fsqrt
fld dword ptr [esi]
fdiv st,st(1)
fstp dword ptr [edi]
fld dword ptr [esi+4]
fdiv st,st(1)
fstp dword ptr [edi+4]
fld dword ptr [esi+8]
fdiv st,st(1)
fstp dword ptr [edi+8]
ret
Vector3D_Normalize_FPU ENDP
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 4
normalize proc pv1:DWORD, pvr:DWORD
mov ecx, pv1
mov edx, pvr
fld [ecx].vector.z
fmul [ecx].vector.z
fld [ecx].vector.y
fmul [ecx].vector.y
faddp st(1), st
fld [ecx].vector.x
fmul [ecx].vector.x
faddp st(1), st
fsqrt
fld4 1.0
fdivr
fld [ecx].vector.x
fmul st, st(1)
fstp [edx].vector.x
fld [ecx].vector.y
fmul st, st(1)
fstp [edx].vector.y
fmul [ecx].vector.z
fstp [edx].vector.z
ret
normalize endp
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
invoke normalize, ADDR v1, ADDR vr
fld vr.x
fstp dblx
fld vr.y
fstp dbly
fld vr.z
fstp dblz
invoke crt_printf, chr$("%f %f %f %c"), dblx, dbly, dblz, 10
mov ebx, 1000000
invoke GetTickCount
push eax
.WHILE ebx
invoke Vector3D_Normalize_FPU, ADDR vr, ADDR v1
dec ebx
.ENDW
invoke GetTickCount
pop edx
sub eax, edx
print ustr$(eax),13,10
mov ebx, 1000000
invoke GetTickCount
push eax
.WHILE ebx
invoke normalize, ADDR v1, ADDR vr
dec ebx
.ENDW
invoke GetTickCount
pop edx
sub eax, edx
print ustr$(eax),13,10
inkey "Press any key to exit..."
exit
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start
In my results the time ratio between your code and the compiler's code is 6.88.
0.267261 0.534522 0.801784
1932
241
I didn't have time to determine exactly what is causing the large difference, but I doubt that replacing three fdiv instructions with an fdivr and three fmul instructions could account for it. And I didn't try any other codings to see if the time could be improved. I would like to see how the timing compares for the SSE code.
fdiv has a very high latency and is not pipelined. So you should definitely replace the three divisions by one reciprocal and three multiplications. The rest of the performance difference comes from inlining the call.
This illustrates just how smart compilers are these days. Only for SIMD code they haven't really caught up yet. NightWare's SSE code will definitely be very fast. Even faster though is normalizing four vectors in parallel.
Quote from: Neo on June 16, 2008, 03:42:55 AM
Using shufps should be faster than pshufd, because the CPU doesn't need to switch SSE register modes.
There is no such thing as switching SSE register modes. Using pshufd instead of only shufps resulted in a few percent higher performance for my (floating-point) vector code. LLVM uses pshufd aggressively as well.
QuoteAlso, using the following instead of shufps,addps,shufps,addps might be even faster, even though haddps has higher latency and throughput than most SSE instructions:
haddps xmm0,xmm0
haddps xmm0,xmm0
haddps is microcoded, resulting in decoder stalls. It only appears to be beneficial when you need all four sums, which isn't the case for a dot product.
Quote from: c0d1f1ed on June 16, 2008, 11:52:53 AM
fdiv has a very high latency and is not pipelined. So you should definitely replace the three divisions by one reciprocal and three multiplications. The rest of the performance difference comes from inlining the call.
What do you mean by "inlining the call"? And how does the high latency explain the difference when the fdiv instructions are independent? Using the exe in the attachment, on my P3 I get cycle counts of 934 and 130. Per Agner Fog, for a P3 the latencies for fmul and fdiv, assuming the worst-case 64-bit precision, should be 5/6 and 38, and the reciprocal throughputs 2 and 37. Even if I assume that the fdiv instructions are dependent, add the latency and reciprocal throughput together, and ignore the effect of the increased number of fmul instructions, I can't see how the two additional fdiv instructions could explain a difference of 804 cycles.
QuoteThis illustrates just how smart compilers are these days.
Or not. By using the same basic algorithm I was able to code a faster, and probably smaller, procedure in one try. And I suspect that it can be improved significantly.
[attachment deleted by admin]
Quote from: MichaelW on June 16, 2008, 04:01:58 AM
I would like to see how the timing compares for the SSE code.
hi,
here the results on my core2 (i've multiplied the number of loop by 10, otherwise the results are not significant due to the 7ms imprecision of gettickcount, and aligned the vectors to be usable by sse+) :
7754
436
63
i've forgotten to add your last code in the test...
[attachment deleted by admin]
Cool. I tried a few different variations and got some semi-surprising results:
11544 - The one with 3 fdivs
562 - The one with 1 fdivr
109 - Using haddps,haddps
94 - Using shufps,addps,shufps,addps
78 - Using pshufd,addps,pshufd,addps
47 - Inlined using pshufd,addps,pshufd,addps
The system is a Core 2 Duo 1.66GHz with 2GB of RAM running Windows Vista 64-bit. The program was 32-bit, which might skew results quite a bit.
Hi,
I tried re-organizing the fdivs, tried replacing it with a reciprocal multiply and then end result is much the same. The asm hand-coded version which is almost identical to what i saw come out of the c# compiler is approx. 5 times slower still..
I'm aware that moving routines like this to SSE is the idea, which I already have for both AOS and SOA models, however I wanted to have a simple high precision fpu based version of the functions as well.
I'm really stumped as to why the version is SOOO much slower than the compiler output.
The problem is not fdiv in general, but the way it is being (incorrectly) used. I modified my cycle count code to test various forms of fdiv, and determined that the problem is a FPU stack fault. If you uncomment the code that tests the stack fault bit in the FPU status word, for the slow FDIV instruction sequences the code will display 7 OKs, and then display alternating SF and OK for the duration of the test.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
include \masm32\include\masm32rt.inc
.686
include \masm32\macros\timers.asm
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
vector struct
x REAL4 ?
y REAL4 ?
z REAL4 ?
vector ends
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.data
v1 vector <1.0,2.0,3.0>
vr vector <>
fpusw dw 0
.code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 16
Vector3D_Normalize_FPU PROC ptrVR:DWORD, ptrV1:DWORD
mov esi,ptrV1
mov edi,ptrVR
fld dword ptr [esi]
fmul st,st(0)
fld dword ptr [esi+4]
fmul st,st(0)
faddp st(1),st
fld dword ptr [esi+8]
fmul st,st(0)
faddp st(1),st
fsqrt
fld dword ptr [esi]
fdiv st,st(1)
fstp dword ptr [edi]
fld dword ptr [esi+4]
fdiv st,st(1)
fstp dword ptr [edi+4]
fld dword ptr [esi+8]
fdiv st,st(1)
fstp dword ptr [edi+8]
ret
Vector3D_Normalize_FPU ENDP
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 4
normalize proc pv1:DWORD, pvr:DWORD
mov ecx, pv1
mov edx, pvr
fld [ecx].vector.z
fmul [ecx].vector.z
fld [ecx].vector.y
fmul [ecx].vector.y
faddp st(1), st
fld [ecx].vector.x
fmul [ecx].vector.x
faddp st(1), st
fsqrt
fld4 1.0
fdivr
fld [ecx].vector.x
fmul st, st(1)
fstp [edx].vector.x
fld [ecx].vector.y
fmul st, st(1)
fstp [edx].vector.y
fmul [ecx].vector.z
fstp [edx].vector.z
fld DWORD PTR [ecx]
fld DWORD PTR [ecx]
;fdiv ; +12 cycles
;fdivr ; +12 cycles
;fdiv st, st(1) ; +914 cycles stack fault
fdiv st(1), st ; +914 cycles stack fault
;fdivp st(1), st ; +12 cycles
; ------------------------------------------------
; Uncomment this to see state of stack fault bit.
; ------------------------------------------------
comment ~
fstsw fpusw
fwait
test fpusw, 40h
jz @F
print "SF",13,10
@@:
print "OK",13,10
~
fstp DWORD PTR [ecx]
ret
normalize endp
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
invoke Sleep, 3000
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke Vector3D_Normalize_FPU, ADDR vr, ADDR v1
counter_end
print ustr$(eax)," cycles, Vector3D_Normalize_FPU",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke normalize, ADDR v1, ADDR vr
counter_end
print ustr$(eax)," cycles, normalize",13,10
inkey "Press any key to exit..."
exit
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start
And in case it's not obvious from the comments, when fdiv is used correctly on my P3 the following sequence executes in 12 cycles:
fld DWORD PTR [ecx]
fld DWORD PTR [ecx]
fdiv
fstp DWORD PTR [ecx]
And to cover all bases, if in the timed test of Vector3D_Normalize_FPU I follow the last:
fdiv st,st(1)
With:
ffree st(1)
Then there is no stack fault and the results on my P3 look much more reasonable:
0.267261 0.534522 0.801784
0.267261 0.534522 0.801784
320
231
Nice observation MichaelW! That stack fault penalty is shockingly high. Good thing that can't happen with SSE, and compilers avoid it.
Awesome spot!
I cannot believe an FPU stack fault is that significant!
Important tip: If you pushed it.. you should pop it :)
My result are much better now:
1,000,000 iterations:
FPU = 536ms
SSE = 106ms
C# = 156ms
Updated FPU Version:
align 16
Vector3D_Normalize_FPU PROC ptrVR:DWORD, ptrV1:DWORD
mov esi,ptrV1
mov edi,ptrVR
fld dword ptr (Vector3D PTR [esi]).x
fmul st,st(0)
fld dword ptr (Vector3D PTR [esi]).y
fmul st,st(0)
faddp st(1),st
fld dword ptr (Vector3D PTR [esi]).z
fmul st,st(0)
faddp st(1),st
fsqrt
fld1
fdivr
fld dword ptr (Vector3D PTR [esi]).x
fmul st,st(1)
fstp dword ptr (Vector3D PTR [edi]).x
fld dword ptr (Vector3D PTR [esi]).y
fmul st,st(1)
fstp dword ptr (Vector3D PTR [edi]).y
fmul dword ptr (Vector3D PTR [esi]).z
fstp dword ptr (Vector3D PTR [edi]).z
ret
Vector3D_Normalize_FPU ENDP
No stack faults reported over 1000 test runs.
Still about 400ms (over 1,000,000 iterations) slower than the C# compiled version..
For reference here is the C# dis-asm output:
float rlen = (float)Math.Sqrt((i.x * i.x) + (i.y * i.y) + (i.z * i.z));
0000004b fld dword ptr [esi+4]
0000004e fmul st,st(0)
00000050 fld dword ptr [esi+8]
00000053 fmul st,st(0)
00000055 faddp st(1),st
00000057 fld dword ptr [esi+0Ch]
0000005a fmul st,st(0)
0000005c faddp st(1),st
0000005e fstp qword ptr [ebp-58h]
00000061 fld qword ptr [ebp-58h]
00000064 fsqrt
00000066 fstp qword ptr [ebp-50h]
00000069 fld qword ptr [ebp-50h]
0000006c fstp dword ptr [ebp-44h]
r.x = i.x / rlen;
0000006f fld dword ptr [esi+4]
00000072 fdiv dword ptr [ebp-44h]
00000075 fstp dword ptr [ebx+4]
r.y = i.y / rlen;
00000078 fld dword ptr [esi+8]
0000007b fdiv dword ptr [ebp-44h]
0000007e fstp dword ptr [ebx+8]
r.z = i.z / rlen;
00000081 fld dword ptr [esi+0Ch]
00000084 fdiv dword ptr [ebp-44h]
00000087 fstp dword ptr [ebx+0Ch]
return (r);
Not sure if this is helpful: I have tried to "synchronise" the two listings above in two text files. Open them in an editor, and Alt Tab task switch to spot the differences.
Apart from the fld1, fdivr sequence which aims at substituting the divs with muls, there is this oddity:
; r.z = i.z / rlen;
; *** fld dword ptr [esi+0Ch]
[attachment deleted by admin]
MichaelW, I've taken your test piece and re-inserted my updated FPU version. I've also added the MS timings for both at the bottom: You'll see now they're almost identical although I think using fmul st,st(0) to square provides a slight performance increase.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
include \masm32\include\masm32rt.inc
.686
include timers.asm
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
Vector3D STRUCT
x REAL4 0.0
y REAL4 0.0
z REAL4 0.0
w REAL4 0.0
Vector3D ENDS
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.data
v1 Vector3D <1.0,2.0,3.0,1.0>
vr Vector3D <>
fpusw dw 0
.code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 16
Vector3D_Normalize_FPU PROC ptrVR:DWORD, ptrV1:DWORD
mov esi,ptrV1
mov edi,ptrVR
fld dword ptr (Vector3D PTR [esi]).x
fmul st,st(0)
fld dword ptr (Vector3D PTR [esi]).y
fmul st,st(0)
faddp st(1),st
fld dword ptr (Vector3D PTR [esi]).z
fmul st,st(0)
faddp st(1),st
fsqrt
fld1
fdivr
fld dword ptr (Vector3D PTR [esi]).x
fmul st,st(1)
fstp dword ptr (Vector3D PTR [edi]).x
fld dword ptr (Vector3D PTR [esi]).y
fmul st,st(1)
fstp dword ptr (Vector3D PTR [edi]).y
fmul dword ptr (Vector3D PTR [esi]).z
fstp dword ptr (Vector3D PTR [edi]).z
ret
Vector3D_Normalize_FPU ENDP
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 4
normalize proc pv1:DWORD, pvr:DWORD
mov ecx, pv1
mov edx, pvr
fld [ecx].Vector3D.z
fmul [ecx].Vector3D.z
fld [ecx].Vector3D.y
fmul [ecx].Vector3D.y
faddp st(1), st
fld [ecx].Vector3D.x
fmul [ecx].Vector3D.x
faddp st(1), st
fsqrt
fld4 1.0
fdivr
fld [ecx].Vector3D.x
fmul st, st(1)
fstp [edx].Vector3D.x
fld [ecx].Vector3D.y
fmul st, st(1)
fstp [edx].Vector3D.y
fmul [ecx].Vector3D.z
fstp [edx].Vector3D.z
ret
normalize endp
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
invoke Sleep, 3000
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke Vector3D_Normalize_FPU, ADDR vr, ADDR v1
counter_end
print ustr$(eax)," cycles, Vector3D_Normalize_FPU",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke normalize, ADDR v1, ADDR vr
counter_end
print ustr$(eax)," cycles, normalize",13,10
timer_begin 10000000, HIGH_PRIORITY_CLASS
invoke Vector3D_Normalize_FPU, ADDR v1, ADDR vr
timer_end
print ustr$(eax)
print chr$(" Vector3D Normalize ms",13,10)
timer_begin 10000000, HIGH_PRIORITY_CLASS
invoke normalize, ADDR v1, ADDR vr
timer_end
print ustr$(eax)
print chr$("Normalize ms",13,10)
inkey "Press any key to exit..."
exit
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start
So.. on my machine both of these come in at around 530ms... while the C#.Net version still manages 150ms for the same number of iterations... so we're still about 4 times slower... and they're definately no stack faults now.
Any more thoughts? Perhaps re-compare this to the C++ testpiece?
MichaelW, I wrote the same test piece in C++, using GetTickCount (same as yours basically)
and the result from that are:
94ms (release mode with the pragmas)
78ms (release mode - removed the pragma)
155ms (debug mode with pragmas)
So the C# in release mode seems to be equivalent to C++ in debug mode. C++ version is now about 6 times faster than the asm test piece i just posted.
I'm wondering if this is somehow specific to my machine?
Perhaps re-try the last test-piece I posted and the C++ one again.
73 cycles, Vector3D_Normalize_FPU
77 cycles, normalize
345 Vector3D Normalize ms
320Normalize ms
P4 2.4 GHz
Numerical output is identical, I suppose?
It's odd that mine comes in a few cycles less and with fewer memory accesses yet MichaelW's is about 20ms faster on P4.. on my PM the fmul st,st(0) seems to be faster, but P4 it seems like the memory is.. odd..
Quote from: johnsa on June 17, 2008, 05:47:46 PM
C++ version is now about 6 times faster than the asm test piece i just posted.
This just doesn't make sense: C++ uses assembler (and machine code, eventually), so it cannot be faster than the
same code in asm. Can you isolate those bits that are just a little bit different? And eliminate the differences step by step? I am a newbie in this field, but things that come to my mind are:
- stack fault (see above)
- denormalised numbers (that's why I asked earlier if the results - not: the timings - are identical)
For example, between your two listings I see the compiler insert two fstp/fld sequences; what is their function? Delay FPU execution??
faddp st(1),st
; c: fstp qword ptr [ebp-58h]
; c: fld qword ptr [ebp-58h]
fsqrt
; c: fstp qword ptr [ebp-50h]
; c: fld qword ptr [ebp-50h]
Why is there no
fld dword ptr (Vector3D PTR [esi]).z
in the third last row of your asm listing?
We've ruled out stack faults now after checking the fpu status word's stack bit after 1000 iterations of the function.
Results are correct and not denormal as the input vector isn't modified. It's repeatedly updated and stored into a result vector.
Those fld, fstp's from the C code I think are a product of it not being smart enough to optimize in the dependancy. It completes a result stores it, then the next stage of the calculation reloads that value.
Hence why it fstp to [ebp-58h] and then immediately loads the same value again.
I don't load the z in the 3rd last row because the calculation i want is (1/length vector) which is already in st0 to be multiplied with z. so just doing the mul with produce the result in st0 which can then be fstp immediately back to z.
More than that... I'm utterly confused :)
I had run it through Olly and did not see anything suspicious - thanks for explaining in detail what you have done. Really odd. Any chance to isolate the slow instruction? Inserting QPC calls is probably not an option...
Re ran everything via debugger and double checked fpu status after every instruction, no exceptions. The only thing that gets set every is the P(recision) bit in the status when the fsqrt happens which is unavoidable.
I've now come to the conclusion that C#/C++ compiler must be doing something else sneaky somewhere... like setting the FPU to lowest precision if your code never uses a double.. maybe they assume that if the whole code only contains floats, they can get away with setting the round mode to real4 and maybe trunc'ing instead of round.. this could speed up the fpu operations? ::)
Quote from: johnsa on June 17, 2008, 10:54:50 PM
like setting the FPU to lowest precision if your code never uses a double.. maybe they assume that if the whole code only contains floats, they can get away with setting the round mode to real4 and maybe trunc'ing instead of round.. this could speed up the fpu operations? ::)
Yup, that could certainly do it, especially for fdiv and fsqrt.
Quote from: johnsa on June 17, 2008, 10:54:50 PM
I've now come to the conclusion that C#/C++ compiler must be doing something else sneaky somewhere... like setting the FPU to lowest precision if your code never uses a double.. maybe they assume that if the whole code only contains floats, they can get away with setting the round mode to real4 and maybe trunc'ing instead of round.. this could speed up the fpu operations? ::)
Good idea. The clock cycle counts for fdiv in my previous test seemed to imply that the precision was set to something less than 64 bits. This code tests the effects on a version of your original code with an ffree added to eliminate the stack fault, and my version.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
include \masm32\include\masm32rt.inc
.686
include \masm32\macros\timers.asm
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
vector struct
x REAL4 ?
y REAL4 ?
z REAL4 ?
vector ends
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.data
v1 vector <1.0,2.0,3.0>
vr vector <>
fpusw dw 0
fpucw dw 0
.code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 16
Vector3D_Normalize_FPU PROC ptrVR:DWORD, ptrV1:DWORD
mov esi,ptrV1
mov edi,ptrVR
fld dword ptr [esi]
fmul st,st(0)
fld dword ptr [esi+4]
fmul st,st(0)
faddp st(1),st
fld dword ptr [esi+8]
fmul st,st(0)
faddp st(1),st
fsqrt
fld dword ptr [esi]
fdiv st,st(1)
fstp dword ptr [edi]
fld dword ptr [esi+4]
fdiv st,st(1)
fstp dword ptr [edi+4]
fld dword ptr [esi+8]
fdiv st,st(1)
ffree st(1)
fstp dword ptr [edi+8]
comment |
fstsw fpusw
fwait
test fpusw, 40h
jz @F
print "SF",13,10
@@:
print "OK",13,10
|
ret
Vector3D_Normalize_FPU ENDP
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 4
normalize proc pv1:DWORD, pvr:DWORD
mov ecx, pv1
mov edx, pvr
fld [ecx].vector.z
fmul [ecx].vector.z
fld [ecx].vector.y
fmul [ecx].vector.y
faddp st(1), st
fld [ecx].vector.x
fmul [ecx].vector.x
faddp st(1), st
fsqrt
fld4 1.0
fdivr
fld [ecx].vector.x
fmul st, st(1)
fstp [edx].vector.x
fld [ecx].vector.y
fmul st, st(1)
fstp [edx].vector.y
fmul [ecx].vector.z
fstp [edx].vector.z
ret
normalize endp
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
invoke Sleep, 3000
; ---------------------------------
; Display the current value of the
; control word PC field (bits 9-8).
; ---------------------------------
fstcw fpucw
print "Control word PC field = "
movzx eax, fpucw
shr eax, 8
and eax, 11b
print uhex$(eax),"h",13,10,13,10
; ------------------------------------------------------
; Restore FPU to initialized state to set the PC field
; to 11b = 64 bits, then read the value and display it.
; ------------------------------------------------------
finit
fstcw fpucw
print "Control word PC field = "
movzx eax, fpucw
shr eax, 8
and eax, 11b
print uhex$(eax),"h",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke Vector3D_Normalize_FPU, ADDR vr, ADDR v1
counter_end
print ustr$(eax)," cycles, Vector3D_Normalize_FPU",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke normalize, ADDR v1, ADDR vr
counter_end
print ustr$(eax)," cycles, normalize",13,10,13,10
; -------------------------------------------------------
; Set the PC field in the control word to 11b = 53 bits,
; then read the value back and display it.
; -------------------------------------------------------
fstcw fpucw
and fpucw, 1111111011111111b
fldcw fpucw
fstcw fpucw
print "Control word PC field = "
movzx eax, fpucw
shr eax, 8
and eax, 11b
print uhex$(eax),"h",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke Vector3D_Normalize_FPU, ADDR vr, ADDR v1
counter_end
print ustr$(eax)," cycles, Vector3D_Normalize_FPU",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke normalize, ADDR v1, ADDR vr
counter_end
print ustr$(eax)," cycles, normalize",13,10,13,10
; -------------------------------------------------------
; Set the PC field in the control word to 00b = 24 bits,
; then read the value back and display it.
; -------------------------------------------------------
fstcw fpucw
and fpucw, not 1100000000b
fldcw fpucw
fstcw fpucw
print "Control word PC field = "
movzx eax, fpucw
shr eax, 8
and eax, 11b
print uhex$(eax),"h",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke Vector3D_Normalize_FPU, ADDR vr, ADDR v1
counter_end
print ustr$(eax)," cycles, Vector3D_Normalize_FPU",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke normalize, ADDR v1, ADDR vr
counter_end
print ustr$(eax)," cycles, normalize",13,10,13,10
inkey "Press any key to exit..."
exit
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start
Results on my P3:
Control word PC field = 00000002h
Control word PC field = 00000003h
188 cycles, Vector3D_Normalize_FPU
130 cycles, normalize
Control word PC field = 00000002h
159 cycles, Vector3D_Normalize_FPU
131 cycles, normalize
Control word PC field = 00000000h
88 cycles, Vector3D_Normalize_FPU
130 cycles, normalize
I didn't have time to determine why my code is not affected, or why the initial PC setting does not match the FPU initialized state, or to perform any function tests to see what effect the PC setting might have on the return values.
On precision bit (Randy Hyde vs Jentje Goslinga) (http://coding.derkeiler.com/Archive/Assembler/comp.lang.asm.x86/2006-04/msg00034.html):
1. There is a lot of misinformation about the precision bit.
Unless I am terribly wrong the precision bit does not affect
Floating Point Multiplication, Addition or Subtraction, but
only Division and Square Root.
It does not even come into play when multiplying integers.
Neither does it affect any of the other (few) transcendentals.
[One might wonder why the precision bit does not affect the
other transcendentals: probably because they are not computed
using an iterative algorithm]
2. Having settled that issue, the control word in the FPU is
initialized on FPINIT to 037FH which masks all FP interrupts
and sets the precision to 64 bits, which is the maximum. You
are probably confusing the 64 bits mantissa which is Extended
Precision with a 64 bit double which is just a double.
Note that there are two bit since there are three settings.
Still, no chance to explain a factor 6 difference with the precision bit set or not set...
MichaelW, you're on to something there.. one little thing though.. TIMERS.ASM calls finit in those end MACROS... so actually in each timing/cycle count loop the PC mode is back to 03h :)
If you print it out directly after the loop it's be re-finit'ed.
Latest Results:
C++ Test-App using straight 3 divs and fsqrt (no recip). 1,000,000 iterations.
156ms debug mode
94ms release mode with pragma optimizations switched off
78ms release mode all optimizations
ASM Test Piece (using reciprocal with fmuls) set to PC to REAL4 - 1,000,000 iterations.
MichaelW's Normalize 13ms
Vector3D Noramlize 13ms
So now.. the ASM version is 5 times faster than the c++ version (mainly due to REAL4 PC and reciprocal).
What is really strange now.. is that I have two different asm files, pretty much identical in the same folder using the same timers.asm running the same loop of the same function.
assemble/link both one runs in 13ms... the other 26ms exactly.. everytime... and for no reason I can see.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
include \masm32\include\masm32rt.inc
.686p
include timers.asm
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
Vector3D STRUCT
x REAL4 0.0
y REAL4 0.0
z REAL4 0.0
w REAL4 0.0
Vector3D ENDS
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.data
align 16
v1 Vector3D <1.0,2.0,3.0,1.0>
vr Vector3D <>
fpusw dw 0
fpucw dw 0
.code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 16
Vector3D_Normalize_FPU PROC ptrVR:DWORD, ptrV1:DWORD
mov esi,ptrV1
mov edi,ptrVR
fld dword ptr (Vector3D PTR [esi]).x
fmul st,st(0)
fld dword ptr (Vector3D PTR [esi]).y
fmul st,st(0)
faddp st(1),st
fld dword ptr (Vector3D PTR [esi]).z
fmul st,st(0)
faddp st(1),st
fsqrt
fld1
fdivr
fld dword ptr (Vector3D PTR [esi]).x
fmul st,st(1)
fstp dword ptr (Vector3D PTR [edi]).x
fld dword ptr (Vector3D PTR [esi]).y
fmul st,st(1)
fstp dword ptr (Vector3D PTR [edi]).y
fmul dword ptr (Vector3D PTR [esi]).z
fstp dword ptr (Vector3D PTR [edi]).z
ret
Vector3D_Normalize_FPU ENDP
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 4
normalize proc pv1:DWORD, pvr:DWORD
mov ecx, pv1
mov edx, pvr
fld [ecx].Vector3D.z
fmul [ecx].Vector3D.z
fld [ecx].Vector3D.y
fmul [ecx].Vector3D.y
faddp st(1), st
fld [ecx].Vector3D.x
fmul [ecx].Vector3D.x
faddp st(1), st
fsqrt
fld4 1.0
fdivr
fld [ecx].Vector3D.x
fmul st, st(1)
fstp [edx].Vector3D.x
fld [ecx].Vector3D.y
fmul st, st(1)
fstp [edx].Vector3D.y
fmul [ecx].Vector3D.z
fstp [edx].Vector3D.z
ret
normalize endp
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
invoke Sleep, 3000
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke Vector3D_Normalize_FPU, ADDR vr, ADDR v1
counter_end
print ustr$(eax)," cycles, Vector3D_Normalize_FPU",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke normalize, ADDR v1, ADDR vr
counter_end
print ustr$(eax)," cycles, normalize",13,10
fstcw fpucw
and fpucw,1111110011111111b
fldcw fpucw
fstcw fpucw
print "Control word PC field = "
movzx eax, fpucw
shr eax, 8
and eax, 11b
print uhex$(eax),"h",13,10
timer_begin 1000000, HIGH_PRIORITY_CLASS
invoke Vector3D_Normalize_FPU, ADDR v1, ADDR vr
timer_end
print ustr$(eax)
print chr$(" Vector3D Normalize ms",13,10)
fstcw fpucw
and fpucw,1111110011111111b
fldcw fpucw
fstcw fpucw
print "Control word PC field = "
movzx eax, fpucw
shr eax, 8
and eax, 11b
print uhex$(eax),"h",13,10
timer_begin 1000000, HIGH_PRIORITY_CLASS
invoke normalize, ADDR v1, ADDR vr
timer_end
print ustr$(eax)
print chr$(" Normalize ms",13,10)
inkey "Press any key to exit..."
exit
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start
Anyhow.. there is the update ASM with PC=REAL4 and the last revision of the actual FPU code.
QuoteTIMERS.ASM calls finit...
You would think I would be able to remember that :red
It works for the first call because the finit comes after the test loop has ended.
This version corrects the problem and displays the return values for the 64 and 24-bit precisions to 8 digits:
EDIT: updated to your most recent procedure.
EDIT2: and now I realize that they assemble to the same instructions, so it stands to reason that the cycle counts would be the same.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
include \masm32\include\masm32rt.inc
.686
include \masm32\macros\timers.asm
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
Vector3D STRUCT
x REAL4 0.0
y REAL4 0.0
z REAL4 0.0
w REAL4 0.0
Vector3D ENDS
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
.data
v1 Vector3D <1.0,2.0,3.0,1.0>
vr Vector3D <>
dblx REAL8 0.0
dbly REAL8 0.0
dblz REAL8 0.0
fpusw dw 0
fpucw dw 0
.code
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 16
Vector3D_Normalize_FPU PROC ptrVR:DWORD, ptrV1:DWORD
mov esi,ptrV1
mov edi,ptrVR
fld dword ptr (Vector3D PTR [esi]).x
fmul st,st(0)
fld dword ptr (Vector3D PTR [esi]).y
fmul st,st(0)
faddp st(1),st
fld dword ptr (Vector3D PTR [esi]).z
fmul st,st(0)
faddp st(1),st
fsqrt
fld1
fdivr
fld dword ptr (Vector3D PTR [esi]).x
fmul st,st(1)
fstp dword ptr (Vector3D PTR [edi]).x
fld dword ptr (Vector3D PTR [esi]).y
fmul st,st(1)
fstp dword ptr (Vector3D PTR [edi]).y
fmul dword ptr (Vector3D PTR [esi]).z
fstp dword ptr (Vector3D PTR [edi]).z
ret
Vector3D_Normalize_FPU ENDP
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 4
normalize proc pv1:DWORD, pvr:DWORD
mov ecx, pv1
mov edx, pvr
fld [ecx].Vector3D.z
fmul [ecx].Vector3D.z
fld [ecx].Vector3D.y
fmul [ecx].Vector3D.y
faddp st(1), st
fld [ecx].Vector3D.x
fmul [ecx].Vector3D.x
faddp st(1), st
fsqrt
fld4 1.0
fdivr
fld [ecx].Vector3D.x
fmul st, st(1)
fstp [edx].Vector3D.x
fld [ecx].Vector3D.y
fmul st, st(1)
fstp [edx].Vector3D.y
fmul [ecx].Vector3D.z
fstp [edx].Vector3D.z
ret
normalize endp
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
start:
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
invoke Sleep, 3000
; ---------------------------------
; Display the current value of the
; control word PC field (bits 9-8).
; ---------------------------------
fstcw fpucw
print "Control word PC field = "
movzx eax, fpucw
shr eax, 8
and eax, 11b
print uhex$(eax),"h",13,10,13,10
; ------------------------------------------------------
; Restore FPU to initialized state to set the PC field
; to 11b = 64 bits, then read the value and display it.
; ------------------------------------------------------
finit
fstcw fpucw
print "Control word PC field = "
movzx eax, fpucw
shr eax, 8
and eax, 11b
print uhex$(eax),"h",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke Vector3D_Normalize_FPU, ADDR vr, ADDR v1
counter_end
print ustr$(eax)," cycles, Vector3D_Normalize_FPU",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke normalize, ADDR v1, ADDR vr
counter_end
print ustr$(eax)," cycles, normalize",13,10,13,10
invoke Vector3D_Normalize_FPU, ADDR vr, ADDR v1
fld vr.x
fstp dblx
fld vr.y
fstp dbly
fld vr.z
fstp dblz
invoke crt_printf, chr$("%.8f %.8f %.8f%c"), dblx, dbly, dblz, 10
invoke normalize, ADDR v1, ADDR vr
fld vr.x
fstp dblx
fld vr.y
fstp dbly
fld vr.z
fstp dblz
invoke crt_printf, chr$("%.8f %.8f %.8f%c%c"), dblx, dbly, dblz, 10, 10
; -------------------------------------------------------
; Set the PC field in the control word to 11b = 53 bits,
; then read the value back and display it.
; -------------------------------------------------------
fstcw fpucw
and fpucw, 1111111011111111b
fldcw fpucw
fstcw fpucw
print "Control word PC field = "
movzx eax, fpucw
shr eax, 8
and eax, 11b
print uhex$(eax),"h",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke Vector3D_Normalize_FPU, ADDR vr, ADDR v1
counter_end
print ustr$(eax)," cycles, Vector3D_Normalize_FPU",13,10
fstcw fpucw
and fpucw, 1111111011111111b
fldcw fpucw
fstcw fpucw
print "Control word PC field = "
movzx eax, fpucw
shr eax, 8
and eax, 11b
print uhex$(eax),"h",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke normalize, ADDR v1, ADDR vr
counter_end
print ustr$(eax)," cycles, normalize",13,10,13,10
; -------------------------------------------------------
; Set the PC field in the control word to 00b = 24 bits,
; then read the value back and display it.
; -------------------------------------------------------
fstcw fpucw
and fpucw, not 1100000000b
fldcw fpucw
fstcw fpucw
print "Control word PC field = "
movzx eax, fpucw
shr eax, 8
and eax, 11b
print uhex$(eax),"h",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke Vector3D_Normalize_FPU, ADDR vr, ADDR v1
counter_end
print ustr$(eax)," cycles, Vector3D_Normalize_FPU",13,10
fstcw fpucw
and fpucw, not 1100000000b
fldcw fpucw
fstcw fpucw
print "Control word PC field = "
movzx eax, fpucw
shr eax, 8
and eax, 11b
print uhex$(eax),"h",13,10
counter_begin 1000, HIGH_PRIORITY_CLASS
invoke normalize, ADDR v1, ADDR vr
counter_end
print ustr$(eax)," cycles, normalize",13,10,13,10
invoke Vector3D_Normalize_FPU, ADDR vr, ADDR v1
fld vr.x
fstp dblx
fld vr.y
fstp dbly
fld vr.z
fstp dblz
invoke crt_printf, chr$("%.8f %.8f %.8f%c"), dblx, dbly, dblz, 10
invoke normalize, ADDR v1, ADDR vr
fld vr.x
fstp dblx
fld vr.y
fstp dbly
fld vr.z
fstp dblz
invoke crt_printf, chr$("%.8f %.8f %.8f%c%c"), dblx, dbly, dblz, 10, 10
inkey "Press any key to exit..."
exit
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
end start
Results on my P3:
Control word PC field = 00000002h
Control word PC field = 00000003h
130 cycles, Vector3D_Normalize_FPU
130 cycles, normalize
0.26726124 0.53452247 0.80178374
0.26726124 0.53452247 0.80178374
Control word PC field = 00000002h
113 cycles, Vector3D_Normalize_FPU
Control word PC field = 00000002h
113 cycles, normalize
Control word PC field = 00000000h
70 cycles, Vector3D_Normalize_FPU
Control word PC field = 00000000h
70 cycles, normalize
0.26726124 0.53452247 0.80178374
0.26726124 0.53452247 0.80178374
So I think the conclusions are:
1) If you don't ever need more that REAL4 in your code.. set the PC... it'll double the FPU speed when using div/sqrt! (I'm curious to know if the C#/C++ compiler was already doing this with optimizations on).
2) Both fmul st,mem and fmul st,st(0) perform identically on P3, on my PM the st,st(0) version is a few cycles less.
3) After all of this, the assembly langauge version is now :
20ms faster than the optimized c++ on 1,000,000 iterations with full precision fpu
2x as fast set to 53bit mantissa. REAL8
3x as fast set to 24bit mantissa. REAL4
4) Avoid stack faults at ALL costs, penalty is the worst i've seen yet.
Obviously if you took the C++ testpiece and updated it to use reciprocal, fsqrt instead of calling the std sqrt lib function and applied the FPU PC change it would be the same.
So I updated the C++ testpiece to use fsqrt and reciprocals.. it came down from 79ms to 70ms, still around 20ms slower than the asm versions at full FPU precision. I checked the dis-asm and C++ still puts a bit of overhead into the routine along with a few less than optimal fld/fstp combinations. But mostly it's quite close.
Re benchmarked this update FPU setup against the SSE version (using AOS model - one vector at a time). And the SSE comes in at 11ms opposed to 25ms for the fastest FPU version.
Assuming one used SOA we could imagine that we'd see an 8x increase over the fastest FPU version.
Quote from: johnsa on June 17, 2008, 04:57:22 PM
timer_begin 10000000, HIGH_PRIORITY_CLASS
invoke Vector3D_Normalize_FPU, ADDR v1, ADDR vr
timer_end
print ustr$(eax)
print chr$(" Vector3D Normalize ms",13,10)
So.. on my machine both of these come in at around 530ms... while the C#.Net version still manages 150ms for the same number of iterations... so we're still about 4 times slower... and they're definately no stack faults now.
Quote from: johnsa on June 18, 2008, 09:54:57 AM
C++ Test-App using straight 3 divs and fsqrt (no recip). 1,000,000 iterations.
156ms debug mode
94ms release mode with pragma optimizations switched off
78ms release mode all optimizations
ASM Test Piece (using reciprocal with fmuls) set to PC to REAL4 - 1,000,000 iterations.
MichaelW's Normalize 13ms
Vector3D Noramlize 13ms
So now.. the ASM version is 5 times faster than the c++ version (mainly due to REAL4 PC and reciprocal).
Part of the factor 20 between "4*slower" and "5*faster" might be hidden in the counters:
timer_begin 10000000
C++ ... 1,000,000 iterations.
And the remaining factor 2 seems attributable to inverting the divs. Thanks a lot for clarifying this... I am working on a little FPU lib and got deeply worried when I saw your initial post :wink
Yeah I noticed the original posted piece had a a 10 million counter, not 1 million :)
At least I'm happy to say that now the asm is considerably faster than it's counterparts in C# and even C++... Even with PC set the same.
From 150ms to 25ms and then to 11ms with SSE or an effective 3ms if you maximise the throughput of SSE with 4 vectors in parallel.
The only thing I'm not too convinced about with the SSE version is that if you use homogenous vectors with a W, in it's standard AOS format (1 vector at a time) it modifies the W coordinate too.. which I don't think it should.. that should remain 1 for direction and 0 for a coordinate in space... Thinking about the best way to get the AOS version to not touch the W.
Quote from: johnsa on June 18, 2008, 12:51:27 PM
The only thing I'm not too convinced about with the SSE version is that if you use homogenous vectors with a W, in it's standard AOS format (1 vector at a time) it modifies the W coordinate too.. which I don't think it should.. that should remain 1 for direction and 0 for a coordinate in space... Thinking about the best way to get the AOS version to not touch the W.
pffft... now it sucks... :P
.data
Mask_210 DWORD 0FFFFFFFFh,0FFFFFFFFh,0FFFFFFFFh,0
.code
; NormalizeSse2 1
movaps XMM0,OWORD PTR [esi] ;; ) XMM0 and XMM3 = W,Z,Y,X
movaps XMM3,XMM0 ;; )
mulps XMM0,XMM0 ;; XMM0 = _,Z^2,Y^2,X^2
pshufd XMM1,XMM0,001h ;; XMM1 = _,_,_,Y^2
movhlps XMM2,XMM0 ;; XMM2 = _,_,_,Z^2
addss XMM0,XMM1 ;; XMM0 = _,_,_,X^2+Y^2
addss XMM0,XMM2 ;; XMM0 = _,_,_,X^2+Z^2+Z^2
rsqrtss XMM0,XMM0 ;; XMM0 = _,_,_,1/(X^2+Y^2+Z^2)
pshufd XMM0,XMM0,000h ;; XMM0 = 1/(X^2+Y^2+Z^2),1/(X^2+Y^2+Z^2),1/(X^2+Y^2+Z^2),1/(X^2+Y^2+Z^2)
mulps XMM0,XMM3 ;; XMM0 = W*1/(X^2+Y^2+Z^2),Z*1/(X^2+Y^2+Z^2),Y*1/(X^2+Y^2+Z^2),X*1/(X^2+Y^2+Z^2)
movhlps XMM3,XMM0 ;; XMM3 = W,_,_,Z*1/(X^2+Y^2+Z^2)
shufps XMM0,XMM3,0C4h ;; XMM0 = w,Z/(X^2+Y^2+Z^2),Y/(X^2+Y^2+Z^2),X/(X^2+Y^2+Z^2)
movaps OWORD PTR [edi],XMM0 ;; XMM0 = W,Z/(X^2+Y^2+Z^2),Y/(X^2+Y^2+Z^2),X/(X^2+Y^2+Z^2)
; NormalizeSse2 2
push [esi+12]
movdqa XMM0,OWORD PTR [esi] ;; XMM0 = W,Z,Y,X
movdqa XMM2,XMM0 ;; XMM2 = W,Z,Y,X
andps XMM0,OWORD PTR [Mask_210] ;; XMM2 = 0,Z,Y,X
mulps XMM0,XMM0 ;; XMM0 = 0,Z^2,Y^2,X^2
pshufd XMM1,XMM0,04Eh ;; XMM1 = Y^2,X^2,0^2,Z^2
addps XMM1,XMM0 ;; XMM1 = 0^2+Y^2,Z^2+X^2,Y^2+0^2,X^2+Z^2
pshufd XMM0,XMM1,0B1h ;; XMM0 = Z^2+X^2,0^2+Y^2,X^2+Z^2,Y^2+0^2
addps XMM0,XMM1 ;; XMM0 = Z^2+X^2+Y^2,Y^2+Z^2+X^2,X^2+Z^2+Y^2,Y^2+X^2+Z^2
rsqrtps XMM0,XMM0 ;; XMM0 = 1/(X^2+Y^2+Z^2),1/(X^2+Y^2+Z^2),1/(X^2+Y^2+Z^2),1/(X^2+Y^2+Z^2)
mulps XMM0,XMM2 ;; XMM0 = W*1/(X^2+Y^2+Z^2),Z*1/(X^2+Y^2+Z^2),Y*1/(X^2+Y^2+Z^2),X*1/(X^2+Y^2+Z^2)
movdqa OWORD PTR [edi],XMM0 ;; XMM0 = W,Z/(X^2+Y^2+Z^2),Y/(X^2+Y^2+Z^2),X/(X^2+Y^2+Z^2)
pop [edi+12]
It's not so bad :) With adjustments made to my SSE normalize for AOS It still runs at 10ms for 1,000,000 iterations on my machine, approx. 20cycles. Pentium M Centrino 1.8ghz.
One other thing that should be cater for as an option is a single iteration of Newton Raphson on the result returned by the reciprocal square root. This adds about 4ms / 5 cycles for me but will give you significantly better precision if needed.