News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

sine approx. using sse2

Started by qWord, May 16, 2010, 09:06:57 PM

Previous topic - Next topic

qWord

hi,

the last weeks I've invest some time in writting an function for calculating the sine using sse2. The formula based on the sine's Taylor series  at 0.
It god a bit disappointing when seeing that the VCRT implementation (also sse2) is a bit fast and more accurately (mainly because is a table based algo.).
However, my algo use only 4 xmm registers, while the VCRT use all registers - I do not want to use more than 4 register because needed some storage  :U.
In the attachment you will finde a test bed (it also creates 2 *.csv files for showing the difference between fsin and my function).

may some of you have an better function or can optimize this one:

;-----------------------------------------------------------------------------
; calculates the sine frome the low order float value in xmm0
; IN/OUT: low float value in xmm0
;-----------------------------------------------------------------------------
; sine approximation based on the taylor series for:
; f(x) = sin(x*(pi/2))  with   -1 <= x <= 1
;
; T(x) = 1/2*x*pi - 1/48*x^3*pi^3 + 1/3840*x^5*pi^5 - 1/645120*x^7*pi^7 ...
; T(x) = x*(1-(u/6)*(1-(u/20)*(1-(u/42)*(1-(u/72)*[1 - ...]))))
; with u = (x²*pi²)/4
;
OPTION PROLOGUE:none
OPTION EPILOGUE:none
align 16
sinss proc
   
    sinss_const SEGMENT page read   ; for the alignment
        sinss_2_pi  REAL8  0.6366197723675813430755351  ; 2/pi
        sinss_05pi  REAL8  1.5707963267948966192313217  ; 0.5*pi

        sinss_qdrnt REAl4  0.0
                    REAl4  1.0
                    REAl4  0.0
                    REAl4 -1.0

        sinss_c3    REAL8 -0.03426945972600471742650865 ; (pi^2)/(4*72)
        sinss_c2    REAl8 -0.05874764524457951558830054 ; (pi^2)/(4*42)
        sinss_c1    REAL8 -0.12337005501361698273543114 ; (pi^2)/(4*20)
        sinss_c0    REAL8 -0.41123351671205660911810379 ; (pi^2)/(4*6)
    sinss_const ENDS

    cvtss2sd xmm0,xmm0
    mulsd xmm0,sinss_2_pi
    cvttsd2si eax,xmm0
    cvtsi2sd xmm1,eax
    mov edx,eax
    subsd xmm0,xmm1
    and edx,3
    jz @calc
    movsd xmm1,xmm0
    cvtss2sd xmm0,sinss_qdrnt[edx*4]

    and eax,80000003h
    js @s
    jnp @np
    jp @p
@s: test eax,2
    jz @p
@np:    subsd xmm0,xmm1
    jmp @calc   
@p:     addsd xmm0,xmm1
@calc:

    movsd xmm2,xmm0
    cvtss2sd xmm1,sinss_qdrnt[4]
    movsd xmm3,sinss_c3
    mulsd xmm2,xmm2
    mulsd xmm3,xmm2
    addsd xmm3,xmm1
    mulsd xmm0,sinss_05pi
   
    mulsd xmm3,xmm2
    mulsd xmm3,sinss_c2
    addsd xmm3,xmm1
         
    mulsd xmm3,xmm2
    mulsd xmm3,sinss_c1
    addsd xmm3,xmm1
         
    mulsd xmm3,xmm2
    mulsd xmm3,sinss_c0
    addsd xmm3,xmm1
         
    mulsd xmm0,xmm3
    cvtsd2ss xmm0,xmm0

    ret

sinss endp
OPTION PROLOGUE:prologuedef
OPTION EPILOGUE:epiloguedef


on an core2duo:
sine aproximation vs. FPU vs. VCRT (REAL4)
93      fpu
32      VCRT SSE2
35      SSE2
Press any key to continue ...


qWord
FPU in a trice: SmplMath
It's that simple!

jj2007

Hi qWord,
Which OS version are you using? On XP SP2, it complains "error while loading msvcrt-functions". Interesting is also that it crashes polink...:
CPU Disasm
Address   Hex dump          Command                                  Comments
00411AA5  |.  8345 F0 20    |add dword ptr [ebp-10], 20
00411AA9  |.  8B45 F0       |mov eax, [ebp-10]
00411AAC  |.  01C0          |add eax, eax
00411AAE  |.  01C0          |add eax, eax
00411AB0  |.  50            |push eax                                ; /Arg2
00411AB1  |.  8B45 F8       |mov eax, [ebp-8]                        ; |
00411AB4  |.  50            |push eax                                ; |Arg1 => [LOCAL.2]
00411AB5  |.  E8 960F0000   |call 00412A50                           ; \polink.00412A50
00411ABA  |.  83C4 08       |add esp, 8
00411ABD  |.  8945 F8       |mov [ebp-8], eax
00411AC0  |>  8B45 F4       |mov eax, [ebp-0C]
00411AC3  |.  8D50 01       |lea edx, [eax+1]
00411AC6  |.  8955 F4       |mov [ebp-0C], edx
00411AC9  |.  8B55 F8       |mov edx, [ebp-8]
00411ACC  |.  8B4D EC       |mov ecx, [ebp-14]
00411ACF  |.  890C82        |mov [eax*4+edx], ecx
00411AD2  |.  8B45 EC       |mov eax, [ebp-14]
00411AD5  |.  8B18          |mov ebx, [eax]                          ; Crashes here: can't read from 7473
00411AD7  |.  EB 76         |jmp short 00411B4F
00411AD9  |>  807B 14 00    |/cmp byte ptr [ebx+14], 0
00411ADD  |.  75 6E         ||jne short 00411B4D
00411ADF  |.  8D7D D3       ||lea edi, [ebp-2D]
00411AE2  |.  8D73 15       ||lea esi, [ebx+15]
00411AE5  |.  EB 06         ||jmp short 00411AED
00411AE7  |>  8A06          ||/mov al, [esi]
00411AE9  |.  8807          |||mov [edi], al
00411AEB  |.  47            |||inc edi
00411AEC  |.  46            |||inc esi
00411AED  |>  8A06          |||mov al, [esi]
00411AEF  |.  84C0          |||test al, al
00411AF1  |.  74 04         |||je short 00411AF7
00411AF3  |.  3C 24         |||cmp al, 24
00411AF5  |.^ 75 F0         ||\jne short 00411AE7
00411AF7  |>  C607 00       ||mov byte ptr [edi], 0
00411AFA  |.  8D4D E3       ||lea ecx, [ebp-1D]
00411AFD  |.  8D55 D3       ||lea edx, [ebp-2D]
00411B00  |.  31C0          ||xor eax, eax
00411B02  |>  8A02          ||/mov al, [edx]
00411B04  |.  8A21          |||mov ah, [ecx]
00411B06  |.  28C4          |||sub ah, al
00411B08  |.  75 08         |||jne short 00411B12
00411B0A  |.  84C0          |||test al, al
00411B0C  |.  74 09         |||je short 00411B17
00411B0E  |.  42            |||inc edx
00411B0F  |.  41            |||inc ecx
00411B10  |.^ EB F0         ||\jmp short 00411B02
00411B12  |>  19C0          ||sbb eax, eax                           ; Calculates sign(eax)
00411B14  |.  83D8 FF       ||sbb eax, -1
00411B17  |>  85C0          ||test eax, eax
00411B19  |.  75 32         ||jne short 00411B4D
00411B1B  |.  8B45 F4       ||mov eax, [ebp-0C]
00411B1E  |.  3B45 F0       ||cmp eax, [ebp-10]
00411B21  |.  75 1B         ||jne short 00411B3E
00411B23  |.  8345 F0 20    ||add dword ptr [ebp-10], 20
00411B27  |.  8B45 F0       ||mov eax, [ebp-10]
00411B2A  |.  01C0          ||add eax, eax
00411B2C  |.  01C0          ||add eax, eax
00411B2E  |.  50            ||push eax                               ; /Arg2
00411B2F  |.  8B45 F8       ||mov eax, [ebp-8]                       ; |
00411B32  |.  50            ||push eax                               ; |Arg1 => [LOCAL.2]
00411B33  |.  E8 180F0000   ||call 00412A50                          ; \polink.00412A50

qWord

I'm using vista ... thought that the msvcrt.dll is on all winsows system available .

EDIT:
maybe the function __libm_sse2_sinf is not available on your system due the used processor?
FPU in a trice: SmplMath
It's that simple!

qWord

ok - I've upload an new test bed in the first post
FPU in a trice: SmplMath
It's that simple!

jj2007

The dll is there, but GPA fails:
Quotestart:
   if 1
   xor edi, edi
   .if rv(LoadLibrary,"msvcrt.dll")
      mov esi, eax
      .if rv(GetProcAddress,esi,"__libm_sse2_sinf")
         mov edi, eax
      .else
         print "error in GPA",10,13
         inkey
         exit
      .endif
   .else
      print "error while loading the dll",10,13
      inkey
      exit
   .endif
   else
   .if !ASM(mov esi,rv(LoadLibrary,"msvcrt.dll")) || !ASM(mov edi,rv(GetProcAddress,esi,"__libm_sse2_sinf"))
      print "error while loading msvcrt-functions",10,13
      inkey
      exit
   .endif
   endif


Your second version, Celeron M:
__libm_sse2_sinf not available
sine aproximation vs. FPU vs. VCRT (REAL4)
98      fpu
52      SSE2


Fine work and very useful, compliments!

qWord

OK - now it works ... I should not use function that are extracted from DLL's export directory  :bg
FPU in a trice: SmplMath
It's that simple!

jj2007

A factor 2 faster than the FPU. But the latter needs 98 cycles also for REAL10 precision... speed vs accuracy.

qWord

Quote from: jj2007 on May 16, 2010, 10:04:30 PM
... speed vs accuracy.
well, I simply need it because I'm currently working on my easy-to-use-math-macros with SSE2 support. Using fpu instruction in interaction with sse2 instruction would simply take the speed advantage. Also in in many situation (especialy when using SIMD) high precision is not needed.
FPU in a trice: SmplMath
It's that simple!

Farabi

I dont know how it works but to obtain Sin value you can extract it from x position divided by the line lenght.
Those who had universe knowledges can control the world by a micro processor.
http://www.wix.com/farabio/firstpage

"Etos siperi elegi"

Ficko

I am just wondering that using decimal declaration forcing MASM to use some C lib to calculate the hex value this may cause some discrepancy in further calculations.


sinss_2_pi  REAL8  0.6366197723675813430755351  ; 2/pi


qWord

Quote from: Farabi on May 18, 2010, 07:15:06 AM
I dont know how it works but to obtain Sin value you can extract it from x position divided by the line lenght.
well, this doesn't help you if you only have an angle.

Quote from: Ficko on May 18, 2010, 11:05:38 AM
I am just wondering that using decimal declaration forcing MASM to use some C lib to calculate the hex value this may cause some discrepancy in further calculations.
I'm sure that the guys at m$ use the right function for convertion. Also, how would you then convert fp constants? With an calculator that also use some C lib function? :P
FPU in a trice: SmplMath
It's that simple!

clive

I suspect the point was that some numbers don't represent well in decimal, using the actual hex value coming out of the 80x87 would be the best way to retain precision, without going from binary->decimal->binary.


; Encoded as hexadecimals
ieeeshort REAL4 3F800000r ; 1.0 as IEEE short
ieeedouble REAL8 3FF0000000000000r ; 1.0 as IEEE long
temporary REAL10 3FFF8000000000000000r ; 1.0 as 10-byte real
It could be a random act of randomness. Those happen a lot as well.

qWord

ok, I understand what general problem you are talking about, but a quick test show that there is no difference betteen fpu generated constant an masm's one:
.data
align 16
sinss_2_pi  REAL8  0.6366197723675813430755351 
tmp REAL8 0.0
.code

fld1
fld1
faddp
fldpi
fdivr st,st(1)
fstp tmp
fstp st

ollydbg:
masm: 83 C8 C9 6D 30 5F E4 3F
fpu:  83 C8 C9 6D 30 5F E4 3F
FPU in a trice: SmplMath
It's that simple!

clive

Did something very similar

    fld1
    fld1
    faddp st(1),st ; 2
    fldpi
    fdivp st(1),st
    fstp qword ptr [bx] ; or tbyte ptr [bx]


Microsoft C 8.00c

80x87 value for PI - converted to 64-bit
3.1415926535897930000000000 - 400921FB54442D18r

2/pi using above
0.6366197723675814000000000 - 3FE45F306DC9C883r

qWord 25 dp value, using Microsoft C
0.6366197723675814000000000 - 3FE45F306DC9C883r

80x87 computation of 2.0 / PI at 80-bit, result 64-bit
0.6366197723675814000000000 - 3FE45F306DC9C883r

80x87 computation of 2.0 / PI at 80-bit
0.6366197723675813430760000 - 3FFEA2F9836E4E44152Ar

sizeof(double)       8
sizeof(long double) 10


Microsoft C 12.00 (32-bit)

80x87 value for PI - converted to 64-bit
3.1415926535897931000000000 - 400921FB54442D18r

2/pi using above
0.6366197723675813800000000 - 3FE45F306DC9C883r

qWord 25 dp value, using Microsoft C
0.6366197723675813800000000 - 3FE45F306DC9C883r

80x87 computation of 2.0 / PI at 80-bit, result 64-bit
0.6366197723675813800000000 - 3FE45F306DC9C883r

sizeof(double)       8
sizeof(long double)  8


25 decimal place doesn't help.
It could be a random act of randomness. Those happen a lot as well.

dedndave

#14
Quote0.6366197723675813430760000
i count 21 signifigant decimal digits, there, Clive
extended reals are good for 20, tops   :bg
(because it starts with a 6, i'd guess 19 in this case)

if evaluated to full binary representation, i get these values:
3FFE_A2F9836E_4E441529: +0.6366197723675813430221394340069451800445676781237125396728515625

3FFE_A2F9836E_4E44152A: +0.636619772367581343076349542631220401744940318167209625244140625

3FFE_A2F9836E_4E44152B: +0.6366197723675813431305596512554956234453129582107067108154296875

i guess i'd call it +0.6366197723675813431 - 19 digits