Here is a version of the FFT I was working on. The idea was to minimize the amount of local storage, so there's a lot of extra swapping going on, but the plus side is that it doesn't really use any additional space (other than the precalculated scaler values, which can be reused). I kind of ran out of gas optimizing it, so feel free to rip it apart; I'm sure a lot of local vars could be kept in registers, and things like that. I'm more interested in high-level algo optimization, and not the instruction-level stuff, but all comments are welcome. It will need some tailoring if you use it in a real application.
-r
Thanks to Antariy for his comments, and JJ for MasmBasic (used in the testbed)
include \masm32\MasmBasic\MasmBasic.inc
;.686
;.MMX
;.XMM
;.MODEL FLAT, stdcall
;option casemap:none
;include \prog\include\windows.inc
;include \prog\include\kernel32.inc
;includelib \prog\lib\kernel32.lib
COMPLEX STRUCT ; Complex Number
real REAL4 ?
imag REAL4 ?
COMPLEX ENDS
FFT_DATA STRUCT ; Common Data
pData DWORD ? ; Address of data area
len DWORD ? ; Original Length
pWoData DWORD ? ; pointer to Wo Area
FFT_DATA ENDS
pFFTData typedef PTR FFT_DATA
pComplex typedef PTR COMPLEX
pReal4 typedef PTR REAL4
CaclWo PROTO pWoData:pReal4, len:DWORD
FFTSSE3 PROTO input_data:pComplex, leng:DWORD
FFTRecurse PROTO pGlobal:pFFTData, beg:pComplex, step:DWORD
PrintSamples PROTO input_data:pComplex, leng:DWORD
.data
TestData2 COMPLEX <1.0,2.0>,<3.0,4.0> ; { <4+j6>, <-2-2j> }
TestData4 COMPLEX <1.0,2.0>,<3.0,4.0>,<5.0,6.0>,<7.0,8.0> ; { { <16+j20> <-8+j0> <-4-j4> <0-j8> }}
SqaurePulse COMPLEX <1.0, 0.0>,<1.0, 0.0>,<1.0, 0.0>,<1.0, 0.0>,<0.5, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.5, 0.0>,<1.0, 0.0>,<1.0, 0.0>,<1.0, 0.0>
;.code
;start:
Init
invoke FFTSSE3, ADDR TestData2, LENGTHOF TestData2
cmp eax,0
je @F
invoke FFTSSE3, ADDR TestData4, LENGTHOF TestData4
cmp eax,0
je @F
invoke FFTSSE3, ADDR SqaurePulse, LENGTHOF SqaurePulse
cmp eax,0
je @F
invoke PrintSamples, ADDR TestData2, LENGTHOF TestData2
invoke PrintSamples, ADDR TestData4, LENGTHOF TestData4
invoke PrintSamples, ADDR SqaurePulse, LENGTHOF SqaurePulse
@@: mov eax,0
ret
PrintSamples PROC input_data:pComplex, leng:DWORD
pusha
mov esi,0
mov edi,input_data
@@:
lea ebx,[edi+esi*SIZEOF COMPLEX]
fld [ebx].COMPLEX.real
Print Str$(ST(0))
fstp [ebx].COMPLEX.real
fld [ebx].COMPLEX.imag
Print " + j"
Print Str$(ST(0))
fstp [ebx].COMPLEX.imag
Print CrLf$
inc esi
cmp esi, leng
jl @B
Print CrLf$
Print CrLf$
popa
ret
PrintSamples ENDP
FFTSSE3 PROC input_data:pComplex, leng:DWORD
LOCAL common_data:FFT_DATA ; Data common to all recursions
; Check for SSE3 instruction support
mov eax,1
cpuid
and ecx,01h
jz fftfail
; Make sure sample size is a power of two
mov eax, leng
cmp eax,0 ; x>0
je fftfail
mov ebx, eax ; x && (x-1) == 0
dec ebx
and ebx, eax
jne fftfail
mov common_data.len, eax ; save length
invoke CaclWo, ADDR common_data.pWoData, eax ; Create a heap allocation for the values of Wo
mov eax, input_data
mov common_data.pData, eax ; save data pointer
; Align the stack on a 16-byte boundary
mov eax,esp
and eax,0Fh
sub esp,eax
; Start the show!
invoke FFTRecurse, ADDR common_data, input_data, 1
; Free the Wo calculation
invoke GetProcessHeap
invoke HeapFree, eax, 0, common_data.pWoData;
mov eax, 1 ; return true
ret
fftfail:
mov eax,0
ret
FFTSSE3 ENDP
; Actual recursion
FFTRecurse PROC pGlobal:pFFTData, g0:pComplex, step:DWORD
LOCAL step2:DWORD
LOCAL h0:pComplex
LOCAL N:DWORD
; Calculate length based on step, save into N (2L/S)(S-1)
mov eax, pGlobal
mov eax, [eax].FFT_DATA.len
mov edx,0
div step
mov N,eax
; If length of data == 1, return
cmp eax,1
jne @F
ret
@@: ; else, calculate DFT and do math
; Calculate Step*2 for start of h[] recurse
mov eax,step
add eax,eax
mov step2,eax
mov ebx, g0
mov ecx, step
lea eax, [ebx+ecx*(SIZEOF COMPLEX)]
mov h0,eax
invoke FFTRecurse, pGlobal, g0, step2 ; Calculate g[]
invoke FFTRecurse, pGlobal, h0, step2 ; Calculate h[]
; Calculate Base Wo {2(N/S)(S-1)}
mov eax, pGlobal
mov eax, [eax].FFT_DATA.len
mov edx, 0
mov ebx, step ; ebx = S
shl ebx, 1 ; ebx = 2S
div ebx ; eax = (TotN/2S)
shr ebx, 1 ; ebx = S
dec ebx ; ebx = (S-1)
mul ebx ; eax = (TotN/2S)(S-1)
shl eax,1 ; eax = 2(N/S)(S-1)
mov ebx, pGlobal
mov ebx, [ebx].FFT_DATA.pWoData
lea eax, [ebx+eax*(SIZEOF COMPLEX)] ;eax = Pointer to w[0] for current N
; Setup pointers to h and g
mov esi,g0
mov edi,h0
; Setup counter
mov ecx,N
shr ecx,1 ; divide N by half (each h and g are N/2 long)
mov ebx,step
shl ebx,1 ; step by 2
FFTCalc:
cmp ecx,0
je FFTSwap
; Put H, G, and W on the stack
push 0
push [esi].COMPLEX.real ; Gr
push 0
push [esi].COMPLEX.imag ; Gi
push [edi].COMPLEX.imag ; Hi
push [eax].COMPLEX.imag ; Wi
push [edi].COMPLEX.real ; Hr
push [eax].COMPLEX.real ; Wr
; Compute g+Wh, g-Wh
movsldup xmm0,[esp+0]
movshdup xmm1,[esp+0]
shufps xmm1,xmm1,0D8h
mulps xmm0,xmm1
movaps xmm1,xmm0
shufps xmm1,xmm1,0EBh
addsubps xmm0,xmm1
shufps xmm0,xmm0,005h
movsldup xmm1,[esp+16]
addsubps xmm1,xmm0
shufps xmm1,xmm1,0D8h
movaps [esp+0],xmm1
; Unpack data into the original area
mov edx,[esp+0]
mov [edi].COMPLEX.imag,edx ;-i
mov edx,[esp+4]
mov [edi].COMPLEX.real,edx ;-r
mov edx,[esp+8]
mov [esi].COMPLEX.imag,edx ;+i
mov edx,[esp+12]
mov [esi].COMPLEX.real,edx ;+r
; calculate the next pointers
add eax, (SIZEOF COMPLEX) ; move to next Wo
lea esi, [esi+ebx*(SIZEOF COMPLEX)] ; move to next Go
lea edi, [edi+ebx*(SIZEOF COMPLEX)] ; move to next Ho
add esp,(SIZEOF COMPLEX)*4 ; fix the stack
dec ecx
JMP FFTCalc
FFTSwap:
mov ebx,0
mov ecx,N ; N
sub ecx,2 ; (N-2)
shr ecx,1 ; (N-2)/2 = outer counter
FFTSwapOuter:
cmp ecx,0
je @F
inc ebx
mov eax,step
mov edx,0
mul ebx ; eax=start*pos
mov esi,g0
lea esi, [esi+eax*(SIZEOF COMPLEX)] ; esi is the 'starting element'
mov edx,0
FFTSwapInner:
cmp edx,ecx
je FFTSwapOuterTest
mov eax,step
lea edi, [esi+eax*(SIZEOF COMPLEX)] ; load address of next element
push [esi].COMPLEX.real ;push element n
push [esi].COMPLEX.imag
push [edi].COMPLEX.real ;push element n+1
push [edi].COMPLEX.imag
pop [esi].COMPLEX.imag ;pop element n+1 into n
pop [esi].COMPLEX.real
pop [edi].COMPLEX.imag ;pop element n into n+1
pop [edi].COMPLEX.real
lea esi, [edi+eax*(SIZEOF COMPLEX)] ; esi points to 1 above edi (new base)
inc edx
jmp FFTSwapInner
FFTSwapOuterTest:
dec ecx
jmp FFTSwapOuter
@@: ret
FFTRecurse ENDP
CaclWo PROC pWoData:pReal4, leng:DWORD
LOCAL r:DWORD ; r
LOCAL neg2pi:REAL4 ;-2*pi
LOCAL theta:REAL4 ;-2*pi/N
; W= e^-j*2*pi*r/N
; Calculate size of allocation: (len - 1)
mov eax, leng
dec eax
lea ecx, [eax*(SIZEOF COMPLEX)]
invoke GetProcessHeap
invoke HeapAlloc, eax, 0, ecx;
mov edi,eax
mov eax,pWoData
mov [eax],edi ; save the WoArea into the output pointer
; Precalulate -2*pi (ebx is still two)
push 2
fild DWORD PTR [esp] ; load 2
fchs ; -2
add esp,4 ; ajust stack
fldpi ; load pi
fmulp st(1),st(0) ; -2(pi)
fstp neg2pi
; EBX-(r+4), EDX-len, EDI-pdata, ESI-r
mov edx, leng
iloop: ; while (N/2!=1)
fld neg2pi ;calculate a new theta
mov leng,edx
fild leng
fdivp st(1),st(0)
fstp theta
mov esi,0 ; reinitialize 'r'
mov eax,edx
shr eax, 1
wloop: ; while (r<N/2) r= 0-(len-1)
cmp esi,eax
jae ilooptest
fldz
fld theta
mov r,esi
fild r
fmulp st(1),st(0)
fadd st(1),st(0)
fcos
fstp [edi].COMPLEX.real
fsin
fstp [edi].COMPLEX.imag
add edi, SIZEOF COMPLEX
inc esi ; loop for each length
jmp wloop
ilooptest:
shr edx,1 ; N = N / 2
cmp edx,1 ; loop while N > 1
jne iloop
mov eax,-1 ; return true
ret
CaclWo ENDP
end start
red,
Frankly, the Fast Fourier Transform scares the hell out of me. But, so incredibly useful.
Thanks for posting.
Quote from: baltoro on January 25, 2011, 01:37:14 AM
Frankly, the Fast Fourier Transform scares the hell out of me.
Being scared shitless of mathematics is the first sign you understand it :toothy
Funny. :bg
You know, I've read about the FFT being used in so many fascinating applications,...and, I've always intended to spend some time and really get acquainted with it,...and, never did. I must do some research. Thanks again,...it's the perfect example for a beginning effort.
Some light reading :bdg http://www.archive.org/stream/analyticaltheory00fourrich
But seriously, i doubt this is the perfect example; my comments are sub-par and the SSE stuff only makes it worse. I would look for a higher level version, using C++ numerics or MATLAB or something; or start with with regular old continous Fourier transform, or even the fourier series.
-r
The FFT Wiki (http://en.wikipedia.org/wiki/Fast_Fourier_transform) looks good, but I must admit it's way over my head.
Good to see you found a use for MasmBasic :bg
In case you need it, the Str$() macro handles various output precision:
Quote Print Str$("The number PI is %Hf[/color]", PI) ; precision H = 17 digits (I=18 works fine, J may yield rubbish)
Allowed are also all kinds of variables, except MMX regs. But these lines are perfectly valid:
Quotemovlps xmm1, MyReal8
Print Str$("PI*1000 is %Hf\n", PI*MyReal8)
Print Str$("PI*1000 is %Hf\n", PI*f:xmm1)
(note the f: telling Str$() that xmm1 is to be interpreted as a float here. No f: means use it as an integer. Same rule for xmm regs in the deb macro)
This is really exciting stuff.. if you're that way inclined... that is :green2
I'm busy with a hardware project involving this type of stuff, and had to see if it was feasable.
So, I wrote a quick proggy to test my ideas..
Essentially I'm transmitting a signal as in the first pic (Input signal).
I made an FFT (DSP) filter which looks really funny, and with a bit of convoluted maths I end up with filtered signals in the top graph.
(http://www.vanjast.com/IL2Pics/DSPPic1.JPG)
BUT, the big thing is that the input signals are never perfect, to I added random noise to the input signal which now looks a complete mess.
And the 'Bunny out the Hat' is that the signal peaks are still there... This is just a software filter of one kind, and you can do a gazillion other goodies with DSP.
The FFT makes it real fast, wrt numerical computation.
(http://www.vanjast.com/IL2Pics/DSPPic2.JPG)
Quote from: baltoro on January 25, 2011, 01:59:00 AM
Funny. :bg
You know, I've read about the FFT being used in so many fascinating applications,...and, I've always intended to spend some time and really get acquainted with it,...and, never did. I must do some research. Thanks again,...it's the perfect example for a beginning effort.
Hi,
I have used FFT's for image processing. Probably a good
test bed for learning the ins and outs of how they work. And
I tried using them for convolving functions together to combine
effectiveness measures. That got bogged down as the complexity
got out of hand. (For my fuzzy brain at least.)
I have tried coding up a DFT and FFT in the past, but fell
back to using existing code for practicality reasons. But trying
to code them up helped to understand what was being done.
The last course I attended (on lasers) used Fourier transforms
to explain some functionality, and my prior playing around (and/or
using them) helped in understanding what was being described.
Such as the spectral content of a short laser pulse.
So, try using an FFT on some problem, the experience can
prove useful in understanding another problem. Image processing
provides visual feedback that I could understand. YMMV.
Regards,
Steve N.
When I was spending time researching Climate Change, I would run into studies that performed FFT on long time series of proxy data (ice cores, deep sea cores, radiocarbon sources), looking for inherent periodicities in the data chaos that could then be used as a comparison with other climate and solar data. The concept in the abstract I understood, but, the actual mathematics was way beyond my intellect. Many of the models were generated by computers.
I think that's why Global Climate Change scares people,...because, they know intuitively that they can't understand the analysis, but, it's real (and, unfortunately is more or less background noise in a person's life).
4-way vectorize it, man. Just use a radix-4 FFT method so that you can simply split the array into 4, interlace it, do 4 concurrent FFTs (optionally with a different radix), then recombine them. It should be almost a perfect 4x speedup over using scalar SSE operations, and more like a 10x or more speedup over FPU code. :U
Edit: Oops! The last part of the title didn't show up on the main page and I saw FPU code, so I jumped, lol. Oh well. :lol
Quote from: Neo on January 26, 2011, 09:24:59 AM
..use a radix-4 FFT method...
That's the next project on the list :toothy
I dont understand it but it is amazing :U
Quote from: REDSKULLThat's the next project on the list. :toothy
Wow, Neo comes up with some mind-boggling stuff. :dazzled: :dazzled:
Just out of curiosity, redskull, what are you using FFT for (in real life,...)?
Quote from: baltoro on January 27, 2011, 12:52:43 AM
Just out of curiosity, redskull, what are you using FFT for (in real life,...)?
In day-to-day life, not really anything; I don't do a whole lot of DSP, so I usually end up grinding out the continous fourier transform with paper and pencil, or just using whatever is in MATLAB. This was basically just a way to refamiliarize myself with the SSE instructions. But here's a short list of things I can recall doing at some point: solving partial differential equations, filter design, receiver design, edge detection in images... basically, it's faster to convolve functions by transforming, multiplying, and inverse-transforming. But either way, it's the best way to get a computer to do the fourier transform, and the fourier transform really is what drives almost all our technology, so it's certainly not effort wasted.
-r
Quote from: baltoro on January 27, 2011, 12:52:43 AM
Quote from: REDSKULLThat's the next project on the list. :toothy
Wow, Neo comes up with some mind-boggling stuff. :dazzled: :dazzled:
It's the quantum physics simulation that's driven me mind-boggly. :wink
I started implementing a radix-4 FFT and inverse FFT to compute some crazy correlation function before I realized that for the particular thing the correlation function was in turn being used to compute, it really only needed: bit-shift, XOR, sum the bits, repeat a bunch of times. That ended up being over 10x faster, whereas the vectorized FFT would've only been 4x faster.
Don't even get me started on eigenvectors and eigenvalues... unless you really need giant sparse matrices diagonalized.
Quote from: NEO...eigenvectors and eigenvalues,...
:green Good thing you warned us...I was just getting juiced up for a long, incredibly annoying rant about the unconstrained usage of eigenstuff,... :green
For people like me,...any transformation beyond what you would use to balance your checkbook, is inherently destabilizing (in respect to the functionality of my cerebtal cortex).
Quote from: Neo on January 29, 2011, 09:27:09 AM
I started implementing a radix-4 FFT and inverse FFT to compute some crazy correlation function before I realized that for the particular thing the correlation function was in turn being used to compute, it really only needed: bit-shift, XOR, sum the bits, repeat a bunch of times. That ended up being over 10x faster, whereas the vectorized FFT would've only been 4x faster.
I was thinking along these lines.. but i see you've beaten me to it.
After looking at the FFT algorithm's implementation(not yours Red), I always thought that it was a bit cumbersome, and could be improved...
I'll tweak later.
:bg
You could speed it up by making the Bit Reversal section seperate from all the FFTs.. :green2
I've finished a version of FFT's based on FPU instructions (non mmx, SSE).
All these are direct translations of Steve Smiths 'Basic' Algorithms.. I'll post these later :bg
Here we go...
I haven't checked, optimised or run this yet, it's currently part of a bigger project so I'll only test this later.
There's probably a few things still to do..
I was going over this project today... and picked up a few mistakes with the FFT module, just a few incorrect store instructions that should be FIST(P) instead of FST(P) when saving to integer formats.. plus a few other funnies. Fixit later.
:8)