News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Fast Fourier Transform (SSE3)

Started by redskull, January 24, 2011, 02:25:07 PM

Previous topic - Next topic

redskull

Here is a version of the FFT I was working on.  The idea was to minimize the amount of local storage, so there's a lot of extra swapping going on, but the plus side is that it doesn't really use any additional space (other than the precalculated scaler values, which can be reused).  I kind of ran out of gas optimizing it, so feel free to rip it apart; I'm sure a lot of local vars could be kept in registers, and things like that.  I'm more interested in high-level algo optimization, and not the instruction-level stuff, but all comments are welcome.  It will need some tailoring if you use it in a real application.

-r

Thanks to Antariy for his comments, and JJ for MasmBasic (used in the testbed)


include \masm32\MasmBasic\MasmBasic.inc

;.686
;.MMX
;.XMM
;.MODEL FLAT, stdcall
;option casemap:none
;include \prog\include\windows.inc
;include \prog\include\kernel32.inc
;includelib \prog\lib\kernel32.lib

COMPLEX STRUCT          ; Complex Number
   real    REAL4 ?
   imag    REAL4 ?
COMPLEX ENDS


FFT_DATA STRUCT         ; Common Data
  pData   DWORD ?         ; Address of data area
  len     DWORD ?         ; Original Length
  pWoData DWORD ?         ; pointer to Wo Area
FFT_DATA ENDS

pFFTData typedef PTR FFT_DATA
pComplex typedef PTR COMPLEX
pReal4   typedef PTR REAL4

CaclWo  PROTO pWoData:pReal4, len:DWORD
FFTSSE3 PROTO input_data:pComplex, leng:DWORD
FFTRecurse  PROTO pGlobal:pFFTData, beg:pComplex, step:DWORD

PrintSamples PROTO input_data:pComplex, leng:DWORD

.data


TestData2   COMPLEX <1.0,2.0>,<3.0,4.0> ; { <4+j6>, <-2-2j> }
TestData4   COMPLEX <1.0,2.0>,<3.0,4.0>,<5.0,6.0>,<7.0,8.0> ; { { <16+j20> <-8+j0> <-4-j4> <0-j8> }}
SqaurePulse COMPLEX <1.0, 0.0>,<1.0, 0.0>,<1.0, 0.0>,<1.0, 0.0>,<0.5, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.0, 0.0>,<0.5, 0.0>,<1.0, 0.0>,<1.0, 0.0>,<1.0, 0.0>



;.code
;start:
    Init
   
    invoke FFTSSE3, ADDR TestData2, LENGTHOF TestData2
    cmp eax,0
    je @F
   
    invoke FFTSSE3, ADDR TestData4, LENGTHOF TestData4
    cmp eax,0
    je @F
   
    invoke FFTSSE3, ADDR SqaurePulse, LENGTHOF SqaurePulse
    cmp eax,0
    je @F
   
    invoke PrintSamples, ADDR TestData2, LENGTHOF TestData2
    invoke PrintSamples, ADDR TestData4, LENGTHOF TestData4
    invoke PrintSamples, ADDR SqaurePulse, LENGTHOF SqaurePulse
   
   
@@: mov eax,0
    ret
   


PrintSamples PROC input_data:pComplex, leng:DWORD

    pusha
    mov esi,0
    mov edi,input_data


@@:
    lea ebx,[edi+esi*SIZEOF COMPLEX]
   
   
    fld [ebx].COMPLEX.real
    Print Str$(ST(0))
    fstp [ebx].COMPLEX.real

    fld [ebx].COMPLEX.imag
    Print " + j"
    Print Str$(ST(0))
    fstp [ebx].COMPLEX.imag

    Print CrLf$
   
    inc esi
    cmp esi, leng
    jl @B

    Print CrLf$
    Print CrLf$
    popa
    ret
   
PrintSamples ENDP




FFTSSE3 PROC input_data:pComplex, leng:DWORD

    LOCAL common_data:FFT_DATA      ; Data common to all recursions

; Check for SSE3 instruction support
    mov eax,1
    cpuid
    and ecx,01h
    jz fftfail
   

; Make sure sample size is a power of two
    mov eax, leng
    cmp eax,0             ; x>0
    je fftfail
    mov ebx, eax          ; x && (x-1) == 0
    dec ebx
    and ebx, eax
    jne fftfail
    mov common_data.len, eax    ; save length

    invoke CaclWo, ADDR common_data.pWoData, eax    ;  Create a heap allocation for the values of Wo   
   
    mov eax, input_data
    mov common_data.pData, eax  ; save data pointer
   
   
; Align the stack on a 16-byte boundary
    mov eax,esp
    and eax,0Fh
    sub esp,eax

; Start the show!

    invoke FFTRecurse, ADDR common_data, input_data, 1


; Free the Wo calculation
    invoke GetProcessHeap
    invoke HeapFree, eax, 0, common_data.pWoData;

    mov eax, 1      ; return true
    ret
   
fftfail:
    mov eax,0
    ret


FFTSSE3 ENDP


; Actual recursion

FFTRecurse  PROC pGlobal:pFFTData, g0:pComplex, step:DWORD

    LOCAL step2:DWORD
    LOCAL h0:pComplex
    LOCAL N:DWORD
 
  ;  Calculate length based on step, save into N (2L/S)(S-1)
    mov eax, pGlobal
    mov eax, [eax].FFT_DATA.len
    mov edx,0
    div step
    mov N,eax

  ;  If length of data == 1, return
    cmp eax,1
    jne @F
    ret


@@: ; else, calculate DFT and do math



  ; Calculate Step*2 for start of h[] recurse
    mov eax,step
    add eax,eax
    mov step2,eax

    mov ebx, g0
    mov ecx, step
    lea eax, [ebx+ecx*(SIZEOF COMPLEX)]
    mov h0,eax

    invoke FFTRecurse, pGlobal, g0, step2       ; Calculate g[]
    invoke FFTRecurse, pGlobal, h0, step2       ; Calculate h[]

  ; Calculate Base Wo  {2(N/S)(S-1)}
    mov eax, pGlobal
    mov eax, [eax].FFT_DATA.len
    mov edx, 0
    mov ebx, step       ; ebx = S
    shl ebx, 1          ; ebx = 2S
    div ebx             ; eax = (TotN/2S)
    shr ebx, 1          ; ebx = S
    dec ebx             ; ebx = (S-1)
    mul ebx             ; eax = (TotN/2S)(S-1)
    shl eax,1           ; eax = 2(N/S)(S-1)
    mov ebx, pGlobal
    mov ebx, [ebx].FFT_DATA.pWoData
    lea eax, [ebx+eax*(SIZEOF COMPLEX)] ;eax = Pointer to w[0] for current N

  ; Setup pointers to h and g
    mov esi,g0
    mov edi,h0

  ; Setup counter
    mov ecx,N
    shr ecx,1   ; divide N by half (each h and g are N/2 long)

    mov ebx,step
    shl ebx,1       ; step by 2

FFTCalc:
    cmp ecx,0
    je  FFTSwap

  ;  Put H, G, and W on the stack
    push 0
    push [esi].COMPLEX.real  ; Gr
    push 0
    push [esi].COMPLEX.imag  ; Gi
    push [edi].COMPLEX.imag  ; Hi
    push [eax].COMPLEX.imag  ; Wi
    push [edi].COMPLEX.real  ; Hr
    push [eax].COMPLEX.real  ; Wr

  ;  Compute g+Wh, g-Wh

    movsldup xmm0,[esp+0] 
    movshdup xmm1,[esp+0] 
    shufps   xmm1,xmm1,0D8h
    mulps    xmm0,xmm1     
    movaps   xmm1,xmm0     
    shufps   xmm1,xmm1,0EBh
    addsubps xmm0,xmm1     
    shufps   xmm0,xmm0,005h
    movsldup xmm1,[esp+16]
    addsubps xmm1,xmm0     
    shufps   xmm1,xmm1,0D8h
    movaps [esp+0],xmm1   
   
    ; Unpack data into the original area
    mov edx,[esp+0]
    mov [edi].COMPLEX.imag,edx  ;-i
    mov edx,[esp+4]
    mov [edi].COMPLEX.real,edx  ;-r
    mov edx,[esp+8]
    mov [esi].COMPLEX.imag,edx  ;+i
    mov edx,[esp+12]
    mov [esi].COMPLEX.real,edx  ;+r

    ; calculate the next pointers
    add eax, (SIZEOF COMPLEX)   ; move to next Wo
    lea esi, [esi+ebx*(SIZEOF COMPLEX)]   ; move to next Go
    lea edi, [edi+ebx*(SIZEOF COMPLEX)]   ; move to next Ho

    add esp,(SIZEOF COMPLEX)*4  ; fix the stack

    dec ecx
    JMP FFTCalc

FFTSwap:

    mov ebx,0
    mov ecx,N       ; N
    sub ecx,2       ; (N-2)
    shr ecx,1       ; (N-2)/2 = outer counter

FFTSwapOuter:
        cmp ecx,0
        je  @F
   
        inc ebx
        mov eax,step
        mov edx,0
        mul ebx      ; eax=start*pos
        mov esi,g0   
        lea esi, [esi+eax*(SIZEOF COMPLEX)] ; esi is the 'starting element'
        mov edx,0
       
    FFTSwapInner:
        cmp edx,ecx
        je FFTSwapOuterTest
       
        mov eax,step
        lea edi, [esi+eax*(SIZEOF COMPLEX)]   ; load address of next element

        push [esi].COMPLEX.real ;push element n
        push [esi].COMPLEX.imag
       
        push [edi].COMPLEX.real ;push element n+1
        push [edi].COMPLEX.imag
       
        pop [esi].COMPLEX.imag ;pop element n+1 into n
        pop [esi].COMPLEX.real

        pop [edi].COMPLEX.imag ;pop element n into n+1
        pop [edi].COMPLEX.real

        lea esi, [edi+eax*(SIZEOF COMPLEX)]     ; esi points to 1 above edi (new base)

        inc edx
        jmp FFTSwapInner

FFTSwapOuterTest:
      dec ecx
      jmp FFTSwapOuter
   
@@: ret
FFTRecurse ENDP





CaclWo  PROC  pWoData:pReal4, leng:DWORD
    LOCAL r:DWORD           ; r
    LOCAL neg2pi:REAL4      ;-2*pi
    LOCAL theta:REAL4       ;-2*pi/N

    ; W= e^-j*2*pi*r/N

; Calculate size of allocation: (len - 1)
   
    mov eax, leng
    dec eax
    lea ecx, [eax*(SIZEOF COMPLEX)]
    invoke GetProcessHeap
    invoke HeapAlloc, eax, 0, ecx;
    mov edi,eax
    mov eax,pWoData
    mov [eax],edi           ; save the WoArea into the output pointer

; Precalulate -2*pi (ebx is still two)
    push 2
    fild DWORD PTR [esp]    ; load 2
    fchs                    ; -2
    add esp,4               ; ajust stack
    fldpi                   ; load pi
    fmulp st(1),st(0)       ; -2(pi)
    fstp neg2pi


; EBX-(r+4), EDX-len, EDI-pdata, ESI-r
   mov edx, leng

iloop:  ; while (N/2!=1)

    fld neg2pi   ;calculate a new theta
    mov leng,edx
    fild leng
    fdivp st(1),st(0)
    fstp theta

    mov esi,0     ; reinitialize 'r'
    mov eax,edx
    shr eax, 1

    wloop:                  ; while (r<N/2) r= 0-(len-1)
        cmp esi,eax
        jae  ilooptest

            fldz
            fld theta
            mov  r,esi
            fild r
            fmulp st(1),st(0)
            fadd st(1),st(0)
            fcos
            fstp [edi].COMPLEX.real
            fsin
            fstp [edi].COMPLEX.imag
            add edi, SIZEOF COMPLEX

            inc esi         ; loop for each length
            jmp wloop
ilooptest:
    shr edx,1   ; N = N / 2
    cmp edx,1   ; loop while N > 1
    jne iloop

    mov eax,-1  ; return true
    ret
CaclWo  ENDP



end start
Strange women, lying in ponds, distributing swords, is no basis for a system of government

baltoro

red,
Frankly, the Fast Fourier Transform scares the hell out of me. But, so incredibly useful.
Thanks for posting.
Baltoro

redskull

Quote from: baltoro on January 25, 2011, 01:37:14 AM
Frankly, the Fast Fourier Transform scares the hell out of me.

Being scared shitless of mathematics is the first sign you understand it  :toothy

Strange women, lying in ponds, distributing swords, is no basis for a system of government

baltoro

Funny. :bg
You know, I've read about the FFT being used in so many fascinating applications,...and, I've always intended to spend some time and really get acquainted with it,...and, never did. I must do some research. Thanks again,...it's the perfect example for a beginning effort.
Baltoro

redskull

Some light reading  :bdg  http://www.archive.org/stream/analyticaltheory00fourrich

But seriously, i doubt this is the perfect example; my comments are sub-par and the SSE stuff only makes it worse.  I would look for a higher level version, using C++ numerics or MATLAB or something; or start with with regular old continous Fourier transform, or even the fourier series.

-r
Strange women, lying in ponds, distributing swords, is no basis for a system of government

jj2007

The FFT Wiki looks good, but I must admit it's way over my head.

Good to see you found a use for MasmBasic :bg
In case you need it, the Str$() macro handles various output precision:

Quote   Print Str$("The number PI is %Hf[/color]", PI)   ; precision H = 17 digits (I=18 works fine, J may yield rubbish)
Allowed are also all kinds of variables, except MMX regs. But these lines are perfectly valid:
Quotemovlps xmm1, MyReal8
   Print Str$("PI*1000 is %Hf\n", PI*MyReal8)
   Print Str$("PI*1000 is %Hf\n", PI*f:xmm1)
(note the f: telling Str$() that xmm1 is to be interpreted as a float here. No f: means use it as an integer. Same rule for xmm regs in the deb macro)

vanjast

This is really exciting stuff.. if you're that way inclined... that is  :green2
I'm busy with a hardware project involving this type of stuff, and had to see if it was feasable.
So, I wrote a quick proggy to test my ideas..

Essentially I'm transmitting a signal as in the first pic (Input signal).
I made an FFT (DSP) filter which looks really funny, and with a bit of convoluted maths I end up with filtered signals in the top graph.


BUT, the big thing is that the input signals are never perfect, to I added random noise to the input signal which now looks a complete mess.
And the 'Bunny out the Hat' is that the signal peaks are still there... This is just a software filter of one kind, and you can do a gazillion other goodies with DSP.
The FFT makes it real fast, wrt numerical computation.

FORTRANS

Quote from: baltoro on January 25, 2011, 01:59:00 AM
Funny. :bg
You know, I've read about the FFT being used in so many fascinating applications,...and, I've always intended to spend some time and really get acquainted with it,...and, never did. I must do some research. Thanks again,...it's the perfect example for a beginning effort.

Hi,

   I have used FFT's for image processing.  Probably a good
test bed for learning the ins and outs of how they work.  And
I tried using them for convolving functions together to combine
effectiveness measures.  That got bogged down as the complexity
got out of hand.  (For my fuzzy brain at least.)

   I have tried coding up a DFT and FFT in the past, but fell
back to using existing code for practicality reasons.  But trying
to code them up helped to understand what was being done.

   The last course I attended (on lasers) used Fourier transforms
to explain some functionality, and my prior playing around (and/or
using them) helped in understanding what was being described.
Such as the spectral content of a short laser pulse.

   So, try using an FFT on some problem, the experience can
prove useful in understanding another problem.  Image processing
provides visual feedback that I could understand.  YMMV.

Regards,

Steve N.

baltoro

When I was spending time researching Climate Change, I would run into studies that performed FFT on long time series of proxy data (ice cores, deep sea cores, radiocarbon sources), looking for inherent periodicities in the data chaos that could then be used as a comparison with other climate and solar data. The concept in the abstract I understood, but, the actual mathematics was way beyond my intellect. Many of the models were generated by computers.   
I think that's why Global Climate Change scares people,...because, they know intuitively that they can't understand the analysis, but, it's real (and, unfortunately is more or less background noise in a person's life).
Baltoro

Neo

4-way vectorize it, man.  Just use a radix-4 FFT method so that you can simply split the array into 4, interlace it, do 4 concurrent FFTs (optionally with a different radix), then recombine them.  It should be almost a perfect 4x speedup over using scalar SSE operations, and more like a 10x or more speedup over FPU code.  :U

Edit: Oops! The last part of the title didn't show up on the main page and I saw FPU code, so I jumped, lol.  Oh well.  :lol

redskull

Quote from: Neo on January 26, 2011, 09:24:59 AM
..use a radix-4 FFT method...

That's the next project on the list  :toothy
Strange women, lying in ponds, distributing swords, is no basis for a system of government

Farabi

I dont understand it but it is amazing  :U
Those who had universe knowledges can control the world by a micro processor.
http://www.wix.com/farabio/firstpage

"Etos siperi elegi"

baltoro

Quote from: REDSKULLThat's the next project on the list.  :toothy
Wow, Neo comes up with some mind-boggling stuff.   :dazzled:  :dazzled:
Just out of curiosity, redskull, what are you using FFT for (in real life,...)?
Baltoro

redskull

Quote from: baltoro on January 27, 2011, 12:52:43 AM
Just out of curiosity, redskull, what are you using FFT for (in real life,...)?

In day-to-day life, not really anything; I don't do a whole lot of DSP, so I usually end up grinding out the continous fourier transform with paper and pencil, or just using whatever is in MATLAB.  This was basically just a way to refamiliarize myself with the SSE instructions.  But here's a short list of things I can recall doing at some point: solving partial differential equations, filter design, receiver design, edge detection in images... basically, it's faster to convolve functions by transforming, multiplying, and inverse-transforming.  But either way, it's the best way to get a computer to do the fourier transform, and the fourier transform really is what drives almost all our technology, so it's certainly not effort wasted.

-r
Strange women, lying in ponds, distributing swords, is no basis for a system of government

Neo

Quote from: baltoro on January 27, 2011, 12:52:43 AM
Quote from: REDSKULLThat's the next project on the list.  :toothy
Wow, Neo comes up with some mind-boggling stuff.   :dazzled:  :dazzled:
It's the quantum physics simulation that's driven me mind-boggly.  :wink

I started implementing a radix-4 FFT and inverse FFT to compute some crazy correlation function before I realized that for the particular thing the correlation function was in turn being used to compute, it really only needed: bit-shift, XOR, sum the bits, repeat a bunch of times.  That ended up being over 10x faster, whereas the vectorized FFT would've only been 4x faster.

Don't even get me started on eigenvectors and eigenvalues... unless you really need giant sparse matrices diagonalized.