I have recently started learning SSE. Currently I'm trying to port this C code for the Goertzel algorithm (http://www.exstrom.com/journal/sigproc/).
The hardest part (the goerzel function) is finished, with mixed FPU/SSE code: the FPU is used to precompute the sines, and SSE for the main loop and the log10 on the output. But that is not my problem. I need to convert audio PCM samples to floats, flt[] = pcm[]/32768, and my timings are very bad: the SSE version is only marginally faster than the FPU one (timing with rdtsc; see the tick counts in the comments below).
Here's my code:
// PellesC code, console app
#include "stdio.h"
#include "stdlib.h"
#define shuffle(f0,f1, f2,f3) (f0 | f1<<2 | f2<<4 | f3<<6) // reversed _MM_SHUFFLE
#define IN
#define OUT
int g_enable_sse;
int sse1(void) // returns false if SSE is disabled or not available
{
int retval;
__asm
{
mov eax,1
cpuid
and edx,1<<25
mov retval,edx
}
return (retval && g_enable_sse);
}
void w2f(IN short *words,OUT float *floats, int count)
{
// time comparison using rdtsc for count=2205
// fpu: 15*** ticks
// sse: 11*** ticks
if (sse1())
{
__asm
{
push esi
push edi
push ebx
push ebp
movaps xmm1,[w2fdivider]
mov esi,[words]
mov edi,[floats]
mov ebp,[count]
cmp ebp,4
jge w2fnext4 ;// process 4 samples at a time, if (count>=4)
jmp w2fnext1 ;// process 1 sample at a time, if (count<4)
align 16
w2fdivider:
dd 32768.0, 32768.0, 32768.0, 32768.0 ; moved to .data by the compiler
w2fnext4:
;
; convert 4 words to 4 integers
;
movsx eax,word[esi+0]
movsx ebx,word[esi+2]
movsx ecx,word[esi+4]
movsx edx,word[esi+6]
mov dword[esp-16],eax ;// +0
mov dword[esp-12],ebx ;// +4
mov dword[esp- 8],ecx ;// +8
mov dword[esp- 4],edx ;// +12
;
; convert 4 integers to 4 floats
;
cvtpi2ps xmm0,[esp-16];// load two integers to low order floats
movlhps xmm0,xmm0 ;// copy to high order floats
cvtpi2ps xmm0,[esp-8] ;// load two integers to low order floats
shufps xmm0,xmm0,shuffle(2,3,0,1) ;// update order
;
; scale to the range -1,+1
;
divps xmm0,xmm1
;
; store 4 floats from xmm0 to output buffer
movaps [edi],xmm0
add esi,8
add edi,16
sub ebp,4
jz w2fquit
cmp ebp,4
jg w2fnext4
;-------------------------------------------
;
; see if something is left (up to 3 words)
;
test ebp,ebp
jz w2fquit
w2fnext1:
; load single word
movsx eax,word[esi+0]
mov dword[esp-16],eax
cvtsi2ss xmm0,[esp-16]
divss xmm0,xmm1
movss [edi],xmm0
add esi,2
add edi,4
sub ebp,1
jnz w2fnext1
w2fquit:
pop ebp
pop ebx
pop edi
pop esi
}
}
else
{
while (count--)
{
floats[count] = words[count]/32768.0f;
}
}
}
void __rdtsc(unsigned __int64 *pout)
{
__asm {
mov ecx,pout
rdtsc
mov [ecx],eax
mov [ecx+4],edx
}
}
int main(void)
{
int samplesPerPacket = 2200;
char *memo = (char*)malloc(samplesPerPacket * (sizeof(float) + sizeof(short)) + 16);
float *floats = (float*) ((int)(memo+16) & 0xFFFFFFF0);
short *pcm = (short*) &floats[samplesPerPacket];
// load something to pcm array. Done, we have some trash
// call w2f to convert pcm to floats
int a;
unsigned __int64 time1, time2, time3;
g_enable_sse = 0;
time3=0ll;
for (a=0; a<100; a++)
{
__rdtsc(&time1);
w2f(pcm, floats, samplesPerPacket);
__rdtsc(&time2);
time3 = (time3 + (time2-time1)) / 2;
}
printf("FPU: %d ticks\n", (int)time3);
g_enable_sse = 1;
if (sse1())
{
time3=0ll;
for (a=0; a<100; a++)
{
__rdtsc(&time1);
w2f(pcm, floats, samplesPerPacket);
__rdtsc(&time2);
time3 = (time3 + (time2-time1)) / 2;
}
printf("SSE: %d ticks\n", (int)time3);
}
else
{
puts("sorry, no SSE found");
}
free(memo);
return 0;
}
For what you intend to achieve, it is not necessary to use the FPU for converting WORD integers to floats in the range of -1 to +1. I have just tested it and the following procedure is approximately 3x faster than using simple FPU divisions.
push ebx
push esi
push edi
mov ebx,count
lea esi,words
lea edi,floats
.while ebx != 0
lodsw
xor ecx,ecx
xor edx,edx
or eax,eax
jz endcal
.if SIGN?
neg eax
mov edx,80000000h
.endif
bsr cx,ax
push ecx
add cl,70h
ror ecx,9
add edx,ecx
pop ecx
neg cl
add cl,16
shl ax,cl
shl eax,7
add eax,edx
endcal:
stosd
dec ebx
.endw
I have no idea if such code could be converted to SSE instructions.
i am sure Jochen has written one using SSE
let me see if i can find the thread
no luck - we will have to wait for him to pop in
I have just tested it, and found that the sign test will never succeed, because "or eax,eax" is too wide; I think it should be "or ax,ax". But unfortunately, even with this fix-up, the subroutine produces wrong output for me, and the total speed is equal to the FPU version on my machine (Pentium M).
I have also tried replacing lodsw with movsx, so the whole eax register is loaded with the sign-extended word. The result was more accurate:
words floats
-55 -0.00335693
-11 -0.00067139
0 0.00000000
11 0.00067139
55 0.00335693
what is the range of input values ?
how many bits resolution in and out ?
The range is 32767 >= R >= -32768 (15-bit precision), as defined for a signed word. The normalization to +-1 (division by 32768) can be applied inside the DSP loop, so the word->float transform does not need to divide each word by 32768.
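As a rough illustration of deferring the normalization (this sketch and its names are mine, not code from the thread): convert the words to floats unscaled, then divide only the final result once per tone.
#include <stddef.h>
/* sketch: plain word->float conversion with no per-sample division */
void words_to_floats_unscaled(const short *words, float *x, int n)
{
    int i;
    for (i = 0; i < n; i++)
        x[i] = (float)words[i];      /* no /32768 here */
}
/* later, once per tone instead of once per sample: */
/* amplitude = raw_amplitude / 32768.0f; */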
Anyway, here is my NASM SSE port of the Goertzel code (the core of this project). It calculates the amplitudes of 4 frequencies at a time. The division by 32768 can be moved into this function, or even only the computed amplitude can be divided, to make it friendlier for the logarithm function.
[attachment deleted by admin]
Quote: "replace lodsw with movsx, leaving the whole eax register"
You are absolutely right. I should have tested it myself with an array before submitting that code.
this is a non-SSE approach
the advantage is that the scaling is combined with the float conversion
it still uses a loop, which leaves room for SSE to beat it, especially if you can use cvtdq2ps to convert 4 values at once
EDIT - a refined version - i swapped assignments of eax and ecx for a little reduction
movzx eax,word ptr InVal16s
or ax,ax
jz Scale1
mov ecx,8Fh
jns Scale0
neg ax
mov ch,1
Scale0: dec ecx
shl ax,1
jnc Scale0
; CH bit 0 = sign
; CL = exponent
; AX = fraction/128
shr ecx,1
rcr ax,1
ror ecx,8
shl eax,8
or eax,ecx
Scale1: mov OutValReal4,eax
After looking at the first post, there are at least two probably big optimizations:
mov dword[esp-16],eax ;// +0
mov dword[esp-12],ebx ;// +4
mov dword[esp- 8],ecx ;// +8
mov dword[esp- 4],edx ;// +12
cvtpi2ps xmm0,[esp-16];// load two integers to low order floats
movlhps xmm0,xmm0 ;// copy to high order floats
cvtpi2ps xmm0,[esp-8] ;// load two integers to low order floats
shufps xmm0,xmm0,shuffle(2,3,0,1) ;// update order
by reordering those first 4 mov's, you can avoid that shuffle.... right?
and then:
divps xmm0,xmm1
This can be converted into a multiplication by 1/32768, which will be so much faster you won't even believe it.
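In C intrinsics terms the change looks roughly like this (my sketch; the function name scale4 is mine, and the pointer is assumed 16-byte aligned):
#include <xmmintrin.h>
/* multiply 4 floats by 1/32768 instead of dividing by 32768 */
void scale4(float *v)                              /* v must be 16-byte aligned */
{
    __m128 recip = _mm_set1_ps(1.0f / 32768.0f);   /* 0.000030517578125 */
    _mm_store_ps(v, _mm_mul_ps(_mm_load_ps(v), recip));
}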
Quote from: dedndave on August 02, 2009, 02:45:31 AM
i am sure Jochen has written one using SSE
let me see if i can find the thread
no luck - we will have to wait for him to pop in
Dave, you have too much confidence in me :red
Ask Nightware - he is the true SSE2 guru here :bg
... and I see Rockoon has jumped on that train, too :bg
i want to be an SSE programmer, too :(
lol
i am reading up on it - once i get the registers straight, look out ! :bdg
now, you have to admit that my x86 routine looks pretty snappy - lol
meh.. this really isn't a good application for floating point SSE since the data isn't starting as floating point, or even in a convenient size. I assume it's being yanked off a 16-bit ADC of some kind (DirectSound record buffer?)
If it's performance-at-all-costs I would strongly consider a 13-bit lookup table (just use the upper 13 bits of the samples) for the conversion to floats via LUT, because that is still only 32KB of memory and will fit quite nicely in a 64KB L1 cache (almost all, if not all, modern processors have at least 64KB L1) .. the odds are that the lowest 3 bits from any analog source are plagued by noise anyway.
So something like:
@@do:
movzx eax, word [esi]
movzx ebx, word [esi + 2]
movzx ecx, word [esi + 4]
movzx edx, word [esi + 6]
shr eax, 3
shr ebx, 3
shr ecx, 3
shr edx, 3
mov eax, table[4 * eax]
mov ebx, table[4 * ebx]
mov ecx, table[4 * ecx]
mov edx, table[4 * edx]
mov [edi], eax
mov [edi + 4], ebx
mov [edi + 8], ecx
mov [edi + 12], edx
add esi, 8
add edi, 16
dec ebp
jnz @@do
This technique also has the nice property that the table can be loaded with just about anything, so this all can be made into a highly generic translator (or several, with different shift constants)
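For reference, here is one way such a 13-bit table could be filled (a sketch under my own names build_lut/lut, not from the post above); the index is the raw unsigned 16-bit sample shifted right by 3, matching the loop above:
/* 8192-entry table: index = (unsigned 16-bit sample) >> 3 */
static float lut[8192];
void build_lut(void)
{
    int i;
    for (i = 0; i < 8192; i++) {
        short s = (short)(i << 3);        /* restore the top 13 bits as a signed word */
        lut[i] = (float)s / 32768.0f;     /* pre-scaled to roughly -1..+1 */
    }
}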
Yes, it is processing wave files or sound captured from the mixer and so on. It will probably replace FFT, because I already have a GUI FFT program for "sound decompilation" - you select a part of the sound, and it shows which piano keys are active. But in order to keep good resolution for the lowest frequencies, the FFT must be 32768 samples big, and there is overhead because I do not need to measure 32768/2 frequencies, I need only 88. So this is why I'm trying the Goertzel algorithm.
Hopefully calling the GoerzelSSE subroutine 88/4 = 22 times will be even faster than FFTW. All the current FPU stuff will be removed, because I have 88 fixed frequencies to look up in the spectrum.
Rockoon, I have removed the shuffle and tried mulps, but the result was incorrect. A test DTMF decoder program displayed trash instead of successive "digits". Probably I need to increase the 32768 value, because the reciprocal of 32768 loaded into SSE is/may be too small. I'll check this. BTW 1/32768 as real32 (FPU) is working ok.
Dedndave, your code is working perfectly, thanks! I have never even tried to learn IEEE 754, but the time is coming.
floating point math is not at all difficult
Ray has a great tutorial that is easy to understand...
http://www.ray.masmcode.com/tutorial/index.html
for in-depth definition of floating formats...
http://dlc.sun.com/pdf/819-3693/819-3693.pdf
i wish we had an SSE tutorial like Ray's FPU tute - lol
let us know if the code is fast enough - if not, Rockoon may have the right idea - a look-up table
p.s. i am sure the SSE method is there, as well - just out of reach for me - lol
if you are recognizing notes from a keyboard, they are all harmonically related
once you have one octave going on, the rest are powers of 2
i might be inclined to look for a way to take advantage of the relationship of octaves to a computers' numbering system
i did a little work with lattice and FIR filters a long time ago, but have forgotten most of what i knew
keeping all the values in fixed-point integers comes to mind
you are multiplying and dividing values by constants - you might even be able to avoid MUL and DIV altogether
certainly, the 2*Pi/32768 could be made into one constant
I agree.
realW = 2.0*cos(2.0*pi*k/N);
imagW = sin(2.0*pi*k/N);
d1 = 0.0;
d2 = 0.0;
for (n=0; n<N; ++n)
realW and imagW (or the other variables - theta, CO, CO2, S0 from my attachment) will be precomputed for all tones.
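For readers following along, the usual completion of that loop (my reconstruction of the standard Goertzel recursion, not the code from the attachment) would look something like:
#include <math.h>
#define PI 3.14159265358979323846
/* Goertzel amplitude of tone k over N samples x[] -- standard-form sketch */
double goertzel_amp(const float *x, int N, int k)
{
    double realW = 2.0 * cos(2.0 * PI * k / N);
    double imagW = sin(2.0 * PI * k / N);
    double d1 = 0.0, d2 = 0.0, y, re, im;
    int n;
    for (n = 0; n < N; ++n) {
        y  = x[n] + realW * d1 - d2;   /* second-order recursion */
        d2 = d1;
        d1 = y;
    }
    re = 0.5 * realW * d1 - d2;        /* real part of bin k */
    im = imagW * d1;                   /* imaginary part of bin k */
    return sqrt(re * re + im * im);    /* amplitude of tone k */
}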
The FFT program I wrote, which I want to optimize with Goertzel, can be downloaded from http://www.ionicwind.com/forums/index.php?topic=3472.0 - this is my topic and idea. It requires the FFTW (http://www.fftw.org/) binaries.
multiplying to divide is nothing new
you are simply multiplying by the reciprocal of the constant
the MUL instructions are somewhat faster than the DIV instructions, so that technique gets a lot of mileage
unsigned multiplication by a constant is even better
this is a simplified example
if we want to multiply by 10, that is 1010 in binary
so, it's shift and add, shift and add
the shift/add pattern matches the 1010 constant pattern
movzx eax,word ptr InputVal16u
shl eax,1 ;2X in eax
mov edx,eax ;2X in edx
shl eax,2 ;8X in eax
add eax,edx ;10X in eax
Hutch showed me a trick to speed that up
movzx eax,word ptr InputVal16u
lea eax,[eax+4*eax] ;5X in eax
shl eax,1 ;10X in eax
by selecting the right shift/add pattern, you can multiply by any value, so long as you are aware of overflow
combining the multiply-to-divide and hard-coded multiplication by constants methods can eliminate all MULs and DIVs
you are not really too concerned over a slight loss in resolution, which makes it a great application for the methods
shifts and adds are VERY fast, especially register to register
the entire DFT would be blazingly fast
Quote from: akane on August 02, 2009, 10:41:25 PM
Yes, it is processing wave files or sound captured from the mixer and so on. It will probably replace FFT, because I already have a GUI FFT program for "sound decompilation" - you select a part of the sound, and it shows which piano keys are active. But in order to keep good resolution for the lowest frequencies, the FFT must be 32768 samples big
Aha.. well now! This is a much more attackable problem.
Did you know that if you downsample that 32768-sample buffer to, say, 4096 samples, you still get to measure those low frequencies?
They will just appear as if they are 3 octaves higher (their wavelength will be 1/8th as long as originally). The downsampling will destroy the highest frequency information and cause some aliasing in the new highest frequencies, but the low frequency information will remain intact.
Downsampling is a form of low-pass filter.
http://en.wikipedia.org/wiki/Downsampling
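As a rough illustration of the idea (this sketch is mine, not from the post; it uses a simple box average as a crude anti-alias step, where a real decimator would use a proper low-pass filter):
/* crude decimate-by-8: average each group of 8 samples, keep one output per group */
void downsample8(const float *in, float *out, int outCount)
{
    int i, j;
    for (i = 0; i < outCount; i++) {
        float sum = 0.0f;
        for (j = 0; j < 8; j++)
            sum += in[i * 8 + j];
        out[i] = sum / 8.0f;
    }
}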
there ya go - that's your octave switch - only look at half the samples to switch down an octave
set up your 12 notes for one octave and you have it made - lol
in my routine, the
mov ecx,8Fh
sets the biased exponent - 8Fh is for a sample count of N=32768 (it gets decremented to 8Eh for -32768)
for N=16384, change the 8Fh to 90h
for N=8192, use 91h
for N=4096, use 92h, and so on
put it in a variable or calculate it based on the number of samples
mov ecx,SampleCountExponent
Quote: "sets the biased exponent - 8Fh is for a sample count of N=32768"
You mean sample amplitude, right? Because the 1/32768 in your code is used to decrease the amplitude of all samples, in order to keep a constant output level from the DSP routine:
words[] = from_wave()
floats[] = words[]/32768
complex[] = dsp(floats, count, samplerate)
Amplitude scaling to +-1 or +-2 (PEP) is sometimes important for custom sin/cos routines. You never know which algorithm is used for the sine, whereas the FPU accepts any value. For example, a raw Taylor polynomial for sine only accepts arguments up to about +-PI.
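To illustrate why the argument range matters (my own example, not from the thread), a truncated Taylor series is only usable near zero, roughly within +-PI:
/* 7th-order Taylor approximation of sin(x); reasonable for |x| <= PI,
   increasingly wrong beyond that */
float sin_taylor(float x)
{
    float x2 = x * x;
    return x * (1.0f - x2 / 6.0f * (1.0f - x2 / 20.0f * (1.0f - x2 / 42.0f)));
}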
my bad - i thought that was the number of samples
The version with a float lookup table is much faster than the one with "manual float creation". All the floats in the table are already divided by 32768:
movzx eax,word[esi+0]
movzx edx,word[esi+2]
mov eax,[g_floatlookup+eax*4]
mov edx,[g_floatlookup+edx*4]
mov [edi+0],eax
mov [edi+4],edx
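The table itself could be filled like this (my sketch; the post does not show the initialization, and init_floatlookup is my name). The index is the raw unsigned 16-bit sample; the value is the signed sample already divided by 32768:
float g_floatlookup[65536];
void init_floatlookup(void)
{
    int i;
    for (i = 0; i < 65536; i++)
        g_floatlookup[i] = (float)(short)i / 32768.0f;  /* reinterpret as signed, then scale */
}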
The SSE version will be faster if SSE2 is available - instead of cvtpi2ps+movlhps+cvtpi2ps we need only CVTDQ2PS xmm,mem; it converts 4 integers to floats at once.
Funny, Olly v1 does not recognize CVTDQ2PS as an instruction.
A table lookup will always be the fastest if it fits in L1.
Out of curiosity, I timed some of the algos with QueryPerformanceCounter on my old P4, for 1000 repeats of each algo. The reported counts are corrected for the overhead of merely counting the 1000 iterations down to 0, which was 10 counts on my system. (Simply invoking QueryPerformanceCounter twice in succession results in a difference of 6 counts.)
Keeping the 32768 as the divisor on the FPU, the sequence
fild word + fdiv st,st(1) + fstp mem requires 87 counts.
Keeping the reciprocal of 32768 as the multiplier on the FPU, the sequence
fild word + fmul st,st(1) + fstp mem requires 15 counts.
My suggested algo for converting word integers to a float in the range of -1 to +1 requires 47 counts.
None of the above are sensitive to the value of the integer.
dedndave's suggested algo to convert a word integer to a float is very sensitive to the size of the integer; the smaller it is, the more time it requires.
For word=4321, 38 counts
For word=432, 62 counts
For word=43, 95 counts
For word=4, 125 counts.
The identical result to dedndave's algo can be obtained with the following FPU sequence
fild word + fstp mem which only requires 15 counts and is not sensitive to the word value.
Surprisingly, that time was the same as when the word was also multiplied by the reciprocal before storing it.
i knew it wasn't blazing - lol
did you see how fast it is for zero !!! ::)
i think it is as fast as it gets with no FPU or SSE
the loop kills it
to be fair, a "typical" value for a sine wave should be more than 8192 away from zero
the RMS of a full-range sine wave is ~ +/-11584
that means that the loop will typically only make 3 passes
if we assume that the levels of the complex waveform cut that in half, it is 4 passes
that means it's a pretty good guess-timation that the 38-62 counts will be nominal for my routine
you're biased, Ray - lol (pun)
it's a moot point, really, as neither method can beat the SSE2 instructions
if someone wanted to spend the time on the code, no floating point math would be required at all
it would be as fast as a bullet up to the complex, which would require some bit-twiddling
i don't think it would require any loops, though
after looking at some of the SSE instructions, i can see it would be difficult to beat their times
they are FAST ! - lol
no use in messing with the fixed point methods
i learned 2 new instructions, BSWAP and BSR/BSF - well, new for me at least - lol
because i am a bit-twiddler, i really like them
i used them to eliminate the loop and speed up the final positioning of bits
i gave preference to values nearest to 0, of course
movzx eax,word ptr InVal16s
or ax,ax
jz Scale2
mov cx,16
jns Scale0
neg ax
mov ch,1
Scale0: bsr dx,ax
cmp dl,7
ja Scale1
xchg al,ah
mov cl,8
Scale1: sub cl,dl
add dl,7Fh
mov dh,ch
shl eax,cl
shr edx,1
rcr ax,1
xchg al,dl
bswap eax
mov ah,dl
Scale2: mov OutValReal4,eax
some fair test values might be:
8191
4095
1023
255
-255
-1023
-4095
-8191
i made a comparison between Ray's code and my new code
for Ray's method, i made 2 measurements; one with the multiplier resident and one non-resident
the program with source code is attached
resident method:
finit
fld real10 ptr Recip
;start timer loop
fild word ptr InVal16s
fmul st,st(1)
fstp real4 ptr OutValReal4
;stop timer loop
non-resident method:
finit
;start timer loop
fld real10 ptr Recip
fild word ptr InVal16s
fmul st,st(1)
fstp real4 ptr OutValReal4
ffree st
;stop timer loop
it looks like the aging newbie/FPU guy wins again :U
Total System Processor Cores: 2
CPU 0: Intel(R) Pentium(R) 4 CPU 3.00GHz MMX SSE3 Cores: 2
Aging n00b/bit-twiddler guy new method:
8191: 22 21 22 clock cycles
4095: 21 22 22 clock cycles
1023: 22 22 21 clock cycles
255: 22 22 23 clock cycles
-255: 25 26 26 clock cycles
-1023: 25 25 25 clock cycles
-4095: 24 25 24 clock cycles
-8191: 24 24 24 clock cycles
Aging newbie/FPU guy method (resident multiplier):
8191: 7 7 7 clock cycles
4095: 7 7 7 clock cycles
1023: 7 7 7 clock cycles
255: 7 7 7 clock cycles
-255: 7 7 7 clock cycles
-1023: 7 7 7 clock cycles
-4095: 7 7 7 clock cycles
-8191: 7 7 7 clock cycles
Aging newbie/FPU guy method (non-resident multiplier):
8191: 15 15 15 clock cycles
4095: 15 15 15 clock cycles
1023: 15 15 15 clock cycles
255: 15 15 15 clock cycles
-255: 15 15 15 clock cycles
-1023: 15 15 15 clock cycles
-4095: 15 15 15 clock cycles
-8191: 15 15 15 clock cycles
[attachment deleted by admin]
Strange, i get this when i run it on my E5400:
Total System Processor Cores: 2
CPU 0: Intel(R) Pentium(R) CPU E5400 @ 2.70GHz MMX SSSE3 Cores: 2
Aging n00b/bit-twiddler guy new method:
8191: 16 16 16 clock cycles
4095: 16 16 16 clock cycles
1023: 16 16 16 clock cycles
255: 17 17 17 clock cycles
-255: 17 17 17 clock cycles
-1023: 16 16 16 clock cycles
-4095: 16 16 16 clock cycles
-8191: 16 16 16 clock cycles
Aging newbie/FPU guy method (resident multiplier):
8191: 0 0 0 clock cycles
4095: 0 0 0 clock cycles
1023: 0 0 0 clock cycles
255: 0 0 0 clock cycles
-255: 0 0 0 clock cycles
-1023: 0 0 0 clock cycles
-4095: 0 0 0 clock cycles
-8191: 0 0 0 clock cycles
Aging newbie/FPU guy method (non-resident multiplier):
8191: 0 0 0 clock cycles
4095: 0 0 0 clock cycles
1023: 0 0 0 clock cycles
255: 0 0 0 clock cycles
-255: 0 0 0 clock cycles
-1023: 0 0 0 clock cycles
-4095: 0 0 0 clock cycles
-8191: 0 0 0 clock cycles
HR,
Ghandi
Quote from: Ghandi on March 06, 2010, 06:31:33 AM
Strange, i get this when i run it on my E5400:
Zero cycles is cheating, man!
Total System Processor Cores: 1
CPU 0: Intel(R) Celeron(R) M CPU 420 @ 1.60GHz MMX SSE3 Cores: 1
Aging n00b/bit-twiddler guy new method:
8191: 17 17 17 clock cycles
4095: 16 17 17 clock cycles
1023: 17 17 17 clock cycles
255: 18 18 18 clock cycles
-255: 18 18 18 clock cycles
-1023: 17 17 17 clock cycles
-4095: 17 17 17 clock cycles
-8191: 17 17 17 clock cycles
Aging newbie/FPU guy method (resident multiplier):
8191: 1 1 1 clock cycles
4095: 1 1 1 clock cycles
1023: 1 1 1 clock cycles
255: 1 1 1 clock cycles
-255: 1 1 1 clock cycles
-1023: 1 1 1 clock cycles
-4095: 1 1 1 clock cycles
-8191: 1 1 1 clock cycles
Aging newbie/FPU guy method (non-resident multiplier):
8191: 1 1 1 clock cycles
4095: 1 1 1 clock cycles
1023: 1 1 1 clock cycles
255: 1 1 1 clock cycles
-255: 1 1 1 clock cycles
-1023: 1 1 1 clock cycles
-4095: 1 1 1 clock cycles
-8191: 1 1 1 clock cycles
Yes, it's definitely strange. I haven't modified the application at all; it is straight out of the box giving me those values. o0
HR,
Ghandi
Hello
A bit late, but here's an example of how to convert 16-bit signed numbers the fast way (SSE2) and scale them down to single floats in the range -1.0 to +1.0.
align 16 ; samples must be 128 bit aligned ( else use movdqu )
Div32768 real4 0.000030517578125,0.000030517578125,0.000030517578125,0.000030517578125 ; 1/32768
Samples_16bit dw -32768,-24576,-16384,-8192,0,8191,16383,24575
ResultBuffer real4 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lea esi,Samples_16bit
lea edi,ResultBuffer
lea eax,Div32768
movdqa xmm2,[eax] ; 1/32768
movdqa xmm0,[esi] ; xmm0 7 6 5 4 3 2 1 0 ( 8 (stereo) 16 bit samples )
movdqa xmm1,xmm0 ; xmm1 7 6 5 4 3 2 1 0 make a copy
punpcklwd xmm0,xmm0 ; xmm0 3 3 2 2 1 1 0 0 unpack lowest 4 samples
punpckhwd xmm1,xmm1 ; xmm1 7 7 6 6 5 5 4 4 unpack highest 4 samples
psrad xmm0,16 ; xmm0 - 3 - 2 - 1 - 0 make dwords by shifting
psrad xmm1,16 ; xmm1 - 7 - 6 - 5 - 4 make dwords by shifting
cvtdq2ps xmm0,xmm0 ; convert 4 * 32 bit to single floating-point
cvtdq2ps xmm1,xmm1 ; convert the next 4 * 32 bit to single floating-point
mulps xmm0,xmm2 ; divide 4 samples by 32768
mulps xmm1,xmm2 ; divide the next 4 samples by 32768
movdqa [edi],xmm0 ; save 4 samples
movdqa [edi+16],xmm1 ; save the next 4 samples
This is the fastest way I know, maybe there is a faster method.....
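For completeness, roughly the same SSE2 sequence expressed with C intrinsics (my sketch, not from the post above; it assumes in/out are 16-byte aligned and count is a multiple of 8):
#include <emmintrin.h>
void pcm16_to_float_sse2(const short *in, float *out, int count)
{
    const __m128 scale = _mm_set1_ps(1.0f / 32768.0f);
    int i;
    for (i = 0; i < count; i += 8) {
        __m128i w  = _mm_load_si128((const __m128i *)(in + i));
        /* widen 16-bit to 32-bit with sign: unpack with itself, then
           arithmetic shift right by 16 (same trick as punpck*/psrad above) */
        __m128i lo = _mm_srai_epi32(_mm_unpacklo_epi16(w, w), 16);
        __m128i hi = _mm_srai_epi32(_mm_unpackhi_epi16(w, w), 16);
        _mm_store_ps(out + i,     _mm_mul_ps(_mm_cvtepi32_ps(lo), scale));
        _mm_store_ps(out + i + 4, _mm_mul_ps(_mm_cvtepi32_ps(hi), scale));
    }
}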