The MASM Forum Archive 2004 to 2012

General Forums => The Laboratory => Topic started by: akane on August 01, 2009, 11:23:51 PM

Title: Converting words to floats using sse
Post by: akane on August 01, 2009, 11:23:51 PM
I have recently started learning the sse. Currently I'm trying to port this C code for Goerzel algorithm (http://www.exstrom.com/journal/sigproc/).
The hardest part (goerzel function) is finished, with mixed fpu/sse code. Fpu is used to precompute the sines, and sse for the main loop and log10 on the output. But not this is my problem, I need to convert audio pcm samples to floats: flt[] = pcm[]/32768, but my timings are very bad: up to 6 times faster with sse (timing with rdtsc).

Here's my code:
// PellesC code, console app
#include "stdio.h"
#include "stdlib.h"

#define shuffle(f0,f1, f2,f3) (f0 | f1<<2 | f2<<4 | f3<<6) // reversed _MM_SHUFFLE
#define IN
#define OUT

int g_enable_sse;

int sse1(void) // returns false if SSE is disabled or not available
{
int retval;
__asm
{
mov eax,1
cpuid
and edx,1<<25
mov retval,edx
}
return (retval && g_enable_sse);
}


void w2f(IN short *words,OUT float *floats, int count)
{
// time comparision using rdtsc for count=2205
// fpu: 15*** ticks
// sse: 11*** ticks
if (sse1())
{
__asm
{
push esi
push edi
push ebx
push ebp

movaps xmm1,[w2fdivider]
mov    esi,[words]
mov    edi,[floats]
mov    ebp,[count]
cmp    ebp,4
jge    w2fnext4    ;// process 4 samples at a time, if (count>=4)
jmp    w2fnext1    ;// process 1 sample at a time, if (count<4)
align  16
w2fdivider:
dd 32768.0, 32768.0, 32768.0, 32768.0 ; moved to .data by the compiler
w2fnext4:
;
; convert 4 words to 4 integers
;
movsx  eax,word[esi+0]
movsx  ebx,word[esi+2]
movsx  ecx,word[esi+4]
movsx  edx,word[esi+6]
mov    dword[esp-16],eax ;// +0
mov    dword[esp-12],ebx ;// +4
mov    dword[esp- 8],ecx ;// +8
mov    dword[esp- 4],edx ;// +12
;
; convert 4 integers to 4 floats
;
cvtpi2ps xmm0,[esp-16];// load two integers to low order floats
movlhps  xmm0,xmm0    ;// copy to high order floats
cvtpi2ps xmm0,[esp-8] ;// load two integers to low order floats
shufps xmm0,xmm0,shuffle(2,3,0,1) ;// update order
;
; scale to the range -1,+1
;
divps  xmm0,xmm1
;
; store 4 floats from xmm0 to output buffer
movaps [edi],xmm0
add    esi,8
add    edi,16
sub    ebp,4
jz     w2fquit
cmp    ebp,4
jg     w2fnext4
;-------------------------------------------
;
; see if something is left (up to 3 words)
;
test ebp,ebp
jz   w2fquit
w2fnext1:
; load single word
movsx  eax,word[esi+0]
mov    dword[esp-16],eax
cvtsi2ss xmm0,[esp-16]
divss  xmm0,xmm1
movss  [edi],xmm0
add    esi,2
add    edi,4
sub    ebp,1
jnz    w2fnext1
w2fquit:

pop ebp
pop ebx
pop edi
pop esi
}
}
else
{
while (count--)
{
floats[count] = words[count]/32768.0f;
}
}
}

void __rdtsc(unsigned __int64 *pout)
{
__asm {
mov ecx,pout
rdtsc
mov [ecx],eax
mov [ecx+4],edx
}
}


int main(void)
{
int samplesPerPacket = 2200;
char *memo    = (char*)malloc(samplesPerPacket * (sizeof(float) + sizeof(short)) + 16);
float *floats = (float*) ((int)(memo+16) & 0xFFFFFFF0);
short *pcm    = (short*) &floats[samplesPerPacket];
// load something to pcm array. Done, we have some trash
// call w2f to convert pcm to floats
int a;
unsigned __int64 time1, time2, time3;

g_enable_sse = 0;
time3=0ll;
for (a=0; a<100; a++)
{
__rdtsc(&time1);
w2f(pcm, floats, samplesPerPacket);
__rdtsc(&time2);
time3 = (time3 + (time2-time1)) / 2;
}
printf("FPU: %d ticks\n", (int)time3);

g_enable_sse = 1;
if (sse1())
{
time3=0ll;
for (a=0; a<100; a++)
{
__rdtsc(&time1);
w2f(pcm, floats, samplesPerPacket);
__rdtsc(&time2);
time3 = (time3 + (time2-time1)) / 2;
}
printf("SSE: %d ticks\n", (int)time3);
}
else
{
puts("sorry, no SSE found");
}

free(memo);
return 0;
}
Title: Re: Converting words to floats using sse
Post by: raymond on August 02, 2009, 02:14:32 AM
For what you intend to achieve, it is not necessary to use the FPU for converting WORD integers to floats in the range of -1 to +1. I have just tested it and the following procedure is approximately 3x faster than using simple FPU divisions.

   push  ebx
   push  esi
   push  edi
   mov   ebx,count
   lea   esi,words
   lea   edi,floats
.while ebx != 0
   lodsw
   xor   ecx,ecx
   xor   edx,edx
   or    eax,eax
   jz    endcal
   .if   SIGN?
         neg   eax
         mov   edx,80000000h
   .endif
   bsr   cx,ax
   push  ecx
   add   cl,70h
   ror   ecx,9
   add   edx,ecx
   pop   ecx
   neg   cl
   add   cl,16
   shl   ax,cl
   shl   eax,7
   add   eax,edx
endcal:
   stosd
   dec   ebx
.endw


I have no idea if such code could be converted to sse instructions.
Title: Re: Converting words to floats using sse
Post by: dedndave on August 02, 2009, 02:45:31 AM
i am sure Jochen has written one using SSE
let me see if i can find the thread

no luck - we will have to wait for him to pop in
Title: Re: Converting words to floats using sse
Post by: akane on August 02, 2009, 10:20:53 AM
I have just tested it, and found that the sign test will never succeed, because "or eax,eax" is too wide, I think it should be "or ax,ax". But unfortunatelly even with this fixup, this subroutine produces wrong output for me, and the total speed is equal to the FPU version on my machine (Pentium M).

I have also tried to replace lodsw with movsx, leaving the whole eax register. The result was more accurate:
words  floats
-55   -0.00335693
-11   -0.00067139
0      0.00000000
11     0.00067139
55     0.00335693
Title: Re: Converting words to floats using sse
Post by: dedndave on August 02, 2009, 11:02:49 AM
what is the range of input values ?
how many bits resolution in and out ?
Title: Re: Converting words to floats using sse
Post by: akane on August 02, 2009, 01:19:51 PM
The range is 32767 > R >= -32768 (15 bit precision), as defined for signed word. The normalization to +-1 (division by 32768) can be applied inside the DSP loop, so the word->float transform does not need to divide each word by 32768.

Anyway, there is my nasm sse port for the goerzel code (the core of this project). It calculates amplitudes of 4 frequencies at a time. The division by 32768 can be moved to this function, or even only the computed amplitude can be divided to make it more friendly for logarithm function.

[attachment deleted by admin]
Title: Re: Converting words to floats using sse
Post by: raymond on August 02, 2009, 04:52:09 PM
Quotereplace lodsw with movsx, leaving the whole eax register

You are absolutely right. I should have tested it myself with an array before submitting that code.
Title: Re: Converting words to floats using sse
Post by: dedndave on August 02, 2009, 06:38:26 PM
this is a non-SSE approach
the advantage is that the scaling is combined with the float conversion
it still uses a loop, which leaves room for SSE to beat it, especially if you can use cvtdq2ps to convert 4 values at once

EDIT - a refined version - i swapped assignments of eax and ecx for a little reduction

        movzx   eax,word ptr InVal16s
        or      ax,ax
        jz      Scale1

        mov     ecx,8Fh
        jns     Scale0

        neg     ax
        mov     ch,1

Scale0: dec     ecx
        shl     ax,1
        jnc     Scale0

; CH bit 0 = sign
; CL = exponent
; AX = fraction/128

        shr     ecx,1
        rcr     ax,1
        ror     ecx,8
        shl     eax,8
        or      eax,ecx

Scale1: mov     OutValReal4,eax

Title: Re: Converting words to floats using sse
Post by: Rockoon on August 02, 2009, 07:54:32 PM
After looking at the first post, at least two probably big optimizations.:

mov    dword[esp-16],eax ;// +0
mov    dword[esp-12],ebx ;// +4
mov    dword[esp- 8],ecx ;// +8
mov    dword[esp- 4],edx ;// +12
cvtpi2ps xmm0,[esp-16];// load two integers to low order floats
movlhps  xmm0,xmm0    ;// copy to high order floats
cvtpi2ps xmm0,[esp-8] ;// load two integers to low order floats
shufps xmm0,xmm0,shuffle(2,3,0,1) ;// update order

by reording those first 4 mov's, you can avoid that shuffle.... right?



and then:

divps  xmm0,xmm1

This can be converted into a multiplication by 1/32768, which will be so much faster you wont even believe it

Title: Re: Converting words to floats using sse
Post by: jj2007 on August 02, 2009, 08:58:04 PM
Quote from: dedndave on August 02, 2009, 02:45:31 AM
i am sure Jochen has written one using SSE
let me see if i can find the thread

no luck - we will have to wait for him to pop in

Dave, you have too much confidence in me :red
Ask Nightware - he is the true SSE2 guru here :bg
... and I see Rockoon has jumped on that train, too :bg
Title: Re: Converting words to floats using sse
Post by: dedndave on August 02, 2009, 09:18:35 PM
i want to be an SSE programer, too  :(
lol
i am reading up on it - once i get the registers straight, look out !  :bdg

now, you have to admit that my x86 routine looks pretty snappy - lol
Title: Re: Converting words to floats using sse
Post by: Rockoon on August 02, 2009, 09:31:38 PM
meh.. this really isnt a good application for floating point SSE since the data isnt starting as floating point, or even in a convienient size. I assume its being yanked off a 16-bit ADC of some kind (DirectSound record buffer?)

If its performance-at-all-costs I would strongly consider a 13-bit lookup table (just use the upper 13 bits of the samples) for the conversion to floats via LUT, because that still is only 32KB of memory and will fit quite nicely in a 64KB L1 cache (almost all, if not all modern processors have at least 64KB L1) .. the odds are that the lowest 3 bits from any analog source are plagued by noise anyways.

So something like:

@@do:
  movzx eax, word [esi]
  movzx ebx, word [esi + 2]
  movzx ecx, word [esi + 4]
  movzx edx, word [esi + 6]
  shr eax, 3
  shr ebx, 3
  shr ecx, 3
  shr edx, 3
  mov eax, table[4 * eax]
  mov ebx, table[4 * ebx]
  mov ecx, table[4 * ecx]
  mov edx, table[4 * edx]
  mov [edi], eax
  mov [edi + 4], ebx
  mov [edi + 8], ecx
  mov [edi + 12], edx
  add esi, 8
  add edi, 16
  dec ebp
  jnz @@do

This technique also has the nice property that the table can be loaded with just about anything, so this all can be made into a highly generic translator (or several, with different shift constants)
Title: Re: Converting words to floats using sse
Post by: akane on August 02, 2009, 10:41:25 PM
Yes, it is processing wave files or sound captured from the mixer or so. It will probably replace FFT, because I have already a GUI FFT program for "sound decompilation" - you select a part of the sound, and it shows which piano keys are active. But in order to keep good resolution for the lowest frequencies, the FFT must be 32768 samples big, and there is a small overhead because I do not need to measure 32768/2 frequencies, I need only 88. So this is why I'm trying with Goerzel algorithm.
Hopefully calling the GoerzelSSE subroutine 88/4 times will be faster even than FFTW. All the current FPU stuff will be removed, because I have 88 fixed frequencies to lookup in the spectrum.

Rockoon, I have removed the shuffle and tried with mulps, but the result was incorrect. A test DTMF decoder program displayed trash instead successive "digits". Probably I need to increase the 32768 value, because the reciprocal of 32768 loaded into SSE is/may be too small. I'll check this. BTW 1/32768 as real32 (FPU) is working ok.

Dedndave, your code is working perfectly, thanks! I have never even tried to learn IEEE 754, but the time is coming.
Title: Re: Converting words to floats using sse
Post by: dedndave on August 02, 2009, 10:50:17 PM
floating point math is not at all difficult
Ray has a great tutorial that is easy to understand...
http://www.ray.masmcode.com/tutorial/index.html

for in-depth definition of floating formats...
http://dlc.sun.com/pdf/819-3693/819-3693.pdf

i wish we had an SSE tutorial like Rays FPU tute - lol

let us know if the code is fast enough - if not, Rockoon may have the right idea - a look-up table

p.s. i am sure the SSE method is there, as well - just out of reach for me - lol
Title: Re: Converting words to floats using sse
Post by: dedndave on August 02, 2009, 11:26:57 PM
if you are recognizing notes from a keyboard, they are all harmonically related
once you have one octave going on, the rest are powers of 2
i might be inclined to look for a way to take advantage of the relationship of octaves to a computers' numbering system
i did a little work with lattice and FIR filters a long time ago, but have forgotten most of what i new
keeping all the values in fixed-point integers comes to mind
you are multiplying and dividing values by constants - you might even be able to avoid MUL and DIV altogether
certainly, the 2*Pi/32768 could be made into one constant
Title: Re: Converting words to floats using sse
Post by: akane on August 02, 2009, 11:51:15 PM
I agree,realW = 2.0*cos(2.0*pi*k/N);
imagW = sin(2.0*pi*k/N);

d1 = 0.0;
d2 = 0.0;
for (n=0; n<N; ++n)

realW and imagW (or the other variables - theta, CO, CO2, S0 from my attachment) will be precomputed for all tones.
The wroted FFT program I want to optimize with Goerzel can be downloaded from http://www.ionicwind.com/forums/index.php?topic=3472.0 - this is my topic and idea. It requires FFTW (http://www.fftw.org/) binaries.
Title: Re: Converting words to floats using sse
Post by: dedndave on August 03, 2009, 12:12:35 AM
multiplying to divide is nothing new
you are simply multiplying by the reciprical of the constant
the MUL instructions are somewhat faster than the DIV instructions, so that technique gets a lot of mileage

unsigned multiplication by a constant is even better

this is a simplified example
if we want to multiply by 10, that is 1010 in binary
so, it's shift and add, shift and add
the shift/add pattern matches the 1010 constant pattern

        movzx   eax,word ptr InputVal16u
        shl     eax,1                      ;2X in eax
        mov     edx,eax                    ;2X in edx
        shl     eax,2                      ;8X in eax
        add     eax,edx                    ;10X in eax

Hutch showed me a trick to speed that up

        movzx   eax,word ptr InputVal16u
        lea     eax,[eax+4*eax]            ;5X in eax
        shl     eax,1                      ;10X in eax

by selecting the right shift/add pattern, you can multiply by any value, so long as you are aware of overflow
combining the multiply-to-divide and hard-coded multiplication by constants methods can eliminate all MULs and DIVs
you are not really too concerned over a slight loss in resolution, which makes it a great application for the methods
shifts and adds are VERY fast, especially register to register
the entire DFT would be blazingly fast
Title: Re: Converting words to floats using sse
Post by: Rockoon on August 03, 2009, 12:15:14 AM
Quote from: akane on August 02, 2009, 10:41:25 PM
Yes, it is processing wave files or sound captured from the mixer or so. It will probably replace FFT, because I have already a GUI FFT program for "sound decompilation" - you select a part of the sound, and it shows which piano keys are active. But in order to keep good resolution for the lowest frequencies, the FFT must be 32768 samples big

Aha.. well now! This is a much more attackable problem.

Did you know that if you downsample that 32768 sample buffer to say, 4096 samples, that you still get to measure those low frequencies?

They will just appear like they are 3 octaves higher (their wavelength will be 1/8th as long as originally) The downsampling will destroy the highest frequency information, cause some aliasing in the new highest frequencies, but the low frequency information will remain intact.

Downsampling is a form of low-pass filter.

http://en.wikipedia.org/wiki/Downsampling
Title: Re: Converting words to floats using sse
Post by: dedndave on August 03, 2009, 02:39:44 PM
there ya go - that's your octave switch - only look at half the samples to switch down an octave
set up your 12 notes for one octave and you have it made - lol
in my routine, the

        mov     ecx,8Fh

sets the biased exponent - 8Fh is for a sample count of N=32768 (it gets decremented to 8Eh for -32768)
for N=16384, change the 8Fh to 90h
for N=8192, use 91h
for N=4096, use 92h, and so on
put it in a variable or calculate it based on the number of samples

        mov     ecx,SampleCountExponent
Title: Re: Converting words to floats using sse
Post by: akane on August 03, 2009, 03:43:50 PM
Quotesets the biased exponent - 8Fh is for a sample count of N=32768
You mean sample amplitude, right? Because 1/32768 in your code is used to decrease amplitude of all samples, in order to keep constant output level from dsp routine:
words[] = from_wave()
floats[] = words[]/32768
complex[] = dsp(floats, count, samplerate)


Amplitude scaling to +-1 or +-2 (PEP) is sometimes important for custom sin/cos routines. You never know which algorithm is used for sine, while FPU is accepting any value. For example the raw tailor polynomial for sine is accepting up to +-PI
Title: Re: Converting words to floats using sse
Post by: dedndave on August 03, 2009, 04:10:28 PM
my bad - i thought that was the number of samples
Title: Re: Converting words to floats using sse
Post by: akane on August 03, 2009, 07:35:17 PM
The version with float lookup table is much faster than the one with "manual float creation". All the floats are initially divided by 32768, .
movzx eax,word[esi+0]
movzx edx,word[esi+2]
mov   eax,[g_floatlookup+eax*4]
mov   edx,[g_floatlookup+edx*4]
mov   [edi+0],eax
mov   [edi+4],edx


The SSE version will be faster if SSE2 is available - instead cvtpi2ps+movlhps+cvtpi2ps we need only CVTDQ2PS xmm,mem; it will convert 4 integers to floats.
Funny, Olly v1 does not recognize CVTDQ2PS as instruction.
Title: Re: Converting words to floats using sse
Post by: raymond on August 03, 2009, 11:52:54 PM
A table lookup will always be the fastest if it fits in L1.

For curiosity, I timed some of the algos with QueryPerformanceCounter on my old P4, for 1000 repeats of the algo. The reported counts are corrected for the counts of running only the reduction of the 1000 down to 0, which was 10 on my system. (Simply invoking QueryPerformanceCounter twice in succession results in a difference of 6 counts.)

Keeping the 32768 as the divisor on the FPU, the sequence
fild word + fdiv st,st(1) + fstp mem requires 87 counts.

Keeping the reciprocal of 32768 as the multiplier on the FPU, the sequence
fild word + fmul st,st(1) + fstp mem requires 15 counts.

My suggested algo for converting word integers to a float in the range of -1 to +1 requires 47 counts.

None of the above are sensitive to the value of the integer.

dedndave's suggested algo to convert a word integer to a float is very sensitive to the size of the integer; the smaller it is, the more time it requires.
For word=4321, 38 counts
For word=432, 62 counts
For word=43, 95 counts
For word=4, 125 counts.

The identical result to dedndave's algo can be obtained with the following FPU sequence
fild word + fstp mem which only requires 15 counts and is not sensitive to the word value.

Surprisingly, that time was the same as when the word was also multiplied by the reciprocal before storing it.

Title: Re: Converting words to floats using sse
Post by: dedndave on August 04, 2009, 12:43:31 AM
i knew it wasn't blazing - lol
did you see how fast it is for zero !!!  ::)

i think it is as fast as it gets with no FPU or SSE
the loop kills it
to be fair, a "typical" value for a sine wave should be more than 8192 away from zero
the RMS of a full-range sine wave is ~ +/-11584
that means that the loop will typically only make 3 passes
if we assume that the levels of the complex waveform cut that in half, it is 4 passes
that means it's a pretty good guess-timation that the 38-62 counts will be nominal for my routine
you're biased, Ray - lol (pun)

it's a moot point, really, as neither method can beat the SSE2 instructions
if someone wanted to spend the time on the code, no floating point math would be required at all
it would be as fast as a bullet up to the complex, which would require some bit-twiddling
i don't think it would require any loops, though
Title: Re: Converting words to floats using sse
Post by: dedndave on August 04, 2009, 03:31:45 AM
after looking at some of the SSE instructions, i can see it would be difficult to beat their times
they are FAST ! - lol
no use in messing with the fixed point methods
Title: Re: Converting words to floats using sse
Post by: dedndave on August 04, 2009, 03:48:16 PM
i learned 2 new instructions, BSWAP and BSR/BSF - well, new for me at least - lol
because i am a bit-twiddler, i really like them
i used them to eliminate the loop and speed up the final positioning of bits
i gave preference to values nearest to 0, of course

        movzx   eax,word ptr InVal16s
        or      ax,ax
        jz      Scale2

        mov     cx,16
        jns     Scale0

        neg     ax
        mov     ch,1

Scale0: bsr     dx,ax
        cmp     dl,7
        ja      Scale1

        xchg    al,ah
        mov     cl,8

Scale1: sub     cl,dl
        add     dl,7Fh
        mov     dh,ch
        shl     eax,cl
        shr     edx,1
        rcr     ax,1
        xchg    al,dl
        bswap   eax
        mov     ah,dl

Scale2: mov     OutValReal4,eax

some fair test values might be:
8191
4095
1023
255
-255
-1023
-4095
-8191
Title: Re: Converting words to floats using sse
Post by: dedndave on August 04, 2009, 07:38:56 PM
i made a comparison between Ray's code and my new code
for Ray's method, i made 2 measurements; one with the multiplier resident and one non-resident
the program with source code is attached

resident method:

        finit
        fld real10 ptr Recip

;start timer loop

        fild word ptr InVal16s
        fmul    st,st(1)
        fstp real4 ptr OutValReal4

;stop timer loop

non-resident method:

        finit

;start timer loop

        fld real10 ptr Recip
        fild word ptr InVal16s
        fmul    st,st(1)
        fstp real4 ptr OutValReal4
        ffree   st

;stop timer loop

it looks like the aging newbie/FPU guy wins again   :U

Total System Processor Cores: 2
CPU 0: Intel(R) Pentium(R) 4 CPU 3.00GHz MMX SSE3 Cores: 2

Aging n00b/bit-twiddler guy new method:
8191:   22      21      22      clock cycles
4095:   21      22      22      clock cycles
1023:   22      22      21      clock cycles
255:    22      22      23      clock cycles
-255:   25      26      26      clock cycles
-1023:  25      25      25      clock cycles
-4095:  24      25      24      clock cycles
-8191:  24      24      24      clock cycles

Aging newbie/FPU guy method (resident multiplier):
8191:   7       7       7       clock cycles
4095:   7       7       7       clock cycles
1023:   7       7       7       clock cycles
255:    7       7       7       clock cycles
-255:   7       7       7       clock cycles
-1023:  7       7       7       clock cycles
-4095:  7       7       7       clock cycles
-8191:  7       7       7       clock cycles

Aging newbie/FPU guy method (non-resident multiplier):
8191:   15      15      15      clock cycles
4095:   15      15      15      clock cycles
1023:   15      15      15      clock cycles
255:    15      15      15      clock cycles
-255:   15      15      15      clock cycles
-1023:  15      15      15      clock cycles
-4095:  15      15      15      clock cycles
-8191:  15      15      15      clock cycles



[attachment deleted by admin]
Title: Re: Converting words to floats using sse
Post by: Ghandi on March 06, 2010, 06:31:33 AM
Strange, i get this when i run it on my E5400:


Total System Processor Cores: 2
CPU 0: Intel(R) Pentium(R)  CPU       E5400  @ 2.70GHz MMX SSSE3 Cores: 2

Aging n00b/bit-twiddler guy new method:
8191:   16      16      16      clock cycles
4095:   16      16      16      clock cycles
1023:   16      16      16      clock cycles
255:    17      17      17      clock cycles
-255:   17      17      17      clock cycles
-1023:  16      16      16      clock cycles
-4095:  16      16      16      clock cycles
-8191:  16      16      16      clock cycles

Aging newbie/FPU guy method (resident multiplier):
8191:   0       0       0       clock cycles
4095:   0       0       0       clock cycles
1023:   0       0       0       clock cycles
255:    0       0       0       clock cycles
-255:   0       0       0       clock cycles
-1023:  0       0       0       clock cycles
-4095:  0       0       0       clock cycles
-8191:  0       0       0       clock cycles

Aging newbie/FPU guy method (non-resident multiplier):
8191:   0       0       0       clock cycles
4095:   0       0       0       clock cycles
1023:   0       0       0       clock cycles
255:    0       0       0       clock cycles
-255:   0       0       0       clock cycles
-1023:  0       0       0       clock cycles
-4095:  0       0       0       clock cycles
-8191:  0       0       0       clock cycles


HR,
Ghandi
Title: Re: Converting words to floats using sse
Post by: jj2007 on March 06, 2010, 07:19:01 AM
Quote from: Ghandi on March 06, 2010, 06:31:33 AM
Strange, i get this when i run it on my E5400:

Zero cycles is cheating, man!

Total System Processor Cores: 1
CPU 0: Intel(R) Celeron(R) M CPU        420  @ 1.60GHz MMX SSE3 Cores: 1

Aging n00b/bit-twiddler guy new method:
8191:   17      17      17      clock cycles
4095:   16      17      17      clock cycles
1023:   17      17      17      clock cycles
255:    18      18      18      clock cycles
-255:   18      18      18      clock cycles
-1023:  17      17      17      clock cycles
-4095:  17      17      17      clock cycles
-8191:  17      17      17      clock cycles

Aging newbie/FPU guy method (resident multiplier):
8191:   1       1       1       clock cycles
4095:   1       1       1       clock cycles
1023:   1       1       1       clock cycles
255:    1       1       1       clock cycles
-255:   1       1       1       clock cycles
-1023:  1       1       1       clock cycles
-4095:  1       1       1       clock cycles
-8191:  1       1       1       clock cycles

Aging newbie/FPU guy method (non-resident multiplier):
8191:   1       1       1       clock cycles
4095:   1       1       1       clock cycles
1023:   1       1       1       clock cycles
255:    1       1       1       clock cycles
-255:   1       1       1       clock cycles
-1023:  1       1       1       clock cycles
-4095:  1       1       1       clock cycles
-8191:  1       1       1       clock cycles
Title: Re: Converting words to floats using sse
Post by: Ghandi on March 06, 2010, 02:54:54 PM
Yes, its definitely strange. I havent modified the application at all, it is straight out of the box giving me those values. o0

HR,
Ghandi
Title: Re: Converting words to floats using sse
Post by: Siekmanski on March 06, 2010, 06:56:29 PM
Hello

A bit late, but here's an example how to convert 16 bit signed numbers the fast way ( SSE2 ) and scale them down to single floats -1.0 to 1.0


align 16        ; samples must be 128 bit aligned ( else use movdqu )

Div32768          real4 0.000030517578125,0.000030517578125,0.000030517578125,0.000030517578125 ; 1/32768

Samples_16bit     dw   -32768,-24576,-16384,-8192,0,8191,16383,24575
ResultBuffer      real4 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


    lea          esi,Samples_16bit
    lea          edi,ResultBuffer
    lea          eax,Div32768
    movdqa       xmm2,[eax]    ; 1/32768

    movdqa       xmm0,[esi]    ; xmm0   7  6  5  4  3  2  1  0  ( 8 (stereo) 16 bit samples )
    movdqa       xmm1,xmm0     ; xmm1   7  6  5  4  3  2  1  0  make a copy
    punpcklwd    xmm0,xmm0     ; xmm0   3  3  2  2  1  1  0  0  unpack lowest 4 samples
    punpckhwd    xmm1,xmm1     ; xmm1   7  7  6  6  5  5  4  4  unpack highest 4 samples
    psrad        xmm0,16       ; xmm0   -  3  -  2  -  1  -  0  make dwords by shifting
    psrad        xmm1,16       ; xmm1   -  7  -  6  -  5  -  4  make dwords by shifting
    cvtdq2ps     xmm0,xmm0     ; convert 4 * 32 bit to single floating-point
    cvtdq2ps     xmm1,xmm1     ; convert the next 4 * 32 bit to single floating-point
    mulps        xmm0,xmm2     ; divide 4 samples by 32768
    mulps        xmm1,xmm2     ; divide the next 4 samples by 32768
    movdqa       [edi],xmm0    ; save 4 samples
    movdqa       [edi+16],xmm1 ; save the next 4 samples



This is the fastest way I know, maybe there is a faster method.....