Converting words to floats using sse

akane · August 01, 2009, 11:23:51 PM

I have recently started learning the sse. Currently I'm trying to port this C code for Goerzel algorithm.
The hardest part (goerzel function) is finished, with mixed fpu/sse code. Fpu is used to precompute the sines, and sse for the main loop and log10 on the output. But not this is my problem, I need to convert audio pcm samples to floats: flt[] = pcm[]/32768, but my timings are very bad: up to 6 times faster with sse (timing with rdtsc).

Here's my code:

Code Select

// PellesC code, console app
#include "stdio.h"
#include "stdlib.h"

#define shuffle(f0,f1, f2,f3) (f0 | f1<<2 | f2<<4 | f3<<6) // reversed _MM_SHUFFLE
#define IN
#define OUT

int g_enable_sse;

int sse1(void) // returns false if SSE is disabled or not available
{
	int retval;
	__asm
	{
		mov eax,1
		cpuid
		and edx,1<<25
		mov retval,edx
	}
	return (retval && g_enable_sse);
}


void w2f(IN short *words,OUT float *floats, int count)
{
	// time comparision using rdtsc for count=2205
	// fpu: 15*** ticks
	// sse: 11*** ticks
	if (sse1())
	{
		__asm
		{
		push esi
		push edi
		push ebx
		push ebp

		movaps xmm1,[w2fdivider]
		mov    esi,[words]
		mov    edi,[floats]
		mov    ebp,[count]
		cmp    ebp,4
		jge    w2fnext4    ;// process 4 samples at a time, if (count>=4)
		jmp    w2fnext1    ;// process 1 sample at a time, if (count<4)
		align  16
w2fdivider:
		dd 32768.0, 32768.0, 32768.0, 32768.0 ; moved to .data by the compiler
w2fnext4:
		;
		; convert 4 words to 4 integers
		;
		movsx  eax,word[esi+0]
		movsx  ebx,word[esi+2]
		movsx  ecx,word[esi+4]
		movsx  edx,word[esi+6]
		mov    dword[esp-16],eax ;// +0
		mov    dword[esp-12],ebx ;// +4
		mov    dword[esp- 8],ecx ;// +8
		mov    dword[esp- 4],edx ;// +12
		;
		; convert 4 integers to 4 floats
		;
		cvtpi2ps xmm0,[esp-16];// load two integers to low order floats
		movlhps  xmm0,xmm0    ;// copy to high order floats
		cvtpi2ps xmm0,[esp-8] ;// load two integers to low order floats
		shufps xmm0,xmm0,shuffle(2,3,0,1) ;// update order
		;
		; scale to the range -1,+1
		;
		divps  xmm0,xmm1
		;
		; store 4 floats from xmm0 to output buffer
		movaps [edi],xmm0
		add    esi,8
		add    edi,16
		sub    ebp,4
		jz     w2fquit
		cmp    ebp,4
		jg     w2fnext4
		;-------------------------------------------
		;
		; see if something is left (up to 3 words)
		;
		test ebp,ebp
		jz   w2fquit
w2fnext1:
		; load single word
		movsx  eax,word[esi+0]
		mov    dword[esp-16],eax
		cvtsi2ss xmm0,[esp-16]
		divss  xmm0,xmm1
		movss  [edi],xmm0
		add    esi,2
		add    edi,4
		sub    ebp,1
		jnz    w2fnext1
w2fquit:

		pop ebp
		pop ebx
		pop edi
		pop esi
		}
	}
	else
	{
		while (count--)
		{
			floats[count] = words[count]/32768.0f;
		}
	}
}

void __rdtsc(unsigned __int64 *pout)
{
	__asm {
	mov ecx,pout
	rdtsc
	mov [ecx],eax
	mov [ecx+4],edx
	}
}


int main(void)
{
	int samplesPerPacket = 2200;
	char *memo    = (char*)malloc(samplesPerPacket * (sizeof(float) + sizeof(short)) + 16);
	float *floats = (float*) ((int)(memo+16) & 0xFFFFFFF0);
	short *pcm    = (short*) &floats[samplesPerPacket];
	// load something to pcm array. Done, we have some trash
	// call w2f to convert pcm to floats
	int a;
	unsigned __int64 time1, time2, time3;

	g_enable_sse = 0;
	time3=0ll;
	for (a=0; a<100; a++)
	{
		__rdtsc(&time1);
		w2f(pcm, floats, samplesPerPacket);
		__rdtsc(&time2);
		time3 = (time3 + (time2-time1)) / 2;
	}
	printf("FPU: %d ticks\n", (int)time3);

	g_enable_sse = 1;
	if (sse1())
	{
		time3=0ll;
		for (a=0; a<100; a++)
		{
			__rdtsc(&time1);
			w2f(pcm, floats, samplesPerPacket);
			__rdtsc(&time2);
			time3 = (time3 + (time2-time1)) / 2;
		}
		printf("SSE: %d ticks\n", (int)time3);
	}
	else
	{
		puts("sorry, no SSE found");
	}

	free(memo);
	return 0;
}

raymond · August 02, 2009, 02:14:32 AM

For what you intend to achieve, it is not necessary to use the FPU for converting WORD integers to floats in the range of -1 to +1. I have just tested it and the following procedure is approximately 3x faster than using simple FPU divisions.

Code Select

   push  ebx
   push  esi
   push  edi
   mov   ebx,count
   lea   esi,words
   lea   edi,floats
.while ebx != 0
   lodsw
   xor   ecx,ecx
   xor   edx,edx
   or    eax,eax
   jz    endcal
   .if   SIGN?
         neg   eax
         mov   edx,80000000h
   .endif
   bsr   cx,ax
   push  ecx
   add   cl,70h
   ror   ecx,9
   add   edx,ecx
   pop   ecx
   neg   cl
   add   cl,16
   shl   ax,cl
   shl   eax,7
   add   eax,edx
endcal:
   stosd
   dec   ebx
.endw

I have no idea if such code could be converted to sse instructions.

dedndave · August 02, 2009, 02:45:31 AM

i am sure Jochen has written one using SSE
let me see if i can find the thread

no luck - we will have to wait for him to pop in

akane · August 02, 2009, 10:20:53 AM

I have just tested it, and found that the sign test will never succeed, because "or eax,eax" is too wide, I think it should be "or ax,ax". But unfortunatelly even with this fixup, this subroutine produces wrong output for me, and the total speed is equal to the FPU version on my machine (Pentium M).

I have also tried to replace lodsw with movsx, leaving the whole eax register. The result was more accurate:

Code Select

words  floats
-55   -0.00335693
-11   -0.00067139
0      0.00000000
11     0.00067139
55     0.00335693

dedndave · August 02, 2009, 11:02:49 AM

what is the range of input values ?
how many bits resolution in and out ?

akane · August 02, 2009, 01:19:51 PM

The range is 32767 > R >= -32768 (15 bit precision), as defined for signed word. The normalization to +-1 (division by 32768) can be applied inside the DSP loop, so the word->float transform does not need to divide each word by 32768.

Anyway, there is my nasm sse port for the goerzel code (the core of this project). It calculates amplitudes of 4 frequencies at a time. The division by 32768 can be moved to this function, or even only the computed amplitude can be divided to make it more friendly for logarithm function.

[attachment deleted by admin]

raymond · August 02, 2009, 04:52:09 PM

Quotereplace lodsw with movsx, leaving the whole eax register

You are absolutely right. I should have tested it myself with an array before submitting that code.

dedndave · August 02, 2009, 06:38:26 PM

this is a non-SSE approach
the advantage is that the scaling is combined with the float conversion
it still uses a loop, which leaves room for SSE to beat it, especially if you can use cvtdq2ps to convert 4 values at once

EDIT - a refined version - i swapped assignments of eax and ecx for a little reduction

movzx eax,word ptr InVal16s
or ax,ax
jz Scale1

mov ecx,8Fh
jns Scale0

neg ax
mov ch,1

Scale0: dec ecx
shl ax,1
jnc Scale0

; CH bit 0 = sign
; CL = exponent
; AX = fraction/128

shr ecx,1
rcr ax,1
ror ecx,8
shl eax,8
or eax,ecx

Scale1: mov OutValReal4,eax

Rockoon · August 02, 2009, 07:54:32 PM

After looking at the first post, at least two probably big optimizations.:

mov dword[esp-16],eax ;// +0
mov dword[esp-12],ebx ;// +4
mov dword[esp- 8],ecx ;// +8
mov dword[esp- 4],edx ;// +12
cvtpi2ps xmm0,[esp-16];// load two integers to low order floats
movlhps xmm0,xmm0 ;// copy to high order floats
cvtpi2ps xmm0,[esp-8] ;// load two integers to low order floats
shufps xmm0,xmm0,shuffle(2,3,0,1) ;// update order

by reording those first 4 mov's, you can avoid that shuffle.... right?

and then:

divps xmm0,xmm1

This can be converted into a multiplication by 1/32768, which will be so much faster you wont even believe it

jj2007 · August 02, 2009, 08:58:04 PM

Quote from: dedndave on August 02, 2009, 02:45:31 AM
i am sure Jochen has written one using SSE
let me see if i can find the thread

no luck - we will have to wait for him to pop in

Dave, you have too much confidence in me :red
Ask Nightware - he is the true SSE2 guru here :bg
... and I see Rockoon has jumped on that train, too :bg

dedndave · August 02, 2009, 09:18:35 PM

i want to be an SSE programer, too :(
lol
i am reading up on it - once i get the registers straight, look out ! :bdg

now, you have to admit that my x86 routine looks pretty snappy - lol

Rockoon · August 02, 2009, 09:31:38 PM

meh.. this really isnt a good application for floating point SSE since the data isnt starting as floating point, or even in a convienient size. I assume its being yanked off a 16-bit ADC of some kind (DirectSound record buffer?)

If its performance-at-all-costs I would strongly consider a 13-bit lookup table (just use the upper 13 bits of the samples) for the conversion to floats via LUT, because that still is only 32KB of memory and will fit quite nicely in a 64KB L1 cache (almost all, if not all modern processors have at least 64KB L1) .. the odds are that the lowest 3 bits from any analog source are plagued by noise anyways.

So something like:

@@do:
movzx eax, word [esi]
movzx ebx, word [esi + 2]
movzx ecx, word [esi + 4]
movzx edx, word [esi + 6]
shr eax, 3
shr ebx, 3
shr ecx, 3
shr edx, 3
mov eax, table[4 * eax]
mov ebx, table[4 * ebx]
mov ecx, table[4 * ecx]
mov edx, table[4 * edx]
mov [edi], eax
mov [edi + 4], ebx
mov [edi + 8], ecx
mov [edi + 12], edx
add esi, 8
add edi, 16
dec ebp
jnz @@do

This technique also has the nice property that the table can be loaded with just about anything, so this all can be made into a highly generic translator (or several, with different shift constants)

akane · August 02, 2009, 10:41:25 PM

Yes, it is processing wave files or sound captured from the mixer or so. It will probably replace FFT, because I have already a GUI FFT program for "sound decompilation" - you select a part of the sound, and it shows which piano keys are active. But in order to keep good resolution for the lowest frequencies, the FFT must be 32768 samples big, and there is a small overhead because I do not need to measure 32768/2 frequencies, I need only 88. So this is why I'm trying with Goerzel algorithm.
Hopefully calling the GoerzelSSE subroutine 88/4 times will be faster even than FFTW. All the current FPU stuff will be removed, because I have 88 fixed frequencies to lookup in the spectrum.

Rockoon, I have removed the shuffle and tried with mulps, but the result was incorrect. A test DTMF decoder program displayed trash instead successive "digits". Probably I need to increase the 32768 value, because the reciprocal of 32768 loaded into SSE is/may be too small. I'll check this. BTW 1/32768 as real32 (FPU) is working ok.

Dedndave, your code is working perfectly, thanks! I have never even tried to learn IEEE 754, but the time is coming.

dedndave · August 02, 2009, 10:50:17 PM

floating point math is not at all difficult
Ray has a great tutorial that is easy to understand...
http://www.ray.masmcode.com/tutorial/index.html

for in-depth definition of floating formats...
http://dlc.sun.com/pdf/819-3693/819-3693.pdf

i wish we had an SSE tutorial like Rays FPU tute - lol

let us know if the code is fast enough - if not, Rockoon may have the right idea - a look-up table

p.s. i am sure the SSE method is there, as well - just out of reach for me - lol

dedndave · August 02, 2009, 11:26:57 PM

if you are recognizing notes from a keyboard, they are all harmonically related
once you have one octave going on, the rest are powers of 2
i might be inclined to look for a way to take advantage of the relationship of octaves to a computers' numbering system
i did a little work with lattice and FIR filters a long time ago, but have forgotten most of what i new
keeping all the values in fixed-point integers comes to mind
you are multiplying and dividing values by constants - you might even be able to avoid MUL and DIV altogether
certainly, the 2*Pi/32768 could be made into one constant

News:

Converting words to floats using sse