This particular function takes up 85% of execution time, and I can't change the hash function -- I _think_ there's a way to implement this with MMX or SSEx but wanted to get opinions before taking the risk of breakage with unpredictable data.
void hash(ULONG * in, ULONG * out, ULONG * table)
{
    unsigned int vv = out[0];
    vv += in[0];
    vv *= table[0];
    vv = (vv >> 16) | (vv << 16);
    vv *= table[1];
    vv = (vv >> 16) | (vv << 16);
    vv *= table[2];
    vv = (vv >> 16) | (vv << 16);
    vv *= table[3];
    vv = (vv >> 16) | (vv << 16);
    vv *= table[4];
    vv += table[10];
    out[1] += vv;
    vv += in[1];
    vv *= table[5];
    vv = (vv >> 16) | (vv << 16);
    vv *= table[6];
    vv = (vv >> 16) | (vv << 16);
    vv *= table[7];
    vv = (vv >> 16) | (vv << 16);
    vv *= table[8];
    vv = (vv >> 16) | (vv << 16);
    vv *= table[9];
    vv += table[11];
    out[1] += vv;
    out[0] = vv;
}

void hash_buffer(ULONG * buf, unsigned int count,
                 ULONG * out, ULONG * table)
{
    int i;
    ULONG * p = &buf[0];
    out[0] = 0;
    out[1] = 0;
    for (i = 0; i < count/8; i++)
    {
        hash(p, out, table);
        p += 2;
    }
}
(edit: added context and fixed a silly error I'd made in trying to tweak the code)
The compiler generates:
imul r, mem
rol r, 16
imul r, mem
rol r, 16
which seems to be the best that can be done in integer registers.
From a naive reading of some optimization guides, it appears that I could do
pmaddwd (8 clocks)
punpck??/pshufw (2 clocks)
padd (2 clocks)
pand (2 clocks)
in less time than
imul r, mem (16 clocks)
rol r, 16 (4 clocks)
with the bonus that with MMX, all the operands would be in registers. I don't know if I can assume 90% of all processors have PSHUFW at this point?
It appears that given two 32-bit integers A and B, broken into 16-bit halves Ahi/Alo and Bhi/Blo,
pmaddwd <Ahi><Alo><0><Ahi>, <Blo><Bhi><0><Blo>
followed by a rearrangement and an add has the same numeric result.
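In plain C, the identity I'm relying on is the usual split into 16-bit partial products (Ahi*Bhi only affects the upper dword, so it drops out entirely); a quick sketch with arbitrary test values:

#include <stdint.h>
#include <assert.h>

/* Low 32 bits of a*b rebuilt from 16x16 partial products.
   Everything wraps mod 2^32, so the ahi*bhi term never matters. */
static uint32_t mul_lo32_from_halves(uint32_t a, uint32_t b)
{
    uint32_t alo = a & 0xFFFF, ahi = a >> 16;
    uint32_t blo = b & 0xFFFF, bhi = b >> 16;
    return alo * blo + ((ahi * blo + alo * bhi) << 16);
}

int main(void)
{
    uint32_t a = 0x9ABCDEF0u, b = 0x12345678u;   /* arbitrary test values */
    assert(mul_lo32_from_halves(a, b) == a * b);
    return 0;
}

(One detail I'd want to double-check: pmaddwd multiplies signed 16-bit words, so half-words with the top bit set presumably need some kind of fixup.)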
Does this seem reasonable?
I think the question of how many processors support MMX has to be asked.
Otherwise your solution will work great on only a few machines.
How do you intend to check for MMX support and fall back to a non-MMX routine?
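One way is a one-time CPUID check that selects a function pointer, falling back to the plain C routine. A sketch using MSVC's __cpuid intrinsic (the hash_mmx / hash_sse / hash_sse2 names are hypothetical variants you would supply; bits 23, 25 and 26 of EDX from CPUID leaf 1 are the standard MMX/SSE/SSE2 flags):

#include <windows.h>   /* ULONG */
#include <intrin.h>    /* __cpuid */

typedef void (*hash_fn)(ULONG * in, ULONG * out, ULONG * table);

void hash(ULONG * in, ULONG * out, ULONG * table);        /* plain C fallback          */
void hash_mmx(ULONG * in, ULONG * out, ULONG * table);    /* hypothetical MMX version  */
void hash_sse(ULONG * in, ULONG * out, ULONG * table);    /* hypothetical MMX+PSHUFW   */
void hash_sse2(ULONG * in, ULONG * out, ULONG * table);   /* hypothetical SSE2 version */

hash_fn select_hash(void)
{
    int r[4];                                /* EAX, EBX, ECX, EDX */
    __cpuid(r, 1);
    if (r[3] & (1 << 26)) return hash_sse2;
    if (r[3] & (1 << 25)) return hash_sse;   /* SSE implies PSHUFW is available */
    if (r[3] & (1 << 23)) return hash_mmx;
    return hash;
}

Call select_hash() once at startup and go through the pointer afterwards, so the per-call cost is just an indirect call.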
Could you show/attach the actual unoptimized assembly code produced by the compiler?
MMX in Intel CPUs since Pentium 2 (before in some Pentium).
MMX in AMD CPUs since the Athlon (before in most K6).
pshufw is an SSE instruction that uses MMX registers,
so it needs an SSE-supporting CPU, not just MMX.
You could have multiple versions depending on the instruction sets supported.
Not having to call a subroutine would remove the function call overhead,
i.e. the moving to/from the stack.
How is unsigned int vv = table[10]; done?
Do negative values get truncated to zero, or is the sign ignored
and the value just copied to the unsigned int?
If ints and unsigned ints are 32-bit, what about the high 32 bits
from the result of the multiplication?
Is there a problem with imul?
If the unsigned parameter is big enough,
it would be considered negative even though it is positive.
Are the results of signed and unsigned multiplies the same?
I can attach the code from the compiler tonight - the compiler's optimization inlines this routine with the buffer indexing code, so it's awkward to follow.
All values are treated as unsigned 32-bit (despite the imul);
the crux of the hashing mechanism is the 32x32 => 64 multiply, then discarding the upper 32 bits;
the low 32 bits are word-exchanged (probably for better convolution)
Without pshufw, is it still possible to do this quickly in MMX registers using punpck?
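On the imul point: the low 32 bits come out the same whether the multiply is treated as signed or unsigned, e.g. (arbitrary values, just a quick check):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t a = 0xFFFF1234u, b = 0x80005678u;   /* arbitrary "negative-looking" values */

    uint32_t lo_unsigned = (uint32_t)((uint64_t)a * b);                  /* mul-style  */
    uint32_t lo_signed   = (uint32_t)((int64_t)(int32_t)a * (int32_t)b); /* imul-style */

    printf("%08X %08X\n", lo_unsigned, lo_signed);   /* prints the same dword twice */
    return 0;
}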
The following is ALU code; I will look into the extended-instruction (MMX, SSE, SSE2) versions.
SSE2 in particular is promising: each 128-bit register can hold 4 dwords, so 4 of the 8 registers would
hold all 16 dwords without extra memory access.
.586
.model flat,stdcall
option casemap:none
include \masm32\include\windows.inc
include \masm32\include\user32.inc
include \masm32\include\kernel32.inc
includelib \masm32\lib\user32.lib
includelib \masm32\lib\kernel32.lib
.data
ALIGN 16
inn dd 3, 5
owt dd 0, 0
tab dd 11, 13, 17, 19
dd 23, 29, 31, 37
dd 41, 43, 47, 51
.code
start:
mov esi, offset tab
mov edi, 4
mov eax, tab + 40
mov ecx, 16
add eax, inn
mul dword ptr [esi] ; tab
add esi, edi
mov ebx, 4 ; had been unrolled 4 times
again0:
rol eax, cl ; 16
mul dword ptr [esi] ; tab + 4
add esi, edi
dec ebx
jnz again0
mov edx, tab + 44
mov owt + 4, eax
add owt + 4, edx
add eax, inn + 4
mul dword ptr [esi] ; tab + 20
add esi, edi
mov ebx, 4
again1:
rol eax, cl ; 16
mul dword ptr [esi] ; tab + 24
add esi, edi
dec ebx
jnz again1
mov owt, eax
add owt + 4, eax
invoke ExitProcess,0
end start
Using Visual Studio, the output for the hash_buffer function is:
Line 45
push ebx
; Line 51
mov ebx, DWORD PTR _count$[esp]
shr ebx, 3
push esi
mov esi, DWORD PTR _out$[esp+4]
push edi
mov edi, DWORD PTR _buf$[esp+8]
mov DWORD PTR [esi], 0
mov DWORD PTR [esi+4], 0
je SHORT $LN1@routine1
mov edx, DWORD PTR _table$[esp+8]
$LL3@routine1:
; Line 53
mov eax, DWORD PTR [edi]
add eax, DWORD PTR [esi]
; Line 54
add edi, 8
imul eax, DWORD PTR [edx]
rol eax, 16 ; 00000010H
imul eax, DWORD PTR [edx+4]
rol eax, 16 ; 00000010H
imul eax, DWORD PTR [edx+8]
rol eax, 16 ; 00000010H
imul eax, DWORD PTR [edx+12]
rol eax, 16 ; 00000010H
imul eax, DWORD PTR [edx+16]
add eax, DWORD PTR [edx+40]
add DWORD PTR [esi+4], eax
mov ecx, DWORD PTR [edi-4]
add ecx, eax
imul ecx, DWORD PTR [edx+20]
rol ecx, 16 ; 00000010H
imul ecx, DWORD PTR [edx+24]
rol ecx, 16 ; 00000010H
imul ecx, DWORD PTR [edx+28]
rol ecx, 16 ; 00000010H
imul ecx, DWORD PTR [edx+32]
rol ecx, 16 ; 00000010H
imul ecx, DWORD PTR [edx+36]
add ecx, DWORD PTR [edx+44]
add DWORD PTR [esi+4], ecx
sub ebx, 1
mov DWORD PTR [esi], ecx
jne SHORT $LL3@routine1
$LN1@routine1:
pop edi
pop esi
pop ebx
; Line 56
ret 0
I attached a simple testbed. I'm sure there are better ways of doing profiling; this was as simple as I could come up with.
[attachment deleted by admin]
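A minimal harness along these lines is enough to compare versions (the buffer size and iteration count are arbitrary, and count is taken as a byte count, as the count/8 in hash_buffer suggests):

#include <windows.h>
#include <stdio.h>

void hash_buffer(ULONG * buf, unsigned int count, ULONG * out, ULONG * table);

#define BUF_BYTES   65536      /* arbitrary buffer size  */
#define ITERATIONS  10000      /* arbitrary repeat count */

static ULONG buf[BUF_BYTES / sizeof(ULONG)];
static ULONG table[12] = { 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 51 };

int main(void)
{
    LARGE_INTEGER freq, t0, t1;
    ULONG out[2];
    int i;

    QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&t0);
    for (i = 0; i < ITERATIONS; i++)
        hash_buffer(buf, BUF_BYTES, out, table);
    QueryPerformanceCounter(&t1);

    printf("hash %08lX, %.4f ms per pass\n", out[0],
           1000.0 * (double)(t1.QuadPart - t0.QuadPart) / (double)freq.QuadPart / ITERATIONS);
    return 0;
}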
A CPU routine that simulates mul but only calculates the lower dword (eax)
instead of the normal edx:eax pair uses four registers instead of two.
The result isn't affected by which register, ebx or edx, gets b or a.
For the early out to trigger sooner, the operand whose lowest set bit
is in the highest position should go in edx; likewise, if either operand
is zero, it should go in edx.
    mov ebx, b
    mov eax, 0      ; eax will hold the 32-bit result
    mov edx, a
    mov ecx, 32     ; at most 32 bits to process
more:
    shr ebx, 1      ; bit 0 of ebx -> CF
    jnc skip        ; CF == 0, no add for this bit
    add eax, edx
skip:
    shl edx, 1      ; next bit weight
    jz done         ; early out once edx == 0 (this check can be removed)
    dec ecx
    jne more
done:
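The same idea in plain C, for reference (only the low 32 bits are kept, and the loop stops once the shifted multiplicand runs out of set bits):

#include <stdint.h>

static uint32_t mul_lo32(uint32_t a, uint32_t b)
{
    uint32_t result = 0;
    while (b != 0) {          /* consume multiplier bits, low to high */
        if (b & 1)
            result += a;      /* add the current weight of a */
        a <<= 1;              /* next weight; wraps mod 2^32 like shl */
        if (a == 0)
            break;            /* early out: further adds would add nothing */
        b >>= 1;
    }
    return result;
}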
I'm working on an MMX version; it works but is very large
and requires extensive setup and post-processing.
Dsouza -
I'm pretty sure that pmaddwd + pshufw would work; I just couldn't quite wrap my head around using punpck/pack to do it in MMX in a way that would work everywhere.
Since imul r, mem only calculates the low 32 bits anyway, I can't see why this would help?
Code using SSE2 (untested); the shuffle immediates are written in base 4 for clarity.
It is possible to keep it all within SSE2 registers until the end, when storing to owt,
removing memory (cache) access latency; it just needs a little tweaking.
Even inn and owt could be held in xmm3:
just load inn, owt, tab at the start,
do all the calculations,
then store owt at the end.
.data
ALIGN 16
inn dd 123, 456
owt dd 789, 0
tab dd 12 dup (123456)
.code
mov eax, owt
add eax, inn
; mov edx, tab+10*4
movdqa xmm0, tab
movdqa xmm1, tab+16
movdqa xmm2, tab+32
pxor xmm4, xmm4
movd xmm4, eax ; does zeroing elements 3,2,1 matter?
pmuludq xmm4, xmm0 ; v*t0
pshuflw xmm4, xmm4, 3201 ; swap low words low quad
pshufd xmm0, xmm0, 3201
pmuludq xmm4, xmm0 ; v*t1
pshuflw xmm4, xmm4, 3201 ; swap low words low quad
pshufd xmm0, xmm0, 3102
pmuludq xmm4, xmm0 ; v*t2
pshuflw xmm4, xmm4, 3201 ; swap low words low quad
pshufd xmm0, xmm0, 2103
pmuludq xmm4, xmm0 ; v*t3
pshuflw xmm4, xmm4, 3201 ; swap low words low quad
pmuludq xmm4, xmm1 ; v*t4
pshufd xmm1, xmm1, 3201
; movd xmm6, edx ; t10 could be gotten from xmm2, x2xx
pshufd xmm6, xmm2, 3012 ; t10 and so it is ;)
paddd xmm4, xmm6 ; v+t10
movd edx, xmm4 ; v
add owt+4, edx ; o1+v
mov eax, inn+4
movd xmm6, eax ; i1
paddd xmm4, xmm6 ; v+i1
pmuludq xmm4, xmm1 ; v*t5
pshuflw xmm4, xmm4, 3201 ; swap low words low quad
pshufd xmm1, xmm1, 3102
pmuludq xmm4, xmm1 ; v*t6
pshuflw xmm4, xmm4, 3201 ; swap low words low quad
pshufd xmm1, xmm1, 2103
pmuludq xmm4, xmm1 ; v*t7
pshuflw xmm4, xmm4, 3201 ; swap low words low quad
pmuludq xmm4, xmm2 ; v*t8
pshuflw xmm4, xmm4, 3201 ; swap low words low quad
pshufd xmm2, xmm2, 3201
pmuludq xmm4, xmm2 ; v*t9
pshufd xmm2, xmm2, 1203
paddd xmm4, xmm2
movd eax, xmm4
add owt+4, eax
mov owt, eax
The CPU code from my previous post was an alternative to imul, not mul;
it only calculated the low dword of a 32 x 32 multiply. It wasn't tested
for speed versus imul, but it is probably slower.
nice work