News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Summing up bytes into a word

Started by King Nak, April 29, 2007, 01:34:42 PM

Previous topic - Next topic

King Nak

Hi there!

I have a problem and am searching for the fastest solution:
I have a 256-bytes array in memory and want to sum up, let's say, the first 135 bytes in it.
But the result should not be stored in a byte (overflow...), I need the full count.
The only way I see is the following:

xor eax, eax
mov ecx, [count]
counter_loop:
movzx ebx, BYTE PTR [esi]
add eax, ebx
inc esi
loop counter_loop


But I don't think it's the best solution. Sure, some loop unrolling can be done,
but isn't there a better approach? like using MMX or something...

Thanks in advance
bye
King Nak

Jimg

#1
Well, here's some to play with-
.data
dtst db 256 dup (0)
dd 0 ; guard bytes
tans dd 0
dans dd 256*255
.code
setup proc
mov edi,offset dtst
mov ecx,256
mov edx,0
mov eax,0
.repeat
add edx,eax
stosb
inc eax
dec ecx
.until ecx==0
mov dans,edx
ret
setup EndP
Doit1:       ;---------------------------------------------------------------------------
Desc 'movzx, add'
call setup
BeginCounter
mov esi,offset dtst
xor eax, eax
mov ecx, 256
counter_loop:
movzx ebx, BYTE PTR [esi]
add eax, ebx
inc esi
loop counter_loop
mov tans,eax
EndCounter
mov eax,tans
.if eax!=dans
msg "error"
.endif
ret

Doit2:       ;---------------------------------------------------------------------------
Desc 'movzx, add, no LOOP'
call setup
BeginCounter
mov esi,offset dtst
xor eax, eax
mov ecx, 256
.repeat
movzx ebx, BYTE PTR [esi]
add eax, ebx
inc esi
dec ecx
.until ecx==0
mov tans,eax
EndCounter
mov eax,tans
.if eax!=dans
msg "error"
.endif
ret

Doit3:       ;---------------------------------------------------------------------------
Desc '2 byte add'
call setup
BeginCounter
mov esi,offset dtst+255-3
xor eax, eax
.repeat
mov edx,[esi]
and edx,0FF00FFh
add eax,edx
sub esi,4
.until esi<offset dtst

mov esi,offset dtst+255-2
.repeat
mov edx,[esi]
and edx,0FF00FFh
add eax,edx
sub esi,4
.until esi<offset dtst
mov edx,eax
shr eax,16
and edx,0FFFFh
add eax,edx
mov tans,eax
EndCounter
mov eax,tans
.if eax!=dans
msg "error"
.endif
ret

.data
mmxmask db 255,00,255,00,255,0,255,0
qans dq 0
.code
Doit4:       ;---------------------------------------------------------------------------
Desc 'mmx'
call setup
BeginCounter
emms
mov esi,offset dtst+256-8
pxor mm0,mm0
mov edi,offset mmxmask
movq mm2,[edi]
.repeat
movq mm1,[esi]
pand mm1,mm2
paddw mm0,mm1
sub esi,8
.until esi<offset dtst

mov esi,offset dtst+256-7
.repeat
movq mm1,[esi]
pand mm1,mm2
paddw mm0,mm1
sub esi,8
.until esi<offset dtst
movq qans,mm0
movq mm1,qans
movzx eax,word ptr qans
movzx edx,word ptr [qans+2]
add eax,edx
movzx edx,word ptr [qans+4]
add eax,edx
movzx edx,word ptr [qans+6]
add eax,edx
mov tans,eax

EndCounter
mov eax,tans
.if eax!=dans
msg "error"
.endif
ret


Doit1 is your starting point.  et=1124 cycles on my computer
Doit2 is same but manually doing loop (LOOP opcode is slow on later computers)  et=646 cycles
Doit3 is adding two bytes at a time   et=492 cycles
Doit4 is using mmx  doing 4 bytes at a time.  et=247 cycles.

Perhaps a moderator will move this to the laboratory so you can get some real guru's to help. :wink

Edit: removed file, newer one below.


drizz

here's mineĀ  :eek

.code
align 16
SumBytes proc stdcall _pByteArray:ptr byte,_dwCount:dword
option prologue:none
option epilogue:none
pByteArray equ <dword ptr [esp+1*4][4]>
dwCount     equ <dword ptr [esp+2*4][4]>
push ebx
mov ecx,dwCount
xor eax,eax
mov edx,ecx
shr ecx,3
mov ebx,pByteArray
jz @F
pxor mm2,mm2
pxor mm3,mm3
.repeat
movq mm0,[ebx]
add ebx,8
movq mm1,mm0
punpcklbw mm0,mm2
punpckhbw mm1,mm2
paddw mm3,mm0
paddw mm3,mm1
sub ecx,1
.until zero?
movq mm0,mm3
punpcklwd mm0,mm2
punpckhwd mm3,mm2
paddd mm0,mm3
movd ecx,mm0
add eax,ecx
psrlq mm0,32
movd ecx,mm0
add eax,ecx
@@: and edx,7
jz @F
.repeat
movzx ecx,byte ptr[ebx]
add eax,ecx
add ebx,1
sub edx,1
.until zero?
@@: pop ebx
ret 2*4
option prologue:prologuedef
option epilogue:epiloguedef
SumBytes endp


.data
testcnt equ 135
xi =0
yi =0
align 16
myarray label byte
rept 256
db xi
if xi lt testcnt
yi = yi +xi
endif
xi = xi +1
endm
testmy dd yi
.code
start:
invoke SumBytes,addr myarray,testcnt
.if eax != testmy
; error
.endif

EDIT: slightly modified
The truth cannot be learned ... it can only be recognized.

Jimg

And a couple more:
Doit6:       ;---------------------------------------------------------------------------
Desc 'mmx 1loop'
call setup
BeginCounter
emms
mov esi,offset dtst+256-8
pxor mm0,mm0
mov edi,offset mmxmask
movq mm2,[edi]
.repeat
movq mm1,[esi]
pand mm1,mm2
paddw mm0,mm1
movq mm3,[esi+1]
pand mm3,mm2
paddw mm0,mm3
sub esi,8
.until esi<offset dtst

movq qans,mm0
movq mm1,qans
movzx eax,word ptr qans
movzx edx,word ptr [qans+2]
add eax,edx
movzx edx,word ptr [qans+4]
add eax,edx
movzx edx,word ptr [qans+6]
add eax,edx
mov tans,eax

EndCounter
mov eax,tans
.if eax!=dans
msg "error"
.endif
ret


Doit7:       ;---------------------------------------------------------------------------
Desc 'mmx 2 interleaved'
call setup
BeginCounter
emms
mov esi,offset dtst+256-16
pxor mm0,mm0
mov edi,offset mmxmask
movq mm2,[edi]
.repeat
movq mm1,[esi]
movq mm3,[esi+1]
movq mm4,[esi+8]
movq mm5,[esi+9]
pand mm1,mm2
pand mm3,mm2
pand mm4,mm2
pand mm5,mm2
paddw mm0,mm1
paddw mm4,mm5
paddw mm0,mm3
paddw mm0,mm4
sub esi,16
.until esi<offset dtst

movq qans,mm0
movq mm1,qans
movzx eax,word ptr qans
movzx edx,word ptr [qans+2]
add eax,edx
movzx edx,word ptr [qans+4]
add eax,edx
movzx edx,word ptr [qans+6]
add eax,edx
mov tans,eax

EndCounter
mov eax,tans
.if eax!=dans
msg "error"
.endif
ret


driz - 171 cycles
doit 6 - 172 cycles
doit 7 - 151 cycles


[attachment deleted by admin]

dsouza123

Only works if count is a multiple of 4, uses only ALU registers.


   mov edi, 0ff00ffh
   xor eax, eax
   mov ecx, [count]
   shr ecx, 2
@@:
   mov edx, dword ptr [esi]
   mov ebx, edx
   and edx, edi
   bswap ebx
   add eax, edx
   and ebx, edi
   add eax, ebx
   lea esi, [esi+4]
   dec ecx
   jnz @B

   mov edx, eax
   movzx eax, ax
   shr edx, 16
   add eax, edx

Jimg

Quote from: dsouza123 on April 29, 2007, 10:16:20 PM
Only works if count is a multiple of 4, uses only ALU registers.

Yeah, I went back an reread the original request.  He wanted the first 135 bytes of 256 so all mine are no good for his purposes, but it was fun!

King Nak

Thank you all!

I studied now all the proposals, and did some test on my machine too.
Seems like I have a older machine, since the DoIt1 and DoIt2 perform almost the same,
despite the LOOP...

nethertheless, I've already though about a solution with interleaving, but I didn't neither
think about interleaving by myself, nor that you could insert 0-Bytes/Words with punpcklbw et al.

I think I will use drizz's solution, since it is the only one that really solves my problem, and
also performs best on my computer...

Thank you very much

bye
King Nak

zooba

Quote from: Jimg on April 30, 2007, 01:55:20 AM
Quote from: dsouza123 on April 29, 2007, 10:16:20 PM
Only works if count is a multiple of 4, uses only ALU registers.

Yeah, I went back an reread the original request. He wanted the first 135 bytes of 256 so all mine are no good for his purposes, but it was fun!

Could always go to 136 and subtract the last number. If the algorithm which requires a multiple of 4 is faster than the rest by at least the time it takes to do a single subtraction, you win :bg

Cheers,

Zooba :U

King Nak

yeah, would be possible. but the MMX version that already returns the correct
sum takes 272 cycles, and the ALU version, which had to be adopted, takes
492 cycles in the current version... both for summing up all 256 elements

bye
King Nak

Jimg

#9
What kind of timing do you get with this one?  Should work with variable Count sizes-
Doit2:       ;---------------------------------------------------------------------------
Desc 'mmx 1 Loop, interleaved fixed'
call setup
BeginCounter
;mov esi,offset dtst+256-16
mov esi,offset dtst
mov ecx,Count
xor eax,eax
shr ecx,4
.if ecx!=0
pxor mm0,mm0
mov edi,offset mmxmask
movq mm2,[edi]
.repeat
movq mm1,[esi]
movq mm3,[esi+1]
pand mm1,mm2
movq mm4,[esi+8]
paddw mm0,mm1
pand mm3,mm2
movq mm5,[esi+9]
pand mm4,mm2
pand mm5,mm2
paddw mm4,mm5
paddw mm0,mm3
paddw mm0,mm4
add esi,16
sub ecx,1
.until zero?
movq qans,mm0
movzx eax,word ptr qans
movzx edx,word ptr [qans+2]
add eax,edx
movzx edx,word ptr [qans+4]
add eax,edx
movzx edx,word ptr [qans+6]
add eax,edx
.endif
mov ecx,Count
and ecx,0Fh ; remainder to do, up to 15 bytes.
.if !Zero?
.repeat
movzx edx,byte ptr[esi]
add eax,edx
inc esi
sub ecx,1
.until zero?
.endif

mov tans,eax

EndCounter
mov eax,tans
.if eax!=dans
msg "error"
.endif
mov eax,TestNum
add eax,6000
inv SetDlgItemInt,hWin,eax,tans,0
ret

edit: a few small changes.  File updated a couple posts down-

drizz

and here is something fast, totally unrolled.

:8)


align 16
SumBytesUnrolled proc stdcall _pByteArray:ptr byte,_dwCount:dword
option prologue:none
option epilogue:none
pByteArray equ <dword ptr [esp+1*4]>
dwCount    equ <dword ptr [esp+2*4]>
pxor mm0,mm0
pxor mm1,mm1
movd mm4,edi
movd mm5,esi
mov edx,dwCount
mov esi,pByteArray
mov ecx,edx
shr ecx,3
and edx,7
lea edi,[esi+ecx*8]
add ecx,-32
add edx,-8
; 256 is to force DWORD displacement
lea esi,[esi-256+ecx*8]
lea edi,[edi+edx-1]
neg ecx
neg edx
imul ecx,sdword ptr block_size8
imul edx,sdword ptr block_size1
add ecx,offset block_start8
add edx,offset block_start1
jmp ecx
align 4
xi = 0
block_start8:
movq mm2,[esi+256+xi*8]
movq mm3,mm2
punpcklbw mm2,mm1
punpckhbw mm3,mm1
paddw mm0,mm2
paddw mm0,mm3
block_size8 = ($-block_start8)
rept 32-1
xi = xi + 1
movq mm2,[esi+256+xi*8]
movq mm3,mm2
punpcklbw mm2,mm1
punpckhbw mm3,mm1
paddw mm0,mm2
paddw mm0,mm3
endm
movq mm2,mm0
punpcklwd mm0,mm1
punpckhwd mm2,mm1
paddd mm0,mm2
movq mm2,mm0
punpckldq mm2,mm1
punpckhdq mm0,mm1
paddd mm0,mm2
movd eax,mm0
jmp edx
xi = 1
align 4
block_start1:
movzx ecx,byte ptr [edi+xi]
add eax,ecx
block_size1 = ($-block_start1)
rept 8-1
xi = xi + 1
movzx ecx,byte ptr [edi+xi]
add eax,ecx
endm
movd edi,mm4
movd esi,mm5
ret 2*4
option prologue:prologuedef
option epilogue:epiloguedef
SumBytesUnrolled endp
The truth cannot be learned ... it can only be recognized.

Jimg

Strangely it didn't seem to make much difference on my crummy old athlon.  Maybe an alignment issue?

Well, this is certainly disappointing.  I dusted off my old celeron laptop and tried it.  My attempt is MUCH slower on the celeron.

Athon results:

Driz1: 120 cycles
Mine:  89 cycles
Driz2: 115 cycles

Celeron results:

Driz1: 104 cycles
Mine: 240 cycles
Driz2: 104 cycles


Such an incredible difference????  I guess I'm done here :(


[attachment deleted by admin]

drizz

this is what i get on my P3
143
129
93
The truth cannot be learned ... it can only be recognized.