Summing up bytes into a word

King Nak · April 29, 2007, 01:34:42 PM

Hi there!

I have a problem and am searching for the fastest solution:
I have a 256-bytes array in memory and want to sum up, let's say, the first 135 bytes in it.
But the result should not be stored in a byte (overflow...), I need the full count.
The only way I see is the following:

Code Select


xor eax, eax
mov ecx, [count]
counter_loop:
movzx ebx, BYTE PTR [esi]
add eax, ebx
inc esi
loop counter_loop

But I don't think it's the best solution. Sure, some loop unrolling can be done,
but isn't there a better approach? like using MMX or something...

Thanks in advance
bye
King Nak

Jimg · April 29, 2007, 06:55:39 PM

Well, here's some to play with-

Code Select

.data
dtst db 256 dup (0)
	 dd 0	; guard bytes
tans dd 0
dans dd 256*255
.code
setup proc
	mov edi,offset dtst
	mov ecx,256
	mov edx,0
	mov eax,0
	.repeat
		add edx,eax
		stosb
		inc eax
		dec ecx
	.until ecx==0
	mov dans,edx
	ret
setup EndP
Doit1:       ;---------------------------------------------------------------------------
	Desc 'movzx, add'
	call setup
	BeginCounter
		mov esi,offset dtst
		xor eax, eax
		mov ecx, 256
		counter_loop:
		movzx ebx, BYTE PTR [esi]
		add eax, ebx
		inc esi
		loop counter_loop
		mov tans,eax
	EndCounter
	mov eax,tans
	.if eax!=dans
		msg "error"
	.endif
	ret

Doit2:       ;---------------------------------------------------------------------------
	Desc 'movzx, add, no LOOP'
	call setup
	BeginCounter
		mov esi,offset dtst
		xor eax, eax
		mov ecx, 256
		.repeat
			movzx ebx, BYTE PTR [esi]
			add eax, ebx
			inc esi
			dec ecx
		.until ecx==0
		mov tans,eax
	EndCounter
	mov eax,tans
	.if eax!=dans
		msg "error"
	.endif
	ret

Doit3:       ;---------------------------------------------------------------------------
	Desc '2 byte add'
	call setup
	BeginCounter
		mov esi,offset dtst+255-3
		xor eax, eax
		.repeat
			mov edx,[esi]
			and edx,0FF00FFh
			add eax,edx
			sub esi,4
		.until esi<offset dtst

		mov esi,offset dtst+255-2
		.repeat
			mov edx,[esi]
			and edx,0FF00FFh
			add eax,edx
			sub esi,4
		.until esi<offset dtst
		mov edx,eax
		shr eax,16
		and edx,0FFFFh
		add eax,edx
		mov tans,eax
	EndCounter
	mov eax,tans
	.if eax!=dans
		msg "error"
	.endif
	ret

.data
mmxmask db 255,00,255,00,255,0,255,0
qans dq 0
.code
Doit4:       ;---------------------------------------------------------------------------
	Desc 'mmx'
	call setup
	BeginCounter
		emms
		mov esi,offset dtst+256-8
		pxor mm0,mm0
		mov edi,offset mmxmask
		movq mm2,[edi]
		.repeat
			movq mm1,[esi]
			pand mm1,mm2
			paddw mm0,mm1
			sub esi,8
		.until esi<offset dtst

		mov esi,offset dtst+256-7
		.repeat
			movq mm1,[esi]
			pand mm1,mm2
			paddw mm0,mm1
			sub esi,8
		.until esi<offset dtst
		movq qans,mm0
		movq mm1,qans
		movzx eax,word ptr qans
		movzx edx,word ptr [qans+2]
		add eax,edx
		movzx edx,word ptr [qans+4]
		add eax,edx
		movzx edx,word ptr [qans+6]
		add eax,edx
		mov tans,eax
		
	EndCounter
	mov eax,tans
	.if eax!=dans
		msg "error"
	.endif
	ret

Doit1 is your starting point. et=1124 cycles on my computer
Doit2 is same but manually doing loop (LOOP opcode is slow on later computers) et=646 cycles
Doit3 is adding two bytes at a time et=492 cycles
Doit4 is using mmx doing 4 bytes at a time. et=247 cycles.

Perhaps a moderator will move this to the laboratory so you can get some real guru's to help. :wink

Edit: removed file, newer one below.

drizz · April 29, 2007, 07:35:26 PM

here's mine :eek

Code Select


.code
align 16
SumBytes proc stdcall _pByteArray:ptr byte,_dwCount:dword
	option prologue:none
	option epilogue:none
	pByteArray equ <dword ptr [esp+1*4][4]>
	dwCount     equ <dword ptr [esp+2*4][4]>
	push ebx
	mov ecx,dwCount
	xor eax,eax
	mov edx,ecx
	shr ecx,3
	mov ebx,pByteArray
	jz @F
	pxor mm2,mm2
	pxor mm3,mm3
	.repeat
		movq mm0,[ebx]
		add ebx,8
		movq mm1,mm0
		punpcklbw mm0,mm2
		punpckhbw mm1,mm2
		paddw mm3,mm0
		paddw mm3,mm1
		sub ecx,1
	.until zero?
	movq mm0,mm3
	punpcklwd mm0,mm2
	punpckhwd mm3,mm2
	paddd mm0,mm3
	movd ecx,mm0
	add eax,ecx
	psrlq mm0,32
	movd ecx,mm0
	add eax,ecx
@@:	and edx,7
	jz @F
	.repeat
		movzx ecx,byte ptr[ebx]
		add eax,ecx
		add ebx,1	
		sub edx,1
	.until zero?
@@:	pop ebx
	ret 2*4
	option prologue:prologuedef
	option epilogue:epiloguedef
SumBytes endp


.data
testcnt equ 135
xi =0
yi =0
align 16
myarray label byte
rept 256
	db xi
	if xi lt testcnt
		yi = yi +xi
	endif	
	xi = xi +1
endm
testmy dd yi
.code
start:
	invoke SumBytes,addr myarray,testcnt
	.if eax != testmy
		; error 
	.endif

EDIT: slightly modified

Jimg · April 29, 2007, 08:47:06 PM

And a couple more:

Code Select

Doit6:       ;---------------------------------------------------------------------------
	Desc 'mmx 1loop'
	call setup
	BeginCounter
		emms
		mov esi,offset dtst+256-8
		pxor mm0,mm0
		mov edi,offset mmxmask
		movq mm2,[edi]
		.repeat
			movq mm1,[esi]
			pand mm1,mm2
			paddw mm0,mm1
			movq mm3,[esi+1]
			pand mm3,mm2
			paddw mm0,mm3
			sub esi,8
		.until esi<offset dtst

		movq qans,mm0
		movq mm1,qans
		movzx eax,word ptr qans
		movzx edx,word ptr [qans+2]
		add eax,edx
		movzx edx,word ptr [qans+4]
		add eax,edx
		movzx edx,word ptr [qans+6]
		add eax,edx
		mov tans,eax
		
	EndCounter
	mov eax,tans
	.if eax!=dans
		msg "error"
	.endif
	ret


Doit7:       ;---------------------------------------------------------------------------
	Desc 'mmx 2 interleaved'
	call setup
	BeginCounter
		emms
		mov esi,offset dtst+256-16
		pxor mm0,mm0
		mov edi,offset mmxmask
		movq mm2,[edi]
		.repeat
			movq mm1,[esi]
			movq mm3,[esi+1]
			movq mm4,[esi+8]
			movq mm5,[esi+9]
			pand mm1,mm2
			pand mm3,mm2
			pand mm4,mm2
			pand mm5,mm2
			paddw mm0,mm1
			paddw mm4,mm5
			paddw mm0,mm3
			paddw mm0,mm4
			sub esi,16
		.until esi<offset dtst

		movq qans,mm0
		movq mm1,qans
		movzx eax,word ptr qans
		movzx edx,word ptr [qans+2]
		add eax,edx
		movzx edx,word ptr [qans+4]
		add eax,edx
		movzx edx,word ptr [qans+6]
		add eax,edx
		mov tans,eax
		
	EndCounter
	mov eax,tans
	.if eax!=dans
		msg "error"
	.endif
	ret

driz - 171 cycles
doit 6 - 172 cycles
doit 7 - 151 cycles

[attachment deleted by admin]

dsouza123 · April 29, 2007, 10:16:20 PM

Only works if count is a multiple of 4, uses only ALU registers.

Code Select


   mov edi, 0ff00ffh
   xor eax, eax
   mov ecx, [count]
   shr ecx, 2
@@:
   mov edx, dword ptr [esi]
   mov ebx, edx
   and edx, edi
   bswap ebx
   add eax, edx
   and ebx, edi
   add eax, ebx
   lea esi, [esi+4]
   dec ecx
   jnz @B

   mov edx, eax
   movzx eax, ax
   shr edx, 16
   add eax, edx

Jimg · April 30, 2007, 01:55:20 AM

Quote from: dsouza123 on April 29, 2007, 10:16:20 PM
Only works if count is a multiple of 4, uses only ALU registers.

Yeah, I went back an reread the original request. He wanted the first 135 bytes of 256 so all mine are no good for his purposes, but it was fun!

King Nak · April 30, 2007, 11:43:00 AM

Thank you all!

I studied now all the proposals, and did some test on my machine too.
Seems like I have a older machine, since the DoIt1 and DoIt2 perform almost the same,
despite the LOOP...

nethertheless, I've already though about a solution with interleaving, but I didn't neither
think about interleaving by myself, nor that you could insert 0-Bytes/Words with punpcklbw et al.

I think I will use drizz's solution, since it is the only one that really solves my problem, and
also performs best on my computer...

Thank you very much

bye
King Nak

zooba · April 30, 2007, 12:38:44 PM

Quote from: Jimg on April 30, 2007, 01:55:20 AM
Quote from: dsouza123 on April 29, 2007, 10:16:20 PM
Only works if count is a multiple of 4, uses only ALU registers.

Yeah, I went back an reread the original request. He wanted the first 135 bytes of 256 so all mine are no good for his purposes, but it was fun!

Could always go to 136 and subtract the last number. If the algorithm which requires a multiple of 4 is faster than the rest by at least the time it takes to do a single subtraction, you win :bg

Cheers,

Zooba :U

King Nak · April 30, 2007, 01:26:13 PM

yeah, would be possible. but the MMX version that already returns the correct
sum takes 272 cycles, and the ALU version, which had to be adopted, takes
492 cycles in the current version... both for summing up all 256 elements

bye
King Nak

Jimg · April 30, 2007, 03:03:35 PM

What kind of timing do you get with this one? Should work with variable Count sizes-

Code Select

Doit2:       ;---------------------------------------------------------------------------
	Desc 'mmx 1 Loop, interleaved fixed'
	call setup
	BeginCounter
		;mov esi,offset dtst+256-16
		mov esi,offset dtst
		mov ecx,Count
		xor eax,eax
		shr ecx,4
		.if ecx!=0
			pxor mm0,mm0
			mov edi,offset mmxmask
			movq mm2,[edi]
			.repeat
				movq mm1,[esi]
				movq mm3,[esi+1]
				pand mm1,mm2
				movq mm4,[esi+8]
				paddw mm0,mm1
				pand mm3,mm2
				movq mm5,[esi+9]
				pand mm4,mm2
				pand mm5,mm2
				paddw mm4,mm5
				paddw mm0,mm3
				paddw mm0,mm4
				add esi,16
				sub ecx,1
			.until zero?
			movq qans,mm0
			movzx eax,word ptr qans
			movzx edx,word ptr [qans+2]
			add eax,edx
			movzx edx,word ptr [qans+4]
			add eax,edx
			movzx edx,word ptr [qans+6]
			add eax,edx
		.endif
		mov ecx,Count
		and ecx,0Fh		; remainder to do, up to 15 bytes.
		.if !Zero?
			.repeat
				movzx edx,byte ptr[esi]
				add eax,edx
				inc esi
				sub ecx,1
			.until zero?
		.endif

		mov tans,eax
	
	EndCounter
	mov eax,tans
	.if eax!=dans
		msg "error"
	.endif
	mov eax,TestNum
	add eax,6000
	inv SetDlgItemInt,hWin,eax,tans,0
	ret

edit: a few small changes. File updated a couple posts down-

drizz · April 30, 2007, 07:53:42 PM

and here is something fast, totally unrolled.

:8)

Code Select


align 16
SumBytesUnrolled proc stdcall _pByteArray:ptr byte,_dwCount:dword
	option prologue:none
	option epilogue:none
	pByteArray equ <dword ptr [esp+1*4]>
	dwCount    equ <dword ptr [esp+2*4]>
	pxor mm0,mm0
	pxor mm1,mm1
	movd mm4,edi
	movd mm5,esi
	mov edx,dwCount
	mov esi,pByteArray
	mov ecx,edx
	shr ecx,3
	and edx,7
	lea edi,[esi+ecx*8]
	add ecx,-32
	add edx,-8
	; 256 is to force DWORD displacement
	lea esi,[esi-256+ecx*8]
	lea edi,[edi+edx-1]
	neg ecx
	neg edx
	imul ecx,sdword ptr block_size8
	imul edx,sdword ptr block_size1
	add ecx,offset block_start8
	add edx,offset block_start1
	jmp ecx
	align 4
	xi = 0
block_start8:
	movq mm2,[esi+256+xi*8]
	movq mm3,mm2
	punpcklbw mm2,mm1
	punpckhbw mm3,mm1
	paddw mm0,mm2
	paddw mm0,mm3
block_size8 = ($-block_start8)
	rept 32-1
	xi = xi + 1
	movq mm2,[esi+256+xi*8]
	movq mm3,mm2
	punpcklbw mm2,mm1
	punpckhbw mm3,mm1
	paddw mm0,mm2
	paddw mm0,mm3
	endm
	movq mm2,mm0
	punpcklwd mm0,mm1
	punpckhwd mm2,mm1
	paddd mm0,mm2
	movq mm2,mm0
	punpckldq mm2,mm1
	punpckhdq mm0,mm1
	paddd mm0,mm2
	movd eax,mm0
	jmp edx
	xi = 1
	align 4
block_start1:
	movzx ecx,byte ptr [edi+xi]
	add eax,ecx
block_size1 = ($-block_start1)
	rept 8-1
	xi = xi + 1
	movzx ecx,byte ptr [edi+xi]
	add eax,ecx
	endm
	movd edi,mm4
	movd esi,mm5
	ret 2*4
	option prologue:prologuedef
	option epilogue:epiloguedef
SumBytesUnrolled endp

Jimg · April 30, 2007, 08:20:10 PM

Strangely it didn't seem to make much difference on my crummy old athlon. Maybe an alignment issue?

Well, this is certainly disappointing. I dusted off my old celeron laptop and tried it. My attempt is MUCH slower on the celeron.

Athon results:

Driz1: 120 cycles
Mine: 89 cycles
Driz2: 115 cycles

Celeron results:

Driz1: 104 cycles
Mine: 240 cycles
Driz2: 104 cycles

Such an incredible difference???? I guess I'm done here :(

[attachment deleted by admin]

drizz · April 30, 2007, 10:02:21 PM

this is what i get on my P3

Code Select

143
129
93

News:

Summing up bytes into a word

King Nak

dsouza123

King Nak

King Nak