Hello.
I have graphics code whose bottleneck is copying a buffer of 24-bit pixels to a buffer of reversed pixels. That is, the source buffer is RGB and the destination is to be filled with BGR pixels.
My current scheme is to read 32-bits (x,R,G,B), swap and shift the bytes, and write out 32-bits (B,G,R,0). That trailing 0 byte is garbage and is overwritten in the next loop, or by the next pixel in the same loop. The source and dest pointers are incremented by 3 to advance past the valid 3 bytes just handled. I've unrolled this to handle 3 pixels per loop, to avoid back-to-back operations on the same register. The current code is shown below. It works, but there's a couple of problems with this code from a performance perspective.
First, there's the mis-alignment. The destination of the write is only aligned on every 4th write, since the pointers are being advanced by 3 bytes per pixel.
Second, it bothers me that 8 of every 32 bits I write are wasted, since that byte is overwritten by the next pixel to be copied. It's bad enough that the writes are so often misaligned, but throwing away that expensive data irks me.
I know that this code will always be run on machines that are SSE-capable, but my attempts to go the SIMD route have all resulted in code that is slower than generic x86 handling.
I continue to think that there must be a way to gather 3 4-byte reads and output 4 3-byte pixels, but I haven't been able to think of a way that is more efficient than writing 3 valid bytes and a single garbage byte.
Any idea on how better to reverse-copy a buffer of 24-bit pixels?
Thanks.
// Copies `len` 24-bit pixels from `s` to `d`, reversing the byte order of
// each pixel (RGB -> BGR).
//   s   = address of RGB source buffer      (len * 3 bytes are read)
//   d   = address of BGR destination buffer (len * 3 bytes are written)
//   len = number of RGB pixels to copy; values <= 0 copy nothing
//
// The bulk loop handles 3 pixels per pass with overlapping 32-bit loads and
// stores: each store writes 3 valid bytes plus one scratch byte that the
// next pixel's store overwrites. The last 32-bit access of a pass touches
// the first byte of the pixel *after* the three being converted, so the loop
// only runs while at least 4 pixels remain. The byte-wise tail loop then
// finishes the remaining 1..3 pixels without touching memory outside either
// buffer.
__forceinline void swap3basm(char *d, char *s, int len)
{
    _asm {
        push  ebx
        mov   ecx, len
        mov   esi, s
        mov   edi, d
        test  ecx, ecx
        jle   done                      // zero or negative count: nothing to copy
        cmp   ecx, 4
        jb    singles                   // 1..3 pixels: byte-wise loop only
        align 16
    bulkcopy:
        sub   ecx, 3
        mov   eax, dword ptr [esi  ]    // bytes R,G,B,x  (x = first byte of next pixel)
        mov   ebx, dword ptr [esi+3]
        mov   edx, dword ptr [esi+6]
        bswap eax                       // x,B,G,R
        bswap ebx
        bswap edx
        shr   eax, 8                    // B,G,R,0
        shr   ebx, 8
        shr   edx, 8
        mov   dword ptr [edi  ], eax    // 4th byte is scratch, overwritten by the next store
        mov   dword ptr [edi+3], ebx
        mov   dword ptr [edi+6], edx
        add   esi, 9
        add   edi, 9
        cmp   ecx, 4                    // keep >= 1 pixel for the tail so [edi+9] stays in bounds
        jae   bulkcopy
    singles:                            // here 1 <= ecx <= 3
        mov   al, byte ptr [esi  ]
        mov   bl, byte ptr [esi+1]
        mov   dl, byte ptr [esi+2]
        mov   byte ptr [edi+2], al
        mov   byte ptr [edi+1], bl
        mov   byte ptr [edi  ], dl
        add   esi, 3
        add   edi, 3
        dec   ecx
        jnz   singles
    done:
        pop   ebx
    }
}
The following code will change RGB in EAX/EBX/ECX to BGR in EBX/ECX/EDX:
; Inputs: EAX, EBX, ECX (three dwords of packed RGB); results are left in EBX, ECX, EDX.
; SHRD dst, src, n computes dst = (dst >> n) | (src << (32 - n)).
; NOTE(review): a test later in this thread that loads EAX/EBX/ECX from three
; consecutive memory dwords and stores EBX/ECX/EDX back reports output that is
; not in BGR order -- confirm the intended register/byte ordering before use.
mov edx, ecx ; edx = copy of ecx
bswap edx ; reverse the byte order of edx
shrd ecx, ebx, 16 ; ecx = (ecx >> 16) | (ebx << 16)
bswap ecx ; reverse the byte order of ecx
shrd edx, ecx, 8 ; edx = (edx >> 8) | (ecx << 24) -- edx is final here
shl ecx, 8 ; drop ecx's top byte, zero its low byte
shrd ebx, eax, 8 ; ebx = (ebx >> 8) | (eax << 24)
bswap ebx ; reverse the byte order of ebx
shrd ecx, ebx, 16 ; ecx = (ecx >> 16) | (ebx << 16) -- ecx is final here
bswap eax ; reverse the byte order of eax
shl ebx, 8 ; drop ebx's top byte, zero its low byte
shrd ebx, eax, 24 ; ebx = (ebx >> 24) | (eax << 8) -- ebx is final here
I don't know if this is faster; however, it does eliminate the misaligned reads/writes. It can probably be optimized further, and using MMX or SSE is probably faster than this.
Hmmmm... I don't see how this works. Maybe I'm misunderstanding your intent.
This is the start of the input string (source buffer):
0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x20, 0x21, 0x22, 0x30, 0x31, 0x32,
My code above generates this output (start of dest buffer):
0x02 0x01 0x00 0x12 0x11 0x10 0x22 0x21 0x20 0x32 0x31 0x30
Using your suggested code (see below), I get this output:
0x20 0x10 0x02 0x01 0x11 0x32 0x00 0x21 0x31 0x30 0x22 0x12
I took your brief comment to mean that EAX:EBX:ECX == RGBRGBRGBRGB and that the output, after swapping, would be EBX:ECX:EDX == BGRBGRBGRBGR. That apparently is not the case.
Would you mind elaborating?
Thanks.
// Copies `len` 24-bit pixels from `s` to `d`, reversing the byte order of
// each pixel (RGB -> BGR).
//   d   = destination buffer (len * 3 bytes are written)
//   s   = source buffer      (len * 3 bytes are read)
//   len = number of pixels; values <= 0 copy nothing
//
// The bulk loop converts 4 pixels (12 bytes) per pass using three aligned-size
// dword loads and three dword stores, so it never touches bytes outside the
// 12 it is converting. It regroups the bytes with byte-register moves and
// rotates; this replaces the earlier SHRD/BSWAP sequence, which produced the
// scrambled output quoted above for the sample input.
// Byte traces in the comments list register bytes low-to-high (memory order)
// for source bytes lettered ABCD EFGH IJKL; the target is CBAF EDIH GLKJ.
void swap3b2(char *d, char *s, int len)
{
    _asm {
        mov   esi, s
        mov   edi, d
        cmp   len, 0
        jle   done                      // zero or negative count: nothing to copy
        cmp   len, 4
        jb    singtest
        align 16
    bulkcopy:
        sub   len, 4
        mov   eax, dword ptr [esi  ]    // eax = ABCD
        mov   ebx, dword ptr [esi+4]    // ebx = EFGH
        mov   ecx, dword ptr [esi+8]    // ecx = IJKL
        bswap eax                       // eax = DCBA
        mov   edx, eax                  // keep D reachable as dl
        mov   al, bh                    // eax = FCBA
        ror   eax, 8                    // eax = CBAF (done)
        mov   dword ptr [edi  ], eax
        ror   ebx, 8                    // ebx = FGHE
        mov   eax, ebx                  // keep G reachable as ah
        mov   bh, cl                    // ebx = FIHE
        mov   bl, dl                    // ebx = DIHE
        rol   ebx, 8                    // ebx = EDIH (done)
        mov   dword ptr [edi+4], ebx
        bswap ecx                       // ecx = LKJI
        shl   ecx, 8                    // ecx = 0LKJ
        mov   cl, ah                    // ecx = GLKJ (done)
        mov   dword ptr [edi+8], ecx
        add   esi, 12
        add   edi, 12
        cmp   len, 4
        jae   bulkcopy
    singtest:
        cmp   len, 1
        jb    done
        mov   ecx, len
        align 16
    singles:
        mov   ax, word ptr [esi  ]      // al = R, ah = G
        mov   bl, byte ptr [esi+2]      // bl = B
        ror   ax, 8                     // al = G, ah = R
        mov   byte ptr [edi  ], bl
        mov   word ptr [edi+1], ax      // writes G then R
        add   esi, 3
        add   edi, 3
        dec   ecx
        jnz   singles
    done:
    }
}
;-----------------------------------------------------------------------
; SwapRGB_4pix - reverse the byte order of four packed 24-bit pixels,
;                in place.
; In:    esi -> 12 bytes (4 pixels)
; Out:   the 12 bytes at [esi] are rewritten; esi is unchanged
; Clobb: eax, ecx, edx, flags
;        ebx is used as scratch but saved/restored through USES, since it
;        is a callee-saved register under the usual Win32 conventions.
; Byte traces list register bytes low-to-high (memory order):
;        input   ABCD EFGH IJKL
;        output  CBAF EDIH GLKJ
;-----------------------------------------------------------------------
SwapRGB_4pix proc uses ebx
        mov     eax, [esi+0]            ; eax = ABCD
        mov     ebx, [esi+4]            ; ebx = EFGH
        mov     ecx, [esi+8]            ; ecx = IJKL

        ; [ABCD -> CBAF]
        bswap   eax                     ; eax = DCBA
        mov     edx, eax                ; keep D reachable as dl
        mov     al, bh                  ; eax = FCBA
        ror     eax, 8                  ; eax = CBAF, done

        ; [EFGH -> EDIH]
        ror     ebx, 8                  ; ebx = FGHE
        mov     [esi+0], eax
        mov     eax, ebx                ; keep G reachable as ah
        mov     bh, cl                  ; ebx = FIHE
        mov     bl, dl                  ; ebx = DIHE
        rol     ebx, 8                  ; ebx = EDIH, done

        ; [IJKL -> GLKJ]
        bswap   ecx                     ; ecx = LKJI
        shl     ecx, 8                  ; ecx = 0LKJ
        mov     cl, ah                  ; ecx = GLKJ, done
        mov     [esi+4], ebx
        mov     [esi+8], ecx
        ret                             ; assembler-generated epilogue restores ebx
SwapRGB_4pix endp
; Demo driver: runs SwapRGB_4pix once over a 12-character sample string.
main proc
        mov     esi, T("ABCDEFGHIJKL")  ; presumably T() yields the address of the literal -- macro defined elsewhere
        call    SwapRGB_4pix            ; converts the 12 bytes at [esi] in place
        prints  esi                     ; this outputs "CBAFEDIHGLKJ"
        ret
main endp
Takes.... 3 cycles to convert 4 pixels on my Sempron3000+ . :) (add 4 cycles for the looping)
swsnyder,
I can't check the code now, while at work. It produced the correct output last night at home (unless my perceptions were altered by my fatigue at that late hour). In any case, Ultrano's optimized version is much better than mine.
Thanks, Ultrano!
I tried making an SSE2 variant on this one, but it's way slower than general-purpose code. (currently 3 cycles per pixel)
EDIT: Now that I timed ultrano's code, my sse code seems to be faster though. So, forget what I said.
Ultrana: your code needs about 3 cycles per pixel, at least in my benchmark. Not per 4 pixels. (originally, it was 4 cycles per pixel, but adding prefetch code improved its speed)
As soon as I finish my sse code, I'll post it.
on what cpu? I presume a P4. :lol
This is the benchmark I did:
;-----------------------------------------------------------------------
; Benchmarking macros
;
; START_TEST opens a timed loop of TEST_ITERS passes; END_TEST closes it
; and records the average pass time. Code placed between the two must
; leave ecx (the pass counter) and the stack balanced, because the
; starting time-stamp value stays on the stack until END_TEST pops it.
;-----------------------------------------------------------------------
TEST_ITERS      equ     10000000        ; passes through each timed loop
TEST_ID         =       0               ; bumped per START_TEST so every loop label is unique

; Opens a timed loop. Clobbers eax, edx (rdtsc) and ecx (pass counter).
START_TEST macro
        TEST_ID = TEST_ID + 1
        rdtsc
        push    eax                     ; low dword of the starting time stamp
        mov     ecx, TEST_ITERS
        align   16
@CatStr(<testlabl>,%TEST_ID,<:>)
endm

; Closes the timed loop opened by the matching START_TEST and stores the
; per-pass average in `where` if it is below the value already there.
; Clobbers eax, ecx, edx, flags.
END_TEST macro where
        dec     ecx
        jnz     @CatStr(<testlabl>,%TEST_ID)
        rdtsc
        pop     edx                     ; starting time stamp (low dword)
        sub     eax, edx                ; eax = elapsed ticks (low 32 bits only)
        mov     ecx, TEST_ITERS
        xor     edx, edx                ; zero the high half of the dividend
        div     ecx                     ; eax = elapsed / TEST_ITERS
        .if eax<where
                mov     where, eax      ; keep the smaller reading
        .endif
endm
;-----------------------------------------------------------------------
;-----------------------------------------------------------------------
; SwapRGB_4pix (benchmark build) - times the in-place 4-pixel byte swap
; over TEST_ITERS passes on the same 12 bytes and prints the per-pass
; average.
; In:    esi -> 12 bytes (4 pixels)
; Clobb: eax, ecx, edx, flags
;        ebx is used as scratch but saved/restored through USES (outside
;        the timed loop), since it is callee-saved under the usual Win32
;        conventions.
; Byte traces list register bytes low-to-high (memory order).
;-----------------------------------------------------------------------
SwapRGB_4pix proc uses ebx
        local   time1
        mov     time1, -1               ; 0FFFFFFFFh, so the first reading always replaces it
        START_TEST
        push    ecx                     ; loop counter; ecx is reused as data below
;comment % <---------- when I uncomment this, it's 9 cycles. Otherwise, 6 cycles
        ;ABCD EFGH IJKL
        ;CBAF EDIH GLKJ
        mov     eax, [esi+0]
        mov     ebx, [esi+4]
        mov     ecx, [esi+8]
        ; byte 0123
        bswap   eax                     ; DCBA
        ; [ABCD->CBAF]
        mov     edx, eax                ; keep D reachable as dl
        mov     al, bh                  ; eax=FCBA
        ror     eax, 8                  ; eax=CBAF, done
        ; [EFGH->EDIH]
        ror     ebx, 8                  ; ebx=FGHE
        mov     [esi+0], eax
        mov     eax, ebx                ; keep G reachable as ah
        mov     bh, cl                  ; ebx=FIHE
        mov     bl, dl                  ; ebx=DIHE
        rol     ebx, 8                  ; ebx=EDIH, done
        ; [IJKL->GLKJ]
        bswap   ecx                     ; ecx=LKJI
        shl     ecx, 8                  ; ecx=0LKJ
        mov     cl, ah                  ; ecx=GLKJ, done
        mov     [esi+4], ebx
        mov     [esi+8], ecx
;%
        pop     ecx                     ; restore the loop counter for END_TEST
        END_TEST time1
        print   time1
        ret                             ; assembler-generated epilogue restores ebx
SwapRGB_4pix endp
; Benchmark driver: hands a 12-character sample string to SwapRGB_4pix,
; then prints the string.
main proc
        mov     esi, T("ABCDEFGHIJKL")  ; presumably T() yields the address of the literal -- macro defined elsewhere
        call    SwapRGB_4pix
        prints  esi
        ret
main endp
Thus, as you see... it's doing work on only 12 bytes, not on a whole megabyte or so. Thus, your code is what he'll need :)
Of all things, "Ultrana"... :red ... it's the female form of names in my language :red
regular and mmx (Pentium MMX or better) version,...i didn't benchmark it.
;plain mmx version: converts 8 pixels (24 bytes) from rgb[] to bgr[].
; Within every 3-byte pixel the first and third bytes trade places (a 16-bit
; shift left or right) and the middle byte stays put. Bytes that a shift
; pushes across a qword boundary are carried into the neighbouring qword
; with a 48-bit shift in the opposite direction.
; Clobbers mm0-mm7. Ends with EMMS so that x87 code can run afterwards; if
; this block is placed in a loop, EMMS is only needed once after the last pass.
.data
align 16
msk1 dq 000FF0000FF0000FFh ; keeps bytes 0, 3, 6 of a qword
msk2 dq 0FF0000FF0000FF00h ; keeps bytes 1, 4, 7
msk3 dq 00000FF0000FF0000h ; keeps bytes 2, 5
.code
.mmx
movq mm0,qword ptr rgb[0*8] ; source bytes 0..7
movq mm1,qword ptr rgb[1*8] ; source bytes 8..15
movq mm2,qword ptr rgb[2*8] ; source bytes 16..23
movq mm7,msk1
; ---- output qword 0 (built in mm3) ----
movq mm3,mm0
movq mm4,mm0
movq mm5,mm0
movq mm6,mm1
pand mm0,mm7;msk1
pand mm3,mm7;msk1
pand mm4,msk2 ; middle bytes, stay in place
pand mm5,msk3
pand mm6,mm7;msk1
psrlq mm0,64-16 ; source byte 6 -> byte 0, carried into output qword 1
psllq mm3,16 ; bytes 0,3 -> 2,5 (byte 6 falls off the top)
psrlq mm5,16 ; bytes 2,5 -> 0,3
psllq mm6,64-16 ; byte 0 of qword 1 -> byte 6
por mm3,mm4
por mm3,mm5
por mm3,mm6
; ---- output qword 1 (built in mm1) ----
movq mm4,mm1
movq mm5,mm1
movq mm6,mm2
pand mm1,mm7;msk1
pand mm4,msk2
psrlq mm1,16 ; bytes 3,6 -> 1,4 (byte 0 falls off the bottom)
pand mm5,msk3 ; middle bytes, stay in place
por mm1,mm0 ; byte carried over from qword 0
pand mm6,msk2
movq mm0,mm4
psllq mm4,16 ; bytes 1,4 -> 3,6 (byte 7 falls off the top)
psllq mm6,64-16 ; byte 1 of qword 2 -> byte 7
psrlq mm0,64-16 ; byte 7 -> byte 1, carried into output qword 2
por mm1,mm4
por mm1,mm5
por mm1,mm6
; ---- output qword 2 (built in mm2) ----
movq mm5,mm2
movq mm6,mm2
pand mm2,mm7;msk1 ; middle bytes, stay in place
pand mm5,msk2
pand mm6,msk3
psrlq mm5,16 ; bytes 4,7 -> 2,5 (byte 1 falls off the bottom)
psllq mm6,16 ; bytes 2,5 -> 4,7
por mm2,mm0 ; byte carried over from qword 1
por mm2,mm5
por mm2,mm6
movq qword ptr bgr[0*8],mm3;BGRBGRBGRBGR
movq qword ptr bgr[1*8],mm1
movq qword ptr bgr[2*8],mm2
emms ; empty the MMX state so the registers are usable by x87 code again
;; reg 32: converts 4 pixels (12 bytes) from rgb[] to bgr[] using only
;; general-purpose registers. Clobbers eax, ebx, ecx and flags.
;; Byte traces list register bytes low-to-high (memory order) for source
;; bytes lettered ABCD EFGH IJKL; the target is CBAF EDIH GLKJ.
        mov     eax, dword ptr rgb[0*4]         ; eax = ABCD
        mov     ebx, dword ptr rgb[1*4]         ; ebx = EFGH
        mov     ecx, dword ptr rgb[2*4]         ; ecx = IJKL

        bswap   eax                             ; eax = DCBA
        xchg    al, bh                          ; eax = FCBA, ebx = EDGH
        ror     eax, 8                          ; eax = CBAF (done)

        bswap   ebx                             ; ebx = HGDE
        xchg    bh, cl                          ; ebx = HIDE, ecx = GJKL
        bswap   ebx                             ; ebx = EDIH (done)

        bswap   ecx                             ; ecx = LKJG
        rol     ecx, 8                          ; ecx = GLKJ (done)

        mov     dword ptr bgr[0*4], eax
        mov     dword ptr bgr[1*4], ebx
        mov     dword ptr bgr[2*4], ecx
17 cycles per 8 pixels for mmx (loop-time excluded) here, drizz
2 cycles per 4 pixels for regular instructions. BUT, if you replace "rgb[]" and "bgr[]" with [esi+], this code takes a whole 16 cycles. But if you replace "rgb" with esi, and "bgr" with edi (esi!=edi), it takes 2 cycles again :) .
(loop time is 6 cycles)