The MASM Forum Archive 2004 to 2012

General Forums => The Laboratory => Topic started by: swsnyder on November 13, 2006, 11:43:39 PM

Title: Fast(er) RGB --> BGR buffer copy?
Post by: swsnyder on November 13, 2006, 11:43:39 PM
Hello.

I have graphics code whose bottleneck is copying a buffer of 24-bit pixels to a buffer of reversed pixels.  That is, the source buffer is RGB and the destination is to be filled with BGR pixels.

My current scheme is to read 32-bits (x,R,G,B), swap and shift the bytes, and write out 32-bits (B,G,R,0).  That trailing 0 byte is garbage and is overwritten in the next loop, or by the next pixel in the same loop.  The source and dest pointers are incremented by 3 to advance past the valid 3 bytes just handled.  I've unrolled this to handle 3 pixels per loop, to avoid back-to-back operations on the same register.  The current code is shown below.  It works, but there's a couple of problems with this code from a performance perspective.

First, there's the mis-alignment.  The destination of the write is only aligned on every 4th write, since the pointers are being advanced by 3 bytes per pixel.

Second, it bothers me that 8 of every 32 bits I write are wasted, since that byte is overwritten by the next pixel to be copied.  Bad enough that the writes are so often mis-aligned, but to throw away that expensive data irks me.

I know that this code will always be run on machines that are SSE-capable, but my attempts to go the SIMD route have all resulted in code that is slower than generic x86 handling.

I continue to think that there must be a way to gather 3 4-byte reads and output 4 3-byte pixels, but I haven't been able to think of a way that is more efficient than writing 3 valid bytes and a single garbage byte.

Any idea on how better to reverse-copy a buffer of 24-bit pixels?

Thanks.



// Copy `len` 24-bit pixels from `s` to `d`, reversing the byte order of
// every pixel (R,G,B -> B,G,R).
//
// s   = address of RGB source buffer      (exactly len*3 bytes are read)
// d   = address of BGR destination buffer (exactly len*3 bytes are written)
// len = number of RGB pixels to copy; a value <= 0 copies nothing
//
// The bulk loop moves each pixel with one 4-byte load and one 4-byte store.
// The 4th byte loaded belongs to the following pixel, and the 4th byte
// stored is a zero that the following pixel's store overwrites.  The bulk
// loop therefore only runs while at least one more pixel follows the group
// of three (remaining count >= 4); the final 1..3 pixels are copied byte by
// byte so that nothing outside either buffer is ever touched.
__forceinline void swap3basm(char *d, char *s, int len)
{
   _asm {
      push  ebx

      mov   ecx, len                   // ecx = pixels still to copy
      mov   esi, s
      mov   edi, d

      cmp   ecx, 4                     // need 3 pixels + 1 pixel of slack
      jl    singtest                   // signed: len is an int

      align 16
  bulkcopy:
      sub  ecx, 3

      mov  eax, dword ptr [esi  ]      // R0 G0 B0 + first byte of next pixel
      mov  ebx, dword ptr [esi+3]
      mov  edx, dword ptr [esi+6]

      bswap eax                        // -> (next) B G R, memory order
      bswap ebx
      bswap edx

      shr  eax, 8                      // -> B G R 0, memory order
      shr  ebx, 8
      shr  edx, 8

      mov  dword ptr [edi  ], eax      // trailing 0 lands on the next pixel
      mov  dword ptr [edi+3], ebx
      mov  dword ptr [edi+6], edx      // ...which always exists (ecx >= 1)

      add  esi, 9
      add  edi, 9

      cmp  ecx, 4
      jge  bulkcopy

  singtest:
      cmp  ecx, 1
      jl   done

  singles:
      mov  al, byte ptr [esi  ]        // byte-sized loads: exactly 3 bytes
      mov  bl, byte ptr [esi+1]
      mov  dl, byte ptr [esi+2]

      mov  byte ptr [edi+2], al
      mov  byte ptr [edi+1], bl
      mov  byte ptr [edi  ], dl

      add  esi, 3
      add  edi, 3

      dec  ecx
      jnz  singles

  done:
      pop  ebx
   }

}


Title: Re: Fast(er) RGB --> BGR buffer copy?
Post by: pro3carp3 on November 14, 2006, 05:52:45 AM
The following code will change RGB in EAX/EBX/ECX to BGR in EBX/ECX/EDX:


; Register-only shuffle of 4 packed 3-byte pixels (12 bytes).
; Notation: each register is written as its four bytes, MOST significant first.
;   in : eax = R0 G0 B0 R1   ebx = G1 B1 R2 G2   ecx = B2 R3 G3 B3
;   out: ebx = B0 G0 R0 B1   ecx = G1 R1 B2 G2   edx = R2 B3 G3 R3
; The shifts only line up under this MSB-first convention. A dword loaded
; straight from little-endian memory has the first memory byte in the LEAST
; significant position, so such data must be bswap'ed before this sequence and
; the results bswap'ed again before being stored.
mov edx, ecx ; edx = B2 R3 G3 B3
bswap edx ; edx = B3 G3 R3 B2
shrd ecx, ebx, 16 ; ecx = R2 G2 B2 R3 (low half of ebx shifted in on top)
bswap ecx ; ecx = R3 B2 G2 R2
shrd edx, ecx, 8 ; edx = R2 B3 G3 R3, done
shl ecx, 8 ; ecx = B2 G2 R2 00
shrd ebx, eax, 8 ; ebx = R1 G1 B1 R2 (low byte of eax shifted in on top)
bswap ebx ; ebx = R2 B1 G1 R1
shrd ecx, ebx, 16 ; ecx = G1 R1 B2 G2, done
bswap eax ; eax = R1 B0 G0 R0
shl ebx, 8 ; ebx = B1 G1 R1 00
shrd ebx, eax, 24 ; ebx = B0 G0 R0 B1, done


I don't know if this is faster; however, it does eliminate the misaligned reads/writes.  It can probably be optimized further, and using MMX or SSE is probably faster than this.
Title: Re: Fast(er) RGB --> BGR buffer copy?
Post by: swsnyder on November 14, 2006, 02:04:55 PM
Hmmmm... I don't see how this works.  Maybe I'm misunderstanding your intent.

This is the start of the input string (source buffer):


   0x00, 0x01, 0x02,  0x10, 0x11, 0x12,  0x20, 0x21, 0x22,  0x30, 0x31, 0x32,


My code above generates this output (start of dest buffer):


   0x02 0x01 0x00  0x12 0x11 0x10  0x22 0x21 0x20  0x32 0x31 0x30


Using your suggested code (see below), I get this output:


   0x20 0x10 0x02  0x01 0x11 0x32  0x00 0x21 0x31  0x30 0x22 0x12


I took your brief comment to mean that EAX:EBX:ECX == RGBRGBRGBRGB and that the output, after swapping, would be EBX:ECX:EDX == BGRBGRBGRBGR.  That apparently is not the case.

Would you mind elaborating?

Thanks.



// Copy `len` 24-bit pixels from `s` to `d`, reversing the byte order of
// every pixel (R,G,B -> B,G,R).  Four pixels (three aligned-size dwords) are
// handled per bulk iteration; the remaining 0..3 pixels are copied singly.
//
// d   = address of BGR destination buffer (len*3 bytes are written)
// s   = address of RGB source buffer      (len*3 bytes are read)
// len = number of RGB pixels to copy; a value <= 0 copies nothing
//
// The 12-instruction shrd/bswap shuffle expects each register to hold its
// four pixel bytes most-significant-byte first.  An x86 dword load puts the
// first memory byte in the LEAST significant position, so the three inputs
// are bswap'ed after loading and the three results are bswap'ed before
// storing.  Without those six bswaps the output is scrambled (the input
// 00 01 02 10 11 12 ... comes out as 20 10 02 01 11 32 ...).
void swap3b2(char *d, char *s, int len)
{

   _asm {
      mov   esi, s
      mov   edi, d

      cmp   len, 4
      jl    singtest                   // signed: len is an int

      align 16

   bulkcopy:
      sub   len, 4

      mov   ecx, dword ptr [esi+8]
      mov   ebx, dword ptr [esi+4]
      mov   eax, dword ptr [esi  ]

      // Convert memory order to the MSB-first order the shuffle needs:
      //   eax = R0 G0 B0 R1   ebx = G1 B1 R2 G2   ecx = B2 R3 G3 B3
      bswap ecx
      bswap ebx
      bswap eax

      mov   edx, ecx
      bswap edx
      shrd  ecx, ebx, 16
      bswap ecx
      shrd  edx, ecx, 8                // edx = R2 B3 G3 R3
      shl   ecx, 8
      shrd  ebx, eax, 8
      bswap ebx
      shrd  ecx, ebx, 16               // ecx = G1 R1 B2 G2
      bswap eax
      shl   ebx, 8
      shrd  ebx, eax, 24               // ebx = B0 G0 R0 B1

      // Back to memory order before storing.
      bswap edx
      bswap ecx
      bswap ebx

      mov   dword ptr [edi+8], edx
      mov   dword ptr [edi+4], ecx
      mov   dword ptr [edi  ], ebx

      add   esi, 12
      add   edi, 12

      cmp   len, 4
      jge   bulkcopy

  singtest:
      cmp   len, 1
      jl    done
      mov   ecx, len                   // ecx = leftover pixels (1..3)

      align 16

  singles:
      mov   ax, word ptr [esi  ]       // al = R, ah = G
      mov   bl, byte ptr [esi+2]       // bl = B
      ror   ax, 8                      // al = G, ah = R
      mov   byte ptr [edi  ], bl       // B
      mov   word ptr [edi+1], ax       // G, R

      add   esi, 3
      add   edi, 3

      dec   ecx
      jnz   singles

  done:
   }
}



Title: Re: Fast(er) RGB --> BGR buffer copy?
Post by: u on November 14, 2006, 04:30:23 PM

; SwapRGB_4pix: reverse the byte order of four packed 3-byte pixels in place.
; In:    esi = address of 12 bytes (three dwords)
; Out:   the 12 bytes at [esi] rewritten; see the before/after pattern below
; Clobb: eax, ebx, ecx, edx, flags
; NOTE(review): ebx is overwritten and not restored here - confirm that no
; caller expects it to be preserved.
; In all register comments the four bytes are listed in memory order, i.e.
; least significant byte first.
SwapRGB_4pix proc
;ABCD EFGH IJKL   (bytes before)
;CBAF EDIH GLKJ   (bytes after)

mov eax,[esi+0] ; eax=ABCD
mov ebx,[esi+4] ; ebx=EFGH
mov ecx,[esi+8] ; ecx=IJKL
     ; byte 0123
bswap eax ; DCBA
; [ABCD->CBAF]
mov edx,eax ; keep a copy: dl=D is needed for the second dword
mov al,bh ; eax=FCBA
ror eax,8 ; eax=CBAF, done
; [EFGH->EDIH]
ror ebx,8 ; ebx=FGHE
mov [esi+0],eax
mov eax,ebx ; keep a copy: ah=G is needed for the third dword
mov bh,cl ; ebx=FIHE
mov bl,dl ; ebx=DIHE
rol ebx,8 ; ebx=EDIH, done
; [IJKL->GLKJ]
bswap ecx ; ecx=LKJI
shl ecx,8 ; ecx=0LKJ
mov cl,ah ; ecx=GLKJ, done


mov [esi+4],ebx
mov [esi+8],ecx


ret
SwapRGB_4pix endp


; Demo driver: runs SwapRGB_4pix once on a 12-character string and prints it.
; T() and prints are macros that are not defined in this listing; presumably
; T() yields the address of the string and prints displays the string at the
; given address - confirm against the macro library in use.
main proc
mov esi,T("ABCDEFGHIJKL") ; esi = buffer argument expected by SwapRGB_4pix
call SwapRGB_4pix
prints esi ; this outputs "CBAFEDIHGLKJ"
ret
main endp


Takes.... 3 cycles to convert 4 pixels on my Sempron3000+ . :)  (add 4 cycles for the looping)
Title: Re: Fast(er) RGB --> BGR buffer copy?
Post by: pro3carp3 on November 14, 2006, 05:48:22 PM
swsnyder,

I can't check the code now, while at work.  It produced the correct output last night at home (unless my perceptions were altered by my fatigue at that late hour).  In any case, Ultrano's optimized version is much better than mine.
Title: Re: Fast(er) RGB --> BGR buffer copy?
Post by: swsnyder on November 14, 2006, 06:36:33 PM
Thanks, Ultrano!
Title: Re: Fast(er) RGB --> BGR buffer copy?
Post by: stanhebben on November 14, 2006, 08:43:02 PM
I tried making an SSE2 variant on this one, but it's way slower than general-purpose code. (currently 3 cycles per pixel)

EDIT: Now that I timed ultrano's code, my sse code seems to be faster though. So, forget what I said.

Ultrana: your code needs about 3 cycles per pixel, at least in my benchmark. Not per 4 pixels. (originally, it was 4 cycles per pixel, but adding prefetch code improved its speed)

As soon as I finish my sse code, I'll post it.
Title: Re: Fast(er) RGB --> BGR buffer copy?
Post by: u on November 14, 2006, 09:02:30 PM
on what cpu? I presume a P4.  :lol

This is the benchmark I did:

; START_TEST / END_TEST bracket a block of code, run it TEST_ITERS times and
; report the average number of time-stamp-counter ticks per iteration.
;   START_TEST      - pushes the low 32 bits of the TSC, loads ecx with the
;                     iteration count and emits a unique, 16-byte aligned
;                     loop label.
;   END_TEST where  - closes the loop, computes (elapsed ticks / TEST_ITERS)
;                     and stores it in <where> if it is lower than the value
;                     already there.
; Requirements visible from the macro bodies: the bracketed code must leave
; ecx (the loop counter) and the stack pointer as it found them; eax, ecx and
; edx are overwritten by the macros. Only the low 32 bits of the TSC are used.
; The measured time includes the dec/jnz loop overhead.
;=====[[ Benchmarking macros >>===\
TEST_ITERS equ 10000000
TEST_ID = 0

START_TEST macro



TEST_ID = TEST_ID + 1 ; new id, so every expansion gets its own label
rdtsc
push eax ; remember the start time (low dword) on the stack
mov ecx,TEST_ITERS
align 16
@CatStr(<testlabl>,%TEST_ID,<:>)
endm
END_TEST macro where
dec ecx
jnz @CatStr(<testlabl>,%TEST_ID) ; loop back to the label of the matching START_TEST
rdtsc
pop edx ; edx = start time pushed by START_TEST
sub eax,edx ; eax = elapsed ticks (low 32 bits)
xor edx,edx ; zero the high half of the dividend for div
mov ecx,TEST_ITERS
div ecx ; eax = ticks per iteration
.if eax<where
mov where,eax ; keep the smaller value
.endif

endm
;=======/




; Benchmark build of SwapRGB_4pix: the 12-byte in-place swap at [esi] is
; wrapped in START_TEST / END_TEST, so it is executed TEST_ITERS times on the
; same 12 bytes and the per-iteration tick count is stored in time1 and
; handed to the print macro (not defined in this listing).
; In:    esi = address of 12 bytes (three dwords)
; Clobb: eax, ebx, ecx, edx, flags
; Register comments list bytes in memory order (least significant byte first).
SwapRGB_4pix proc
local time1
mov time1,-1 ; start value for END_TEST's "store if lower" test

START_TEST
push ecx ; ecx is the benchmark loop counter; the swap below overwrites it
;comment %    <---------- when I uncomment this, it's 9 cycles. Otherwise, 6 cycles
;ABCD EFGH IJKL   (bytes before)
;CBAF EDIH GLKJ   (bytes after)

mov eax,[esi+0] ; eax=ABCD
mov ebx,[esi+4] ; ebx=EFGH
mov ecx,[esi+8] ; ecx=IJKL
     ; byte 0123
bswap eax ; DCBA
; [ABCD->CBAF]
mov edx,eax ; keep a copy: dl=D is needed for the second dword
mov al,bh ; eax=FCBA
ror eax,8 ; eax=CBAF, done
; [EFGH->EDIH]
ror ebx,8 ; ebx=FGHE
mov [esi+0],eax
mov eax,ebx ; keep a copy: ah=G is needed for the third dword
mov bh,cl ; ebx=FIHE
mov bl,dl ; ebx=DIHE
rol ebx,8 ; ebx=EDIH, done
; [IJKL->GLKJ]
bswap ecx ; ecx=LKJI
shl ecx,8 ; ecx=0LKJ
mov cl,ah ; ecx=GLKJ, done


mov [esi+4],ebx
mov [esi+8],ecx
;%
pop ecx ; restore the loop counter for END_TEST
END_TEST time1
print time1


ret
SwapRGB_4pix endp


; Benchmark driver: passes a 12-character string to the benchmarking
; SwapRGB_4pix and prints the buffer afterwards.
; T() and prints are macros not defined in this listing - presumably T()
; yields the string's address and prints displays it; confirm against the
; macro library in use.
; NOTE(review): the swap is applied in place once per benchmark iteration and
; swapping twice restores the original bytes, so the printed text depends on
; whether the iteration count is odd or even.
main proc
mov esi,T("ABCDEFGHIJKL") ; esi = buffer argument expected by SwapRGB_4pix
call SwapRGB_4pix
prints esi







ret
main endp


Thus, as you see... it's doing work on only 12 bytes, not on a whole megabyte or so. Thus, your code is what he'll need :)

Of all things, "Ultrana"...  :red ... it's the female form of names in my language  :red
Title: Re: Fast(er) RGB --> BGR buffer copy?
Post by: drizz on November 14, 2006, 09:33:56 PM
regular and mmx (Pentium MMX or better) version,...i didn't benchmark it.

;plain mmx version
; Converts 24 bytes (8 packed 3-byte pixels) from rgb[] to bgr[], exchanging
; the first and third byte of every pixel. rgb and bgr are buffers declared
; outside this fragment.
; Notation: q0/q1/q2 are the three source qwords; "byte k" is the k-th byte of
; a qword in memory order (byte 0 = lowest address). Because 8 is not a
; multiple of 3, one byte has to cross each qword boundary in each direction.
; NOTE(review): this fragment never executes emms; MMX code must issue emms
; before any later x87 floating-point instruction - confirm the surrounding
; code does so.
.data
align 16
msk1 dq 000FF0000FF0000FFh ; keeps bytes 0,3,6
msk2 dq 0FF0000FF0000FF00h ; keeps bytes 1,4,7
msk3 dq 00000FF0000FF0000h ; keeps bytes 2,5
.code
.mmx
movq mm0,qword ptr rgb[0*8] ; mm0 = q0
movq mm1,qword ptr rgb[1*8] ; mm1 = q1
movq mm2,qword ptr rgb[2*8] ; mm2 = q2
movq mm7,msk1 ; msk1 is used most often, so keep it in a register
; ---- output qword 0 (built in mm3) ----
movq mm3,mm0
movq mm4,mm0
movq mm5,mm0
movq mm6,mm1
pand mm0,mm7;msk1 - q0 bytes 0,3,6
pand mm3,mm7;msk1 - q0 bytes 0,3,6
pand mm4,msk2 ; q0 bytes 1,4,7 (middle bytes stay where they are)
pand mm5,msk3 ; q0 bytes 2,5
pand mm6,mm7;msk1 - q1 bytes 0,3,6
psrlq mm0,64-16 ; q0 byte 6 -> byte 0: carried into output qword 1
psllq mm3,16 ; q0 bytes 0,3 -> 2,5 (byte 6 is shifted out)
psrlq mm5,16 ; q0 bytes 2,5 -> 0,3
psllq mm6,64-16 ; q1 byte 0 -> byte 6 (the rest is shifted out)
por mm3,mm4
por mm3,mm5
por mm3,mm6 ; mm3 = output qword 0
; ---- output qword 1 (built in mm1) ----
movq mm4,mm1
movq mm5,mm1
movq mm6,mm2
pand mm1,mm7;msk1 - q1 bytes 0,3,6
pand mm4,msk2 ; q1 bytes 1,4,7
psrlq mm1,16 ; q1 bytes 3,6 -> 1,4 (byte 0 is shifted out)
pand mm5,msk3 ; q1 bytes 2,5 (stay where they are)
por mm1,mm0 ; add q0 byte 6 at byte 0
pand mm6,msk2 ; q2 bytes 1,4,7
movq mm0,mm4 ; copy of q1 bytes 1,4,7 for the carry below
psllq mm4,16 ; q1 bytes 1,4 -> 3,6 (byte 7 is shifted out)
psllq mm6,64-16 ; q2 byte 1 -> byte 7 (the rest is shifted out)
psrlq mm0,64-16 ; q1 byte 7 -> byte 1: carried into output qword 2
por mm1,mm4
por mm1,mm5
por mm1,mm6 ; mm1 = output qword 1
; ---- output qword 2 (built in mm2) ----
movq mm5,mm2
movq mm6,mm2
pand mm2,mm7;msk1 - q2 bytes 0,3,6 (stay where they are)
pand mm5,msk2 ; q2 bytes 1,4,7
pand mm6,msk3 ; q2 bytes 2,5
psrlq mm5,16 ; q2 bytes 4,7 -> 2,5 (byte 1 is shifted out)
psllq mm6,16 ; q2 bytes 2,5 -> 4,7
por mm2,mm0 ; add q1 byte 7 at byte 1
por mm2,mm5
por mm2,mm6 ; mm2 = output qword 2
movq qword ptr bgr[0*8],mm3;BGRBGRBGRBGR
movq qword ptr bgr[1*8],mm1
movq qword ptr bgr[2*8],mm2

;; reg 32
; Converts 12 bytes (4 packed 3-byte pixels) from rgb[] to bgr[] using only
; general-purpose registers. rgb and bgr are buffers declared outside this
; fragment. Register comments list the four bytes in memory order (least
; significant byte first); Rn/Gn/Bn are the bytes of source pixel n.
; Clobbers eax, ebx, ecx.
mov eax,dword ptr rgb[0*4];RGBRGBRGBRGB  eax = R0 G0 B0 R1
mov ebx,dword ptr rgb[1*4] ; ebx = G1 B1 R2 G2
mov ecx,dword ptr rgb[2*4] ; ecx = B2 R3 G3 B3
bswap eax ; eax = R1 B0 G0 R0
xchg al,bh ; eax = B1 B0 G0 R0, ebx = G1 R1 R2 G2
bswap ebx ; ebx = G2 R2 R1 G1
ror eax,8 ; eax = B0 G0 R0 B1, done
xchg bh,cl ; ebx = G2 B2 R1 G1, ecx = R2 R3 G3 B3
bswap ebx ; ebx = G1 R1 B2 G2, done
bswap ecx ; ecx = B3 G3 R3 R2
rol ecx,8 ; ecx = R2 B3 G3 R3, done
mov dword ptr bgr[0*4],eax;BGRBGRBGRBGR
mov dword ptr bgr[1*4],ebx
mov dword ptr bgr[2*4],ecx
Title: Re: Fast(er) RGB --> BGR buffer copy?
Post by: u on November 14, 2006, 09:53:04 PM
17 cycles per 8 pixels for mmx (loop-time excluded) here, drizz
2 cycles per 4 pixels for regular instructions. BUT, if you replace "rgb[]" and "bgr[]" with [esi+], this code takes a whole 16 cycles. But if you replace "rgb" with esi, and bgr with edi  (esi!=edi), it takes 2 cycles again :) .
(loop time is 6 cycles)