Here is my putpixel function. Does anyone have an idea how to optimize it?
; PosCount - map a pixel coordinate to a linear pixel index, with clipping.
;   x, y        pixel coordinates
;   maxX, maxY  surface width and height in pixels (expected below 80000000h)
; Returns eax = y*maxX + x, or -1 when (x,y) lies outside
; [0,maxX) x [0,maxY).
; Clobbers ecx and the flags.
PosCount proc x:dword,y:dword,maxX:dword,maxY:dword
    mov ecx,x
    mov eax,y
    ; Unsigned compares also reject negative coordinates: a negative value
    ; read as unsigned is >= 80000000h, above any sane limit, so no separate
    ; "< 0" tests are needed.
    cmp eax,maxY
    jae brs
    cmp ecx,maxX
    jae brs
    ; The row stride is taken from maxX rather than a fixed shl 10, so the
    ; index is also correct for widths other than 1024.
    imul eax,maxX               ; eax = y * maxX
    add eax,ecx                 ; eax = y * maxX + x
    ret
brs:
    mov eax,-1                  ; out-of-range marker
    ret
PosCount endp
; PixelPut - write the three low bytes of color to pixel (x,y) of the
; 1024x768, 4-bytes-per-pixel buffer addressed by main_screen.
;   x, y    pixel coordinates; anything PosCount rejects is silently skipped
;   color   low byte -> pixel byte 2, second byte -> pixel byte 1,
;           third byte -> pixel byte 0; pixel byte 3 is left untouched
; Clobbers eax, ecx, edx and the flags. No callee-saved register is used,
; so nothing has to be pushed or popped.
PixelPut proc x:dword,y:dword,color:dword
    invoke PosCount,x,y,1024,768
    cmp eax,-1                  ; -1 = coordinate outside the surface
    je @F
    ; Only touch the frame buffer once the coordinate is known to be valid.
    mov ecx,main_screen
    mov edx,color
    lea ecx,[ecx+eax*4]         ; ecx -> first byte of the target pixel
    mov [ecx+2],dl              ; r
    mov [ecx+1],dh              ; g
    shr edx,16
    mov [ecx],dl                ; b
@@:
    ret
PixelPut endp
I prefer to do it as a macro: putpixel is often going to be called in a loop, and most CPUs have little to no branch prediction for "call".
Here are my macros; one of them also does blending and one outputs only a single color channel.
This works for hi-res floor mapping/sky mapping at 1280x1024x32-bit with DirectDraw, which is why I fetch the screen address from ddsd.lpSurface; plotadot is just a temporary storage dword.
This is also designed for simple use: change ecx and edx in a loop to plot a series of pixels.
;usage ecx= screenx,edx=screeny,ARGB=EAX
; plot - store a 32-bit pixel on a 1280x1024 surface, clipped to the screen.
; In:       ecx = screen x, edx = screen y, eax = pixel value
; Effect:   writes eax at lpSurface + (y*1280 + x)*4 when
;           0 <= x < 1280 and 0 <= y < 1024; otherwise does nothing
; Clobbers: ebx (holds the pixel address after a successful plot), flags
; Note:     rows are taken to be exactly 1280*4 bytes apart.
plot MACRO
    ;; .IF on plain registers compares unsigned, so a negative coordinate
    ;; fails the upper-bound test; column 0 and row 0 remain plottable.
    .IF ecx < 1280 && edx < 1024
        lea ebx,[edx+edx*4]         ; ebx = y*5
        shl ebx,8                   ; ebx = y*1280
        add ebx,ecx                 ; ebx = y*1280 + x
        shl ebx,2                   ; 4 bytes per pixel
        add ebx,[ddsd.lpSurface]
        mov [ebx],eax
    .ENDIF
ENDM
; plot2 - add eax to the 32-bit pixel already on a 1280x1024 surface.
; In:       ecx = screen x, edx = screen y, eax = value to add
; Effect:   when 0 <= x < 1280 and 0 <= y < 1024 the pixel at
;           lpSurface + (y*1280 + x)*4 becomes old pixel + eax
;           (one plain 32-bit add, no per-channel saturation)
; Out:      eax = the sum that was stored (unchanged when clipped)
; Clobbers: ebx (pixel address after a successful plot), flags
; Note:     rows are taken to be exactly 1280*4 bytes apart.
plot2 MACRO
    ;; Unsigned compares: negatives are rejected by the upper bound, and
    ;; column 0 / row 0 are valid.
    .IF ecx < 1280 && edx < 1024
        lea ebx,[edx+edx*4]         ; ebx = y*5
        shl ebx,8                   ; ebx = y*1280
        add ebx,ecx                 ; ebx = y*1280 + x
        shl ebx,2                   ; 4 bytes per pixel
        add ebx,[ddsd.lpSurface]
        add eax,[ebx]               ; blend: new = old + eax
        mov [ebx],eax
    .ENDIF
ENDM
; plotb - write only one byte (al) of a pixel on a 1280x1024 surface.
; In:       ecx = screen x, edx = screen y, al = byte value
; Effect:   when 0 <= x < 1280 and 0 <= y < 1024, stores al into the
;           first byte of the pixel at lpSurface + (y*1280 + x)*4;
;           the other three bytes of the pixel are left as they were
; Clobbers: ebx (pixel address after a successful plot), flags
; Note:     rows are taken to be exactly 1280*4 bytes apart.
plotb MACRO
    ;; Unsigned compares: negatives are rejected by the upper bound, and
    ;; column 0 / row 0 are valid.
    .IF ecx < 1280 && edx < 1024
        lea ebx,[edx+edx*4]         ; ebx = y*5
        shl ebx,8                   ; ebx = y*1280
        add ebx,ecx                 ; ebx = y*1280 + x
        shl ebx,2                   ; 4 bytes per pixel
        add ebx,[ddsd.lpSurface]
        ;add ebx,2                  ; (disabled) would select byte 2 instead
        mov [ebx],al
    .ENDIF
ENDM
; plot3 - divide both coordinates by ebx (signed), then plot the result.
; In:       ecx = x, edx = y, ebx = divisor (must be non-zero),
;           eax = pixel value
; Out:      X1 = x / ebx, Y1 = y / ebx (also left in ecx / edx);
;           eax still holds the pixel value
; Clobbers: flags, plus whatever the plot macro clobbers
plot3 MACRO
    push eax                    ; keep the pixel value across the divides
    mov eax,edx
    ;sub eax,512
    cdq                         ; sign-extend: idiv takes a signed edx:eax,
                                ; zeroing edx breaks negative coordinates
    idiv ebx
    ;add eax,512
    mov Y1,eax
    mov eax,ecx
    ;sub eax,640
    cdq
    idiv ebx                    ; ebx is not modified by idiv, no reload needed
    ;add eax,640
    mov X1,eax
    mov ecx,eax                 ; ecx = X1
    mov edx,Y1
    pop eax
    plot
ENDM
daydreamer :| use the unsigned hack to merge the [0;1280) comparison.
Here's my version, that takes care of structured clipping:
; sdSetPixel - store one 32-bit pixel after applying the draw offset and
; clipping against SDBound.
;   x, y      position before SDDrawOffs is added
;   dwColor   value written to the target
; eax, ecx and edx are all preserved for the caller (USES list), so the
; procedure returns nothing.
sdSetPixel proc PUBLIC uses eax ecx edx x,y,dwColor
    ; translate into target coordinates
    mov edx,y
    mov ecx,x
    add edx,SDDrawOffs.y
    add ecx,SDDrawOffs.x

    ; signed clip: left <= x < right and top <= y < bottom
    cmp ecx,SDBound.left
    jl clipped
    cmp ecx,SDBound.right
    jge clipped
    cmp edx,SDBound.top
    jl clipped
    cmp edx,SDBound.bottom
    jge clipped

    ; destination = bits + (y*wid + x)*4
    imul edx,sTarget_Data.wid
    add edx,ecx
    mov eax,sTarget_Data.bits
    mov ecx,dwColor
    mov [eax+edx*4],ecx
clipped:
    ret
sdSetPixel endp
Certainly there's some room for improvement.
The "unsigned hack" applied to farabi's code as well:
; PosCount - bounds-check (x,y) and turn it into a linear pixel index.
;   x, y        pixel coordinates
;   maxX, maxY  surface width and height in pixels
; Returns eax = y*maxX + x, or -1 if x >= maxX or y >= maxY (unsigned, so
; negative coordinates are rejected as well).
; Clobbers ecx and the flags.
PosCount proc x:dword,y:dword,maxX:dword,maxY:dword
    mov ecx,x
    mov eax,y
    cmp ecx,maxX                ; one unsigned test covers x < 0 and x >= maxX
    jae brs
    cmp eax,maxY                ; likewise for y
    jae brs
    ; Use the real width as the stride; a fixed shl 10 is only right when
    ; maxX happens to be 1024.
    imul eax,maxX
    add eax,ecx
    ret
brs:
    mov eax,-1
    ret
PosCount endp
; PixelPut - store a 32-bit color at (x,y) of the 1024x768 buffer addressed
; by main_screen. Coordinates rejected by PosCount are ignored.
; Clobbers eax, ecx, edx and the flags.
PixelPut proc x:dword,y:dword,color:dword
    invoke PosCount,x,y,1024,768
    test eax,eax
    js @F                       ; negative index: PosCount rejected the point
    mov ecx,main_screen
    mov edx,color
    mov [ecx+eax*4],edx         ; 4 bytes per pixel
@@:
    ret
PixelPut endp
Quote from: Ultrano on March 22, 2008, 08:51:21 PM
daydreamer :| use the unsigned hack to merge the [0;1280) comparison.
Here's my version, that takes care of structured clipping:
; sdSetPixel - offset (x,y) by SDDrawOffs, clip against SDBound using
; signed compares, and write dwColor as a 32-bit pixel.
; eax, ecx and edx are saved and restored through the USES list.
sdSetPixel proc PUBLIC uses eax ecx edx x,y,dwColor
    mov eax,x
    mov edx,y
    add eax,SDDrawOffs.x        ; eax = target x
    add edx,SDDrawOffs.y        ; edx = target y

    ; reject anything outside [top,bottom) x [left,right)
    cmp edx,SDBound.top
    jl @F
    cmp edx,SDBound.bottom
    jge @F
    cmp eax,SDBound.left
    jl @F
    cmp eax,SDBound.right
    jge @F

    imul edx,sTarget_Data.wid   ; edx = y * row width in pixels
    mov ecx,sTarget_Data.bits
    add edx,eax                 ; edx = linear pixel index
    mov eax,dwColor
    mov [ecx+edx*4],eax
@@:
    ret
sdSetPixel endp
Certainly there's some room for improvement.
Clipping should be handled with MMX/SSE2: PAND the results of all the comparisons together for a single final conditional branch. You could even compare against a list of rectangles this way, and lots of rectangles can shape anything. I wonder whether you should also unroll the pixel address calculation with pmuld instead of imul?
I think farabi's way of having PosCount and PixelPut in separate procs has more potential: make an initial call to PosCount, then put code here to set up this kind of macro:
; plotdelta - store eax at the address held in pixeladress, then advance
; pixeladress by pixeldelta so the next expansion writes the next pixel.
; No clipping is done here. Uses ebx as scratch; on exit ebx holds the
; updated address. The add also changes the flags.
plotdelta MACRO
; ebx = current destination address
mov ebx,pixeladress
; write the 32-bit value in eax
mov [ebx],eax
; step to the next pixel and remember the new address
add ebx,pixeldelta
mov pixeladress,ebx
ENDM
where pixeldelta is -4 or +4, combined with lPitch or -lPitch (DirectDraw's way of telling how many bytes there are between scanlines).
Can anyone come up with a fast way to shift for 1680-wide screens? I am clueless.
About 1680,
it requires mixing the results of 4 shifts in all cases, it's simply inconvenient. An imul will be better (3 cycles).
What about using a lookup table? Then you can avoid the shifts or the imul, especially for 1680.
; in:  ebx = y (row index into lookupTable, one dword per row)
;      edx = x
; out: edi = lookupTable[y] + x
; NOTE(review): x is added unscaled, so for 32-bit pixels edx presumably
; has to be a byte offset already - confirm against the caller.
mov edi,edx
add edi,lookupTable[ebx*4]
That can take care of your DDraw surface pitch and do the actual *Y calculation beforehand.
the bad side of a LUT is that it's L2-cache stalled or *gasp* RAM-stalled. Meanwhile an imul always takes 3 cycles or less.
So, in some designs a LUT is better (randomly plotting points around), but in most cases it's not recommended.
Calling a proc to compute the address - blah! Don't let the C++ compiler beat us so easily, please! Just make several procs like DDraw::LockRect. Or macros, even better.
Quote from: Ultrano on March 25, 2008, 07:39:29 PM
the bad side of a LUT is that it's L2-cache stalled or *gasp* RAM-stalled. Meanwhile an imul always takes 3 cycles or less.
So, in some designs a LUT is better (randomly plotting points around), but in most cases it's not recommended.
Calling a proc to compute the address - blah! Don't let the C++ compiler beat us so easily, please! Just make several procs like DDraw::LockRect. Or macros, even better.
I avoid LUTs except for storing the results of slow fsin's. They are especially unattractive when you need to plot tons of pixels: you probably already make heavy use of the cache for textures and don't want to read in LUTs that eat bandwidth.
I have some SSE code that makes use of an fsincos LUT, but the parallel 3D rotation code makes heavy use of only 4 entries in the LUT.
And now that I use MMX anyway for saturation, integer packed muls take even less time, but isn't there a penalty for bringing the result over to the general registers?
Otherwise, unrolling a pixel plotter to plot several pixels at once with different Y's could be an option to speed things up.
For a sprite renderer, it is simple:
; Advance the destination pointer in ebx by lPitchminusspritewidth.
; NOTE(review): presumably this is lPitch minus the bytes already written
; for one sprite row, moving ebx to the start of the next row - confirm.
ADD ebx,lPitchminusspritewidth