News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Slow Graphics vs. Bad Programming

Started by redskull, May 05, 2006, 10:05:16 PM

Previous topic - Next topic

redskull

     I was writing a 'Scrolling Landscape' program, which is nothing more than a bunch of diagonal lines that simulate a mountainside which scroll left and right.  It works, but it's wicked slow, even at max speed. I know Windows GDI has a reputation of not being the fastest graphics system out there, but i figured it would of been more than adaqute for this simple task?  Basically, the larger the window is, the slower the 'landscape' scrolls by.  There's a lot of small optimizations I haven't done to the code, but this seems like one of those times that better code is faster than optimized slow code.
     Basically, I've got the points stored in an array, and the current left hand edge is stored in 'scrollpoint'.  The 'DrawPoints' function just subtracts the scrollpoint from the x coordinate to readjust the whole thing to the client area, and the scrollpoint is increased once a cycle.  I guess what i'm asking is if there's anything to significantly speed this up, short of switching DirectX.  Any redesign comments are appreciated.  BTW, i run this on a 533mhz P3 (win2k).

.386
.model flat, stdcall
option casemap:none

include \masm32\include\windows.inc
include \masm32\include\kernel32.inc
include \masm32\include\user32.inc
include \masm32\include\gdi32.inc
includelib \masm32\lib\kernel32.lib
includelib \masm32\lib\user32.lib
includelib \masm32\lib\gdi32.lib

WinMain proto :DWORD,:DWORD,:DWORD,:DWORD
DrawPoints proto :DWORD
ScrollMain proto

.data
ClassName db "ScrollClass",0
AppName db "Scolling Demo",0
scrollpoint DWORD 0
direction DWORD 0
Terrain POINT <0,0>,<100,200>,<200,320>,<400,350>,<500,300>,\
                       <640,100>,<740,300>,<940,350>,<1040,300>,\
                       <1140,270>,<1280,250>

.data?
hInstance HINSTANCE ?  ; instance handle
hwnd HWND ?
hdcmem HDC ?
cxClient DWORD ?
cyClient DWORD ?

.const

.code
start:
  invoke GetModuleHandle, NULL   ; retrieve the instance handle
   mov hInstance, eax             ; store it in hInstance
  invoke WinMain, hInstance, NULL, NULL, SW_SHOWDEFAULT  ; Call the main program
  invoke ExitProcess, eax        ; exit the program

WinMain proc hInst:HINSTANCE,hPrevInst:HINSTANCE,CmdLine:LPSTR,CmdShow:DWORD
LOCAL wc:WNDCLASSEX                    ; create a variable for our window class parameters
LOCAL msg:MSG
mov wc.cbSize, SIZEOF WNDCLASSEX       ; size of the class
mov wc.style, CS_HREDRAW or CS_VREDRAW ; style
mov wc.lpfnWndProc, OFFSET WndProc     ; location of the window procedure
mov wc.cbClsExtra, NULL                  ; empty 'extra values'
mov wc.cbWndExtra, NULL               
push  hInstance                       
pop   wc.hInstance                     ; our instance handle
mov   wc.hbrBackground,COLOR_WINDOW+1;eax  ; background color
mov   wc.lpszMenuName,NULL             ; menu name
mov   wc.lpszClassName,OFFSET ClassName; name of the class
invoke LoadIcon,NULL,IDI_APPLICATION
mov   wc.hIcon,eax
mov   wc.hIconSm,eax
invoke LoadCursor,NULL,IDC_ARROW
mov   wc.hCursor,eax

invoke RegisterClassEx, addr wc
invoke CreateWindowEx,NULL,\
                ADDR ClassName,\
                ADDR AppName,\
                WS_OVERLAPPEDWINDOW,\
                CW_USEDEFAULT,\
                CW_USEDEFAULT,\
                800,\
                400,\
                NULL,\
                NULL,\
                hInst,\
                NULL
    mov   hwnd,eax

    invoke ShowWindow, hwnd,CmdShow               ; display our window on desktop
    invoke UpdateWindow, hwnd                                 ; refresh the client area

   .WHILE TRUE                                       ; Enter an endless loop
    invoke PeekMessage, ADDR msg,NULL,0,0,PM_REMOVE  ; Check to see if there's a message
    .IF eax != 0                                     ; if there is a message...   
     .BREAK .IF (msg.message == WM_QUIT)
     invoke TranslateMessage, ADDR msg               ;  anything else, translate it...
     invoke DispatchMessage, ADDR msg                ;  dispatch it
    .ENDIF                                           ; end the IF block
     invoke ScrollMain
   .ENDW                                             ; repeat the loop
    mov     eax,msg.wParam                           ; return exit code in eax
    ret
WinMain endp

WndProc proc hWnd:HWND, uMsg:UINT, wParam:WPARAM, lParam:LPARAM
LOCAL ps:PAINTSTRUCT
LOCAL rect:RECT
LOCAL hdc:HDC
LOCAL hBM:HGDIOBJ

.IF uMsg==WM_CREATE
.ELSEIF uMsg==WM_PAINT
.ELSEIF uMsg==WM_DESTROY                         
  invoke DeleteDC, hdcmem
  invoke PostQuitMessage,NULL             
.ELSEIF uMsg==WM_SIZE
mov eax, lParam    ; move the screen size into eax
xor ebx, ebx       ; zero out ebx
mov bx, ax         ; move the X-dimension to ebx
mov cxClient, ebx  ; save the X-dim to cxClient
SHR eax, 16        ; shift the Y-dim down to ax
mov cyClient, eax  ; save the Y-dim to cyClient

invoke DeleteDC, hdcmem                                ; Delete the old memory DC
invoke GetDC, hwnd                                     ; Get the screen DC
  mov hdc, eax                                          ; save the handle in hdc
invoke CreateCompatibleDC, hdc                         ; Create a new memory DC
  mov hdcmem, eax                                       ; Save the handle in hdcmem
invoke CreateCompatibleBitmap, hdc, cxClient, cyClient ; create a compatible bitmap
  mov hBM, eax                                          ; save it in hBM
invoke SelectObject, hdcmem, hBM                       ; set the memory DC
invoke DeleteObject, hBM                               ; Delete the old object
invoke ReleaseDC, hwnd, hdc                            ; release the screen DC
invoke Rectangle, hdcmem, 0,0,cxClient,cyClient        ; clear the back buffer
.ELSE
invoke DefWindowProc,hWnd,uMsg,wParam,lParam     ; Default message processing
ret
.ENDIF
xor eax,eax
ret
WndProc endp
;>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
DrawPoints PROC startcol:DWORD
;Draws the terains from a passed screen coordinate to the end of the screen
mov eax, startcol               ; Move the left hand edge of screen into EAX
mov ebx, 0                      ; zero out EBX
.WHILE Terrain[ebx].x <= eax     ; scroll through the terrain array until we
add ebx, 8                     ; find the first point to plot
.ENDW

.IF ebx != 0                    ; if any other point except the first one, step
sub ebx, 8                     ; back one point (to paint from left-hand edge to
.ENDIF                          ; first point

mov ecx, Terrain[ebx].x
sub ecx, scrollpoint
invoke MoveToEx, hdcmem, ecx, Terrain[ebx].y, NULL ; move to the first point
mov eax, startcol               ; determine the right hand edge of the screen by adding
add eax, cxClient               ; the left hand edge plus the size of the screen

.REPEAT

mov ecx, Terrain[ebx].x        ; move the landscape point to a temp register
sub ecx, scrollpoint           ; subtract the offset to position the point in the window
PUSH eax                       ; save the right hand edge
invoke LineTo, hdcmem, ecx, Terrain[ebx].y ; draw a line to it
POP eax                        ; restore the right hand edge
add ebx, 8                     ; move to the next point

.UNTIL Terrain[ebx].x > eax     ; keep doing it until we've draw a whole screens worth

mov ecx, Terrain[ebx].x        ; move the landscape point to a temp register
sub ecx, scrollpoint           ; subtract the offset to position the point in the window
PUSH eax                       ; save the right hand edge
invoke LineTo, hdcmem, ecx, Terrain[ebx].y ; draw a line to it
POP eax                        ; restore the right hand edge
add ebx, 8                     ; move to the next point
ret
DrawPoints endp
;>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
ScrollMain PROC
LOCAL ps:PAINTSTRUCT
LOCAL hdc:HDC
LOCAL rect:RECT

invoke Rectangle, hdcmem,0,0,cxClient,cyClient  ; erase the back buffer

mov eax, 1280       ; determine the point at which to switch right scrolling
sub eax, cxClient   ; to left scrolling (1280 is the furthest point)

.IF scrollpoint == eax          ; switch directions back and forth, if necessary
mov direction, 1
.ENDIF
.IF scrollpoint == 0
mov direction, 0
.ENDIF

.IF direction == 0              ; scroll the appropriate direction
inc scrollpoint
.ENDIF
.IF direction != 0
dec scrollpoint
.ENDIF

invoke DrawPoints, scrollpoint  ; draw the points on the back buffer

invoke GetDC,hwnd               ; copy back buffer to screen
mov hdc, eax   
invoke BitBlt, hdc, 0,0, cxClient, cyClient, hdcmem, 0,0,SRCCOPY
invoke ReleaseDC, hwnd, hdc

ret
ScrollMain ENDP
end start
Strange women, lying in ponds, distributing swords, is no basis for a system of government

PBrennick

redskull,
That looks like a good start, to me.  Now, you just need to add cross-hairs and a firebutton.  Then, you can maybe have your favorite polititian pop up from behind a dune and you blow-him-away!

Paul
The GeneSys Project is available from:
The Repository or My crappy website

Ian_B

I'd say this was a good example of the possibilities of using multi-threading. Instead of doing some processing, PEEKing and dispatching messages, then processing again, it's gotta be more efficient to set up a thread just to do the ScrollMain display loop and let the WndProc handle the message loop. Just have a "RequestedToQuit" global flag that the thread checks before it loops. If nothing else, it'll prevent that annoying lag that most apps have when you try to move them around the screen or select a menu when they are "busy".

If it was me, I'd inline the DrawPoints proc straight into ScrollMain since it's only called once. That means you don't have any proc calling overhead. Reserve your EBX/EDI/ESI registers once when you set the thread up (and return them once when you leave) and you can save yourself a lot of push/pop, parameter passing and saving values through the API calls by keeping them in those registers, where currently you're only allowing yourself EBX. Every little may help. If it's still too slow when you've squeezed out all the memory accesses you can and there's no more interruption from handling messages, then sure, find a better draw API. But there's a way to go yet to speed this up. Have fun.  :U

Ian_B

Ian_B

#3
Just remembered. You might want to look at a better way of doing this:

mov ebx, 0                      ; zero out EBX
.WHILE Terrain[ebx].x <= eax     ; scroll through the terrain array until we
add ebx, 8                     ; find the first point to plot
.ENDW

.IF ebx != 0                    ; if any other point except the first one, step
sub ebx, 8                     ; back one point (to paint from left-hand edge to
.ENDIF                          ; first point


I realise the high-level constructs are "neater" for showing code logic, but it's making this small bit of code very inefficient with an extra check whether it's needed or not. Consider just jumping straight forward (and not adding 8) after doing an initial test on Terrain[0].x, THEN doing the while loop by testing Terrain[ebx+8].x which saves a conditional jump and extra test/subtract code, since EBX is already correct then. And XOR EBX, EBX is smaller code than MOV EBX, 0 which never does any harm.

Similar thoughts apply to this, where you set a value to zero, then immediately test for whether it is zero. The "neat" IF/ENDIF blocks could be made more complex but more efficient so that the first pair do the appropriate code after the second tests and jump over further tests.

.IF scrollpoint == eax          ; switch directions back and forth, if necessary
mov direction, 1
.ENDIF
.IF scrollpoint == 0
mov direction, 0
.ENDIF

.IF direction == 0              ; scroll the appropriate direction
inc scrollpoint
.ENDIF
.IF direction != 0
dec scrollpoint
.ENDIF


I'm also not sure how the assembler codes the constructs, but the second pair of tests might be better written as an IF/ELSE/ENDIF. Or try avoiding almost all the jumps entirely, since conditional jumps are one of the main causes of code slowup. As long as you have EAX/EDX free, CDQ can be brilliant for this. Consider something like:

; hold your current EAX value, 1280-cxClient, in other reg except EDX

; .IF scrollpoint == 0
; mov direction, 0
; .ENDIF
xor eax, eax
sub eax, scrollpoint
cdq             ; if scrollpoint was 0, EDX is 0, else EDX = -1
and edx, direction    ; EDX holds new direction value

; now second test pair
xor eax, eax    ; EAX = 0
mov direction, edx    ; save value
sub eax, edx
cdq             ; if direction was 0, EDX is 0, else EDX = -1
lea edx, [edx+edx+1]
add scrollpoint, edx


Should be faster on average than conditional jumping, especially if you are usefully holding scrollpoint and direction in EDI/ESI or other registers.  :U

Ian_B

Ian_B

#4
Another quick thought. If you have the ScrollMain proc in its own thread, you only need to synchronise a change of window size. That's easily done with a global status flag, test just before you do the final BitBlt and if it's set simply redo that loop before you draw on the wrong-sized window (first resetting the flag). I guess that unless you've changed the window size, you can avoid repeatedly getting and releasing DC calls, just get once unless it changes (or even just once in the app setup, I don't do much with bitmaps so I'm not sure if the size of the window is relevant to the DC). Removing any redundant API calls like that is going to be an even better way to speed this up.  :bg

Ian_B

redskull

this is what I was afraid of  :'(
I really appreciate all the suggestions, and will probably end up incorporating all of them, but from a realistic standpoint it's still just too slow.  I mean, this code literally has nothing to slow it down or keep it from zipping across the screen at light speed, and it still crawls, so even a gain of 10-15% speedwise will be offset ten fold by anything else (crosshairs, politicians, etc) that gets added later.  If it is the BitBlt that's taking up all the time (and it must be, since the speed seems mostly related to the size of the screen), then adding a couple more bitmaps, to the mix would make it pretty much stop.  Thanks again for the replies.

alan
Strange women, lying in ponds, distributing swords, is no basis for a system of government

Ratch

redskull,

Quote...Any redesign comments are appreciated.  ...

     I don't think you are going to get any speed from multithreading.  The program is already devoting most of its time to to moving the pixels.  I think using BitBlt is causing the most delay.  The less you call that API, the faster everything will be.  It is already too fast for my eyes on my computer, but I have a faster machine.  I have made some changes coding the program, but it does not speed it up noticeably.  Consider the changes as alternative way of doing things like you requested.  The following are notes of explanation. Ratch

1) First of all, don't put instructions in the leftmost column.  The eye expects to see labels there.
2) Indent your high level constructions (HLC's), and use white space between HLC's and logical sections of code that have a common purpose.
3) Since both "subroutines" in your code are called only once, I put them inline to get rid of the CALL-RET sequence.
4) Notice how I use PUSH instead of MOV to put parameters on the stack for RegisterClassEx.
5) Notice how PeekMessage is tested for an error (negative EAX).  I see a lot of dispatch message handlers that don't do this test.
6) Notice how I put the MSG structure on the top of the stack, so that its address is simply ESP.  You can see this at TranslateMessage and DispatchMessage.
7) I use debug code which can be turned on/off the the DBUG switch.
8) Since WM_CREATE and WM_PAINT messages are sent to default processing, why do you check for these messages in your WINPROC routine?
9) Testing 'scrollpoint' and 'direction' for zero can be shortened one byte by comparing them to a register of zero value, in this case I used EBP=0. PUSH 0 takes twice as many bytes as  PUSH EBP.
10) Try to put parameters into registers instead of memory.  It's faster that way.  Notice how hwnd,hdc,hBM have been eliminated.
11) Notice how my WINPROC puts wParam, lParam into EBX AND EDX respectively.  It also clears EBP to the handy constant zero.
12) Notice how parameters can sometimes be PUSH'ed onto the stack for a CALL later in the program.  This eliminates the need for storing the parameter until later.  See DeleteObject.
13) Use brackets for your memory references.  TASM has a IDEAL mode that enforces this practice.



[attachment deleted by admin]

Eugen

Hi redskull

My past experience with GDI showed me that it can be very very fast ( if it can use driver calls that is  :wink ). Dont be so sure to call it slow until you see your FPS  :green.I guess it depends much on the 2D capabilities of the graphics card and not very much on the CPU (if the card can do 'fill'  and 'draw line' operations under GDI command). If i were you, i would time everything just to be sure were the time is lost, who knows, maybe its lost in the "fill" of the background..? A FPS figure would be meaningfull, especially when the windows size is changing, although its pretty visible how its slowing down when the window is getting bigger  :green

Personally i dont think any optimizations like moving parameters from variables to registers realy matter here, the drawing/screen access/OS calls is what realy eats time...

A small bug noticed: if i maximize the window, to image scrolls to left forever... i suspect it has something to do with window coords when maximized.

Eugen

Mirno

I'd be very surprised if the 2D capability of the card was slowing the GDI too much, both nVidia and ATi have had very good GDI (given how poor the API is) drivers.

Given that the terrain is small, have you tried drawing the whole thing to a back buffer, and then doing two blits to bring it to the screen?
Or on the same principal, use a blit on the existing screen data (from the back buffer) and only redraw the "new" strip of screen.

Mirno

Ratch

redskull,
     Looking back at your code, I don't see where you save and restore the essential registers (EBX,ESI,EDI) in your WinProc.  You change EBX in WM_SIZE without restoring it, but it seems you to got by with doing that this time.  Also your call to default processing is 'invoke DefWindowProc,hWnd,uMsg,wParam,lParam' .  Those parameters are already on the stack, so why splat them on the stack again?  My WinProc simply does a 'JA DefWindowProc'.  Since default processing is called many,many, many times in a program, those extra useless PUSH's can  be significant.  Ratch

daydreamer

you shouldnt use dec/inc so every frame moves a pixel  on a big screen, you should add a signed speed which is inturn added signed acceleration each timeunit
and max/min for acc/speed

redskull

Since the BitBlt is taking up almost all the time, what if I used ChangeDisplaySettings to reduce the resolution?  Right now i'm running it in 24-bit mode, so theoretically if I changed it to 8-bit mode, there would be three times less data to copy on each bit, and hence 3 times faster?  Of course, that means i would have to run it in a full screen, non-sizeable, non-moveable window, to avoid messing up everything else, right?  I was hoping to avoid that, also.

Thanks again to everybody for the responses, you guys are great.

alan
Strange women, lying in ponds, distributing swords, is no basis for a system of government