News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

MMX trouble 16bit<->8bit precision

Started by daydreamer, February 23, 2006, 07:33:40 PM

Previous topic - Next topic

daydreamer

I haven't used MMX much before.
I want to use 16-bit-per-channel internal precision: take a 32-bit ARGB input pixel, blend a light effect into it, and output the result as 32-bit ARGB again.
But unpacking a register with itself makes duplicate copies of the values instead of zero-extending them to 16 bits. The idea is to have a 16-bit-per-channel counter, so that after, for example, the blue light reaches 255, it continues and a white specular highlight kicks in.
The code below is meant to be a yellow effect, but it just creates yellow garbage on top of the window.

; Locks the primary DirectDraw surface, then walks it one 32-bit pixel at a
; time, adding a ramping light value to each pixel with saturating MMX adds.
; NOTE(review): edi is never reset between passes and no surface pitch is
; applied, so the 100 x 512 pixels are addressed as one contiguous run --
; confirm this matches the real surface layout.
DDSINVOKE mLock, lpDDSPrimary, NULL, ADDR ddsd, DDLOCK_WAIT, NULL
                              ;DDINVOKE WaitForVerticalBlank, lpDD, DDWAITVB_BLOCKBEGIN, NULL
                              ; ebx = base address of the locked surface, edi = byte offset into it
                              mov ebx,ddsd.lpSurface
                              mov edi,0
                              ; edx = outer loop counter: 100 passes of 512 pixels each
                              mov edx,100
                              @@l5:

                              ; MM2 = light counter, cleared at the start of every pass
                              xor eax,eax
                              movd MM2,eax
                              punpcklwd MM2,MM2
                              ; MM3 = per-pixel increment. Unpacking a register with itself
                              ; duplicates its low words, so MM3 becomes 0000000000010001h
                              ; (words 0 and 1 hold 1, words 2 and 3 hold 0).
                              mov eax,01h
                              movd MM3,eax
                              punpcklwd MM3,MM3
                              ; ecx = inner loop counter: 512 pixels per pass
                              mov ecx,512
                              @@l4:
                              ; counter += increment, each word saturating at FFFFh
                              paddusw MM2,MM3
                              ; packuswb packs the DESTINATION words (old MM0) into the low dword
                              ; and the SOURCE words (MM2) into the high dword, so the packed
                              ; counter lands in bits 63..32 of MM0. MM0 is never initialized
                              ; in this snippet.
                              packuswb MM0,MM2
                         
                              ; movd moves only 32 bits, so only the low dword of MM0 reaches
                              ; the pixel that is stored back.
                              movd MM7,[ebx+edi]
                              paddusb MM7,MM0
                              movd [ebx+edi],MM7
                             
                             
                              ; advance to the next 4-byte pixel
                              add edi,4
                              dec ecx
                              jne @@l4
                              dec edx
                              jne @@l5
                              ; clear the MMX state so x87 floating-point code can run afterwards
                              EMMS

This is roughly what I want to port, and then enhance, once I understand MMX a little better.
It seems really bloated and must compile to slow, messy code, whereas MMX is made for exactly this kind of image processing.

/**
 * Applies a distance-based light factor to one ARGB pixel and stores the result.
 *
 * Reads the pixel at en[adress2], scales its red, green and blue channels by a
 * factor derived from the distance of (x-20, y-400) from the origin, and writes
 * the recombined pixel to imgx2[adress].
 *
 * @param x       horizontal coordinate; 20 is subtracted before use
 * @param y       vertical coordinate; 400 is subtracted before use
 * @param adress  index into imgx2 that receives the result
 * @param adress2 index into en that supplies the source pixel
 */
public void light(int x,int y,int adress,int adress2){
        // NOTE(review): l is never used, and alpha is extracted but not read again below.
        int temp,red,green,blue,alpha,l,pixel;
        double li;
        // Split the source pixel into its four 8-bit channels.
        pixel=en[adress2];
        alpha = (pixel >> 24) & 0xff;
        red   = (pixel >> 16) & 0xff;
      green = (pixel >>  8) & 0xff;
        blue  = (pixel      ) & 0xff;
       
        // Re-center the coordinates on (20, 400).
        x=x-20;y=y-400;
        li=390-Math.sqrt(x*x+y*y);//the hypotenuse, distance from the center of the sphere
// NOTE(review): subtracting 390 again cancels the 390 above, leaving
// li = -distance -- confirm this is intended.
li=li-390;
// Self-assignment; has no effect.
li=li;
/*blue=channel(li,blue);
green=channel(li,green);
red=channel(li,red);
*/

// Scale each channel by li/390 (0.5 is added before the truncating cast), then clamp.
// NOTE(review): the lower check only fires below -256, so results from -256 to -1
// stay negative and are summed into temp as-is -- confirm whether "< 0" was meant.
blue=(int)(0.5+li*blue/390);
if (blue>255)
blue=255;
if (blue<-256)
blue=0;
green=(int)(0.5+li*green/390);
if (green>255)
green=255;
if (green<-256)
green=0;
red=(int)(0.5+li*red/390);
if (red>255)
red=255;
if (red<-256)
red=0;

// Recombine the channels into one pixel, starting from 0xff000000 for the alpha byte.
temp=0xff000000+red*65536+green*256+blue;
imgx2[adress]=temp;
}//light

EduardoS

If it helps...
To duplicate (0, 56h, 78h, 9ah -> 0, 5656h, 7878h, 9a9ah)

; Load 32 bits from var into the low dword of MM0 (the high dword is zeroed).
MOVD MM0, var
; Interleave the low four bytes of MM0 with themselves: each byte b becomes the word bbh.
PUNPCKLBW MM0, MM0


To zero extend

; Load 32 bits from var into the low dword of MM0.
MOVD MM0, var
; MM1 = 0; it supplies the high byte of every resulting word.
PXOR MM1, MM1
; Interleave: low byte from MM0, high byte from MM1 (zero) -> four zero-extended words.
PUNPCKLBW MM0, MM1


To sign extend

; Load 32 bits from var into the low dword of MM0.
MOVD MM0, var
; MM1 = 0, the left-hand side of the signed compare below.
PXOR MM1, MM1
; Each byte of MM1 becomes FFh where 0 > the matching MM0 byte (signed), else 00h,
; i.e. MM1 holds the sign mask of every byte.
PCMPGTB MM1, MM0
; Interleave data bytes with their sign masks -> four sign-extended words.
PUNPCKLBW MM0, MM1

daydreamer

is this really right?
@@l4:
                              ; counter += increment, each word saturating at FFFFh
                              paddusw MM2,MM3
                              packuswb MM0,MM2 ;the debugger shows the bug is here; it works differently than I thought
                          ;it does pack the words to bytes after the internal paddusw, but the debugger shows
                           ;that the converted value suddenly ends up in the high half of MM0
                            ;sometimes some FF garbage sneaks into the low half, which draws lines with nothing rendered in between
                              ; NOTE(review): packuswb packs its source operand (MM2) into the high dword and
                              ; its destination operand (old MM0) into the low dword, which is consistent
                              ; with the observation above -- verify against the instruction reference.
                              movd MM7,[ebx+edi]
                              paddusb MM7,MM0
                              movd [ebx+edi],MM7
                             
                             
                              ; advance to the next 4-byte pixel
                              add edi,4
                              dec ecx
                              jne @@l4

daydreamer

I experimented, and fincstp and fdecstp also seem to work with the MMX registers on my CPU. Do they behave that way on most CPUs, or should I avoid using them?


EduardoS

Your code is a little confusing... If you explain more clearly what you want, things will become easier...
Anyway,
I think this part:

packuswb MM0,MM2


is supposed to do this:

if (red>255)
red=255;
if (red<-256)
red=0;


Well... it doesn't.
packuswb works this way:

if (red>255)
red=255;
if (red<0)
red=0;


Also, I see a "blue=(int)(0.5+li*blue/390);" in your code; if you need floats, why not use SSE?

daydreamer

Quote from: EduardoS on February 26, 2006, 12:11:48 AM
Your code is a little confusing... If you explain more clearly what you want, things will become easier...
Anyway,
I think this part:

packuswb MM0,MM2


is supposed to do this:

if (red>255)
red=255;
if (red<-256)
red=0;


Well... it doesn't.
packuswb works this way:

if (red>255)
red=255;
if (red<0)
red=0;


Also, I see a "blue=(int)(0.5+li*blue/390);" in your code; if you need floats, why not use SSE?
The original code calls this proc while rendering each textured scanline, to add light/darkness.
The light is calculated as 390-sqrt(X*X+Y*Y), so it decreases with distance and even goes negative; I had to add checks so that the negative numbers do not create artifacts but instead give the back side of a planet.

But that is a very slow approach. Instead I want to create the light with loops and adds, using fixed-point counters for the color increase/decrease per scanline. When it works correctly, that is fast and can be implemented in the same loop that fetches two textures, blends them, and renders each scanline.
In the process I also want to learn some MMX image processing.


EduardoS

Quote from: !Czealot on March 01, 2006, 06:01:28 PM
The original code calls this proc while rendering each textured scanline, to add light/darkness.
The light is calculated as 390-sqrt(X*X+Y*Y), so it decreases with distance and even goes negative; I had to add checks so that the negative numbers do not create artifacts but instead give the back side of a planet.

But that is a very slow approach. Instead I want to create the light with loops and adds, using fixed-point counters for the color increase/decrease per scanline. When it works correctly, that is fast and can be implemented in the same loop that fetches two textures, blends them, and renders each scanline.
In the process I also want to learn some MMX image processing.
Now that helps me a lot ;)
Looking at your MMX code, I think I know which part is generating the yellow garbage (it is marked with the word HERE; the other comments are just there to make the code easier for me to follow):

DDSINVOKE mLock, lpDDSPrimary, NULL, ADDR ddsd, DDLOCK_WAIT, NULL
                              ;DDINVOKE WaitForVerticalBlank, lpDD, DDWAITVB_BLOCKBEGIN, NULL
                              mov ebx,ddsd.lpSurface
                              mov edi,0
                              mov edx,100
                              @@l5:

                              xor eax,eax              ;
                              movd MM2,eax         ;
                              punpcklwd MM2,MM2; This part just resets MM2, the same as PXOR MM2, MM2
                              mov eax,01h            ;
                              movd MM3,eax         ;
                              punpcklwd MM3,MM3;Here you move (0; 1; 0; 1) into MM3, HERE is the problem, I think it should hold (0; 1; 1; 1)
                              ; NOTE(review): punpcklwd interleaves the two LOW words of its operands, so
                              ; starting from MM3 = 1 the result is the words (0; 0; 1; 1), high to low --
                              ; verify in a debugger.
                              mov ecx,512           ;
                              @@l4:                    ; repeat this loop 512 times
                              paddusw MM2,MM3  ; the first time MM2 = (0; 1; 0; 1), the second time (0; 2; 0; 2), etc.
                              packuswb MM0,MM2 ; the low dword of MM0 is the same as above, but with saturation,
                                                           ; so after the 255th iteration the values will always be 255. HERE is another problem: if it is always 255, you always get
                                                           ; white garbage (yellow because of the other problem).
                              ; NOTE(review): packuswb packs its source operand (MM2) into the HIGH dword of
                              ; MM0; the low dword is packed from MM0's previous contents -- verify against
                              ; the instruction reference.
                         
                              movd MM7,[ebx+edi];
                              paddusb MM7,MM0   ; Add the low dword of MM0 to the pixel from [ebx+edi] with saturation, so after the 255th iteration it is always white (yellow)
                              movd [ebx+edi],MM7;
                             
                             
                              add edi,4
                              dec ecx
                              jne @@l4
                              dec edx
                              jne @@l5

changing the:

; Original increment setup: only the low word of eax is 1 before the unpack.
mov eax,01h           
movd MM3,eax         
; Interleaves the two low words of MM3 with themselves.
punpcklwd MM3,MM3

to:

; Start with 1 in both words of the low dword.
mov eax,10001h           
movd MM3,eax         
; Duplicating the two low words gives 1 in all four words.
punpcklwd MM3,MM3
; Shift the whole quadword right by one word: the top word becomes 0,
; leaving the words (0; 1; 1; 1), high to low.
psrlq MM3, 16


And the:

packuswb MM0,MM2

to

; Work on a copy so the counter in MM2 keeps its full precision.
movq MM0,MM2
; Scale the counter down by 4.
; NOTE(review): psrlq shifts the whole 64-bit value, so the low bits of each word
; move into the top of the word below it; a per-word shift would be psrlw -- confirm
; which one is wanted.
psrlq MM0, 2
; Packing MM0 with itself puts the same four saturated bytes in both dwords,
; so the low dword now holds the packed counter.
packuswb MM0,MM0


I think you will get a more interesting effect, though it is still not what you want.

daydreamer

I decided to get it running in 8-bit precision first, with an extra register for the fractional part. That register is incremented with wraparound and compared before and after the addition: a smaller result after the addition means a wraparound has taken place, so it is time to increment the counter.
With that logic, the separate channels are going to be 8:8 fixed point.
But an MMX compare is going to produce a mask for the 010101h increment instead of performing conditional jumps.
So for greyscale and a few special cases, I can use 8:32 or even 8:64 fixed point, keeping the fractional part the same for all channels.