Hello, I have been playing around with asm and MMX to enhance graphics rendering speed. Here is what I have come up with, in the form of a routine that produces a vertical gradient. If I'm honest, it's more luck than judgement, as I'm not sure why it works the way it does. I'm hoping that someone can explain it to me in a bit of detail. Also, if anyone thinks they can improve upon it, then that'd be great too.

I'd very much appreciate any comments.

BytePTR is a pointer returned from GetDIBits.
The Bitmap in question is in 32bpp format.

Code:
	BYTE * StartPTR = ppv;

int total = width * height;
int StartColor = 90; // Fixed RGB value to start.
int colorStep = (width >> 1) * (height / 80); //Calculation to determine how many scan lines should be that same color before stepping.






////******* Must Be An Even Width And Height!!!!!! ********//////////

	__asm
	{
		pushad;

		mov ecx, total;  // move the total size into ecx
		mov edi, DWORD PTR [StartPTR]; // move the pointer to the first byte into edi.
		mov esi, [StartColor]; // Move the start color into esi;
		xor eax, eax; //Clear out eax;
		mov ebx, [colorStep]; // Move the step amount into ebx

		shr ecx, 1 // divide the total by 2 as we are processing two pixels at once.. I think!


do_gradientMMX:

		//// Here is determined whether or not the start color should be decreased to form the gradient.
		
		cmp eax, ebx;// Compare eax with step amount is not equal jump to gradient.
		jne startGradient; 

		xor eax, eax; // should reduce the color value so clear out eax again
		dec esi; // decrease the start color so it darkens toward black.



startGradient:

		// Gradient is done here.

		pxor mm0, mm0; //Clear Out mm0;

		movq mm0, [edi]; // Move two pixels into mm0;

		movd mm7, esi; // Move Color Increment into mm7

		punpcklbw mm0, mm1; // Unpack the pixles;

		paddusb mm0, mm7; // add the first color increment;

		pshufw mm0, mm0, 1; // Shuffle the bytes, god knows why this works

		paddusb mm0, mm7; // Add again

		packuswb mm0, mm0; // Pack it all up, still don't know why this works

		movq DWORD PTR [edi], mm0; // Write it back.

		add edi, 8; // Next two pixels

		add eax, 1; // //Increment color counter
		
		dec ecx; // decrement pixel counter

		jnz do_gradientMMX // do loop if needed

		emms // exit MMX

		popad;


	}