Hey, could someone kindly show me how to optimize this code of mine?

It takes 16 bytes from ptrGBGraphicsMem and writes them to 256 bytes at ptrDDSurfaceMem.
The code works fine, I'd just like to see it run a little faster, even though it still takes 0ms to execute
It loads a gameboy tile from ptrGBGraphicsMem and then converts it and puts it into a DirectDrawSurface (the surface is locked before this proc is called).

Thanks in advance!

Code:
DrawTile proc ptrDDSurfaceMem:DWORD, ptrGBGraphicMem:DWORD

LOCAL	YC:BYTE,	XC:BYTE,	WCOL:DWORD

PUSH	EDX					
PUSH	EDI
PUSH	ESI
PUSH	EBX

MOV	CH,	0
MOV	EDI,	ptrGBGraphicMem
MOV	ESI,	ptrDDSurfaceMem
SUB	ESI,	32

YLoop:
	ADD	ESI,	64
	MOV	AL,	BYTE PTR [EDI]
	INC	EDI
	MOV	AH,	BYTE PTR [EDI]
	INC	EDI
	MOV	BL,	0
	SHL	AL,	1
	SHL	AH,	1						
	XLoop:
		SUB	ESI,	4
		SHR	AL,	1
		SHR	AH,	1
		MOV	DL,	AL
		MOV	DH,	AH
		AND	DL,	1
		AND	DH,	1
		SHL	DH,	1
		OR	DL,	DH

		.IF	DL==0
			MOV	EDX,	0FFFFFF00h
			MOV	DWORD PTR [ESI],	EDX
		.ELSEIF	DL==1
			MOV	EDX,	0FFFF0000h
			MOV	DWORD PTR [ESI],	EDX
		.ELSEIF	DL==2
			MOV	EDX,	0FF000000h
			MOV	DWORD PTR [ESI],	EDX
		.ELSEIF	DL==3
			MOV	EDX,	000000000h
			MOV	DWORD PTR [ESI],	EDX
		.ENDIF

		INC	BL
		CMP	BL,	8
		JNE	XLoop

	INC	CH
	CMP	CH,	8
	JNE	YLoop

POP	EBX
POP	ESI
POP	EDI
POP	EDX

	RET
DrawTile endp
BTW, the code is in MASM32 format.