Hi, I need the fastest way to copy the low bytes of large WORD arrays into BYTE arrays for an exported function in a DLL.

This is what I have so far:

Code:
void  __stdcall CopyIntArray(unsigned char * ucDest, unsigned short * usSource, unsigned int * uiLen)
{

	__asm
	{

		mov esi, usSource
		mov edi, ucDest
		mov eax, uiLen
		mov eax, [eax]
		mov ebx, eax
		shr eax, 2
		jz SHORT lendDW
lloopDW:	
		mov ecx, [esi+4]
		mov dl, cl
		shr ecx, 8
		mov dh, ch
		shl edx, 16
		mov ecx, [esi]
		mov dl, cl
		shr ecx, 8
		mov dh, ch
		mov [edi], edx
		add esi, 8
		add edi, 4
		dec eax
		jnz SHORT lloopDW
lendDW:
		and ebx, 3
		jz SHORT lend
lloopB:
		mov cx, [esi]
		mov [edi+eax], cl
		inc eax
		add esi, 2
		cmp ebx, eax
		jnz Short lloopB
lend:

	}

}
The code works fine, but is there a faster way of doing this?