This basic mmx memory copy code corrupts memory in release mode, but only with certain compilers. Visual Studio 2010 in specific. I think it's because this code needs a memory fence, but I'm not sure where it would go or exactly why. This code worked perfectly with Visual Studio 2005.
MMXMemCopy PROC dest:PTR BYTE, src:PTR BYTE, len:DWORD
mov edi, dest
mov esi, src
mov ecx, len
shr ecx, 6 //divide len by 64
mov edx, ecx
shl edx, 6
mov eax, len
sub eax, edx
or ecx, 0
jz lastbytes //if < 64 bytes, copy 8 bytes at a time
copynext: //copies 64 bytes
movq mm0, mmword ptr [esi]
movq mm1, mmword ptr [esi + 8h]
movq mm2, mmword ptr [esi + 10h]
movq mm3, mmword ptr [esi + 18h]
movq mm4, mmword ptr [esi + 20h]
movq mm5, mmword ptr [esi + 28h]
movq mm6, mmword ptr [esi + 30h]
movq mm7, mmword ptr [esi + 38h]
movntq mmword ptr [edi], mm0
movntq mmword ptr [edi + 8h], mm1
movntq mmword ptr [edi + 10h], mm2
movntq mmword ptr [edi + 18h], mm3
movntq mmword ptr [edi + 20h], mm4
movntq mmword ptr [edi + 28h], mm5
movntq mmword ptr [edi + 30h], mm6
movntq mmword ptr [edi + 38h], mm7
add esi, 40h
add edi, 40h
dec ecx
jnz copynext
lastbytes: //copies last 8 bytes
or eax, 0
jz finished
movq mm0, mmword ptr [esi]
movntq mmword ptr [edi], mm0
add esi, 8h
add edi, 8h
sub eax, 8h
jmp lastbytes
finished:
emms
ret
MMXMemCopy ENDP
First, esi
and edi
are callee-saved. I think you also need sfence
for memory ordering.