Search code examples
Tags: c++, assembly, visual-c++-2005, memory-alignment, sse2

SSE2 - 16-byte aligned dynamic allocation of memory


EDIT:

This is a followup to SSE2 Compiler Error

This is the real bug I experienced before and have reproduced below by changing the _mm_malloc statement as Michael Burr suggested:

Unhandled exception at 0x00415116 in SO.exe: 0xC0000005: Access violation reading location 0xffffffff.

At line label: movdqa xmm0, xmmword ptr [t1+eax]

I'm trying to dynamically allocate t1 and t2 and according to this tutorial, I've used _mm_malloc:

#include <emmintrin.h>
// Reproduction case from the question: allocates two 16-byte-aligned int
// buffers, fills them, then runs an MSVC inline-assembly SSE2 loop that
// multiplies their elements with pmuludq. As posted, this faults at the first
// movdqa inside the loop (see the NOTE(review) comments in the _asm block).
int main(int argc, char* argv[])
{ 
 int *t1, *t2;
 const int n = 100000;
 // Request n ints per buffer with 16-byte alignment, as movdqa requires.
 // NOTE(review): neither result is checked for NULL before it is written to.
 t1 = (int*)_mm_malloc(n*sizeof(int),16);
 t2 = (int*)_mm_malloc(n*sizeof(int),16);
 // Destinations for the two 128-bit products; both are overwritten on every
 // loop iteration, so only the last iteration's values remain afterwards.
 __m128i mul1, mul2;
  for (int j = 0; j < n; j++)
  {
  t1[j] = j;
  t2[j] = (j+1);
  } // fill with deterministic test values: t1[j] = j, t2[j] = j + 1
  _asm
  {
   // eax is a running BYTE offset, advanced by 16 (one xmmword) per iteration.
   mov eax, 0
   // NOTE(review): per the accepted answer, [t1+eax] addresses the pointer
   // VARIABLE t1 (its stack slot) plus eax, not the buffer t1 points to, so
   // this reads the stack rather than the allocated memory. The same applies
   // to every [t1+eax] / [t2+eax] operand below.
   label: movdqa xmm0, xmmword ptr [t1+eax]
   movdqa xmm1, xmmword ptr [t2+eax]
   // pmuludq multiplies only dwords 0 and 2 of each operand (two 64-bit products).
   pmuludq xmm0, xmm1
   movdqa mul1, xmm0
   // Reload the t1 operand, because xmm0 was overwritten by the first product.
   movdqa xmm0, xmmword ptr [t1+eax]
   // Immediate 05fh moves source dwords 3 and 1 into slots 0/1 and 2/3, so the
   // next pmuludq multiplies the odd-indexed elements.
   pshufd xmm0, xmm0, 05fh
   pshufd xmm1, xmm1, 05fh
   pmuludq xmm0, xmm1
   movdqa mul2, xmm0
   add eax, 16
   // NOTE(review): eax counts bytes, so stopping at 100000 covers 100000 bytes
   // (25000 ints if sizeof(int) == 4), not all n ints -- confirm the intent.
   cmp eax, 100000
   // jnge = jump if not greater-or-equal (signed less-than): loop while eax < 100000.
   jnge label
  }
     _mm_free(t1);
     _mm_free(t2);

 return 0;
}

Solution

  • I think the 2nd problem is that you're reading at an offset from the pointer variable (not an offset from what the pointer points to).

    Change:

    label: movdqa xmm0, xmmword ptr [t1+eax]
    

    To something like:

    mov ebx, [t1]
    label: movdqa xmm0, xmmword ptr [ebx+eax]
    

    And similarly for your accesses through the t2 pointer.

    This might be even better (though I haven't had an opportunity to test it, so it might not even work):

      // SSE2 loop over two 16-byte-aligned int buffers, four ints per iteration.
      // eax walks the t1 buffer and ebx walks the t2 buffer; ecx marks one past
      // the end of t1. mul1 receives the products of the even-indexed dwords,
      // mul2 the products of the odd-indexed dwords (both are overwritten on
      // every iteration). The constant 100000 must match the element count used
      // for the allocations.
      _asm
      {
       // Load the pointer VALUES (buffer addresses), not the variables' addresses.
       mov eax, [t1]
       // Fixed: this previously loaded t1 again, so t2 was never read.
       mov ebx, [t2]
       // End address of t1: 100000 elements of 4 bytes each.
       lea ecx, [eax + (100000*4)]
    
       label: movdqa xmm0, xmmword ptr [eax]
       movdqa xmm1, xmmword ptr [ebx]
       // pmuludq multiplies dwords 0 and 2 of each operand into two 64-bit products.
       pmuludq xmm0, xmm1
       movdqa mul1, xmm0
       // Reload the t1 operand, because xmm0 now holds the first product.
       movdqa xmm0, xmmword ptr [eax]
       // 05fh moves source dwords 3 and 1 into the slots pmuludq reads.
       pshufd xmm0, xmm0, 05fh
       pshufd xmm1, xmm1, 05fh
       pmuludq xmm0, xmm1
       movdqa mul2, xmm0
       add eax, 16
       add ebx, 16
       cmp eax, ecx
       // Unsigned "below": eax and ecx are addresses, so a signed compare
       // (jnge) would misbehave if the buffer lay above 0x7fffffff.
       jb label
      }