I'm developing a multicore, multithreaded software library in which I want to offer update-order preserving lock-free shared memory objects that might span multiple cache lines.
Specifically, suppose I have a vector X of cache-line-sized objects: X[0], …, X[K], each occupying exactly one cache line. I write to them in index order: X[0] first, then X[1], and so on.
From the writing thread itself I will obviously observe memory semantics that respect the update order. But if a second thread reads X[K], the question is: will it see states for X[0]…X[K-1] that are at least as current as the state it sees for X[K]?
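To make the property concrete, here is a minimal sketch of the guarantee I'm after, using plain std::atomic<int> elements purely for illustration (my real elements are whole cache lines, not single ints):

#include <atomic>

std::atomic<int> X[4]; // stand-ins for the cache-line-sized elements

void writer() {
    // Stores are issued in index order; each release store makes all earlier
    // stores visible to a reader that synchronizes on a later element.
    for (int i = 0; i < 4; ++i)
        X[i].store(i + 1, std::memory_order_release);
}

void reader() {
    // If this acquire load observes the writer's store to X[3], then the
    // stores to X[0]..X[2] are guaranteed to be visible as well.
    if (X[3].load(std::memory_order_acquire) == 4) {
        int x0 = X[0].load(std::memory_order_relaxed); // sees 1 (or something newer)
        (void)x0;
    }
}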
With locking, we do get this guarantee. But if memcpy is used to copy data into the vector, we lose it: the POSIX/C semantics of memcpy guarantee neither index-order updates nor memory-order updates nor any other ordering. You are only guaranteed that after memcpy returns, the entire update has been performed.
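For comparison, the lock-based version that does give the guarantee looks roughly like this (the mutex and buffer layout are just placeholders):

#include <cstring>
#include <mutex>
#include <stddef.h>

struct Shared {
    std::mutex m;            // placeholder lock guarding the vector
    unsigned char vec[4096]; // placeholder storage for X[0]..X[K]
};

void locked_update(Shared &s, const void *src, size_t n) {
    std::lock_guard<std::mutex> g(s.m);
    std::memcpy(s.vec, src, n); // readers take the same lock, so they never
                                // observe a partially completed copy
}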
My question: is there already an order-preserving memcpy with similar speed but with the desired guarantee? And if not, can such a primitive be implemented without locking?
Assume my target platforms are x86 and ARM.
I found the answer by Peter Cordes to this question insightful, detailed, and very helpful. However, I didn't see his suggestions put into code, so for posterity and for future people who need a quick solution to this problem of requiring ordered writes for DMA or lockless algorithms, I'm including the code I wrote based on that answer. I built it with gcc 4.9 on x64 and armv7-a, though I only ran and tested it on x64.
#include <atomic>
#include <stdint.h>   // uint8_t, uint64_t, uintptr_t
#include <stdlib.h>
#include <algorithm>  // std::min

extern "C" {
static void * linear_memcpy_portable(void *__restrict dest, const void *__restrict src, size_t n)
{
    void * const ret = dest; // memcpy returns the original dest pointer
    // Copy single bytes until dest is 8-byte aligned (or n is exhausted)
    if ((uintptr_t)dest & (sizeof(uint64_t) - 1)) {
        uint8_t *__restrict dst8 = reinterpret_cast<uint8_t *__restrict>(dest);
        const uint8_t *__restrict src8 = reinterpret_cast<const uint8_t *__restrict>(src);
        // Number of bytes until dest reaches the next 8-byte boundary, capped at n
        const size_t align_n = std::min(n, (size_t)(-(uintptr_t)dest & (sizeof(uint64_t) - 1)));
        const uint8_t * const endsrc8 = static_cast<const uint8_t *>(src) + align_n;
        while (src8 < endsrc8) {
            *dst8 = *src8;
            atomic_thread_fence(std::memory_order_release);
            dst8++; src8++;
        }
        dest = dst8;
        src = src8;
        n = n - align_n;
    }
    // Main loop: 64-bit chunks. The source may still be misaligned, so load
    // through a may_alias, aligned(1) type; a release fence follows each store.
    typedef uint64_t __attribute__((may_alias, aligned(1))) aliasing_unaligned_uint64_t;
    uint64_t *__restrict dst64 = static_cast<uint64_t *__restrict>(dest);
    const aliasing_unaligned_uint64_t *__restrict src64 = static_cast<const aliasing_unaligned_uint64_t *__restrict>(src);
    const uint64_t * const endsrc64 = src64 + n / sizeof(uint64_t);
    const uint8_t * const endsrc8 = static_cast<const uint8_t *>(src) + n;
    while (src64 < endsrc64) {
        *dst64 = *src64;
        atomic_thread_fence(std::memory_order_release);
        dst64++; src64++;
    }
    // Tail: copy any remaining bytes one at a time
    if (reinterpret_cast<const uint8_t *>(endsrc64) != endsrc8) {
        uint8_t *__restrict dst8 = reinterpret_cast<uint8_t *__restrict>(dst64);
        const uint8_t *__restrict src8 = reinterpret_cast<const uint8_t *__restrict>(src64);
        while (src8 < endsrc8) {
            *dst8 = *src8;
            atomic_thread_fence(std::memory_order_release);
            dst8++; src8++;
        }
    }
    return ret;
}
#if (_M_AMD64 || __x86_64__)
#include <immintrin.h>
static void * linear_memcpy_avx2(void *dest, const void *src, size_t n) __attribute__((target("avx2")));
static void * linear_memcpy_avx2(void *dest, const void *src, size_t n)
{
    // 32-byte chunks in ascending address order, release fence after each store
    __m256i *__restrict dst256 = static_cast<__m256i *__restrict>(dest);
    const __m256i *__restrict src256 = static_cast<const __m256i *__restrict>(src);
    const __m256i * const endsrc256 = src256 + n / sizeof(__m256i);
    const uint8_t * const endsrc8 = static_cast<const uint8_t *>(src) + n;
    while (src256 < endsrc256) {
        _mm256_storeu_si256(dst256, _mm256_loadu_si256(src256));
        atomic_thread_fence(std::memory_order_release);
        dst256++; src256++;
    }
    // Hand the tail to the portable version
    if (reinterpret_cast<const uint8_t *>(endsrc256) != endsrc8)
        linear_memcpy_portable(dst256, src256, endsrc8 - reinterpret_cast<const uint8_t *>(endsrc256));
    return dest;
}
static void * linear_memcpy_sse2(void *dest, const void *src, size_t n) __attribute__((target("sse2")));
static void * linear_memcpy_sse2(void *dest, const void *src, size_t n)
{
    // 16-byte chunks in ascending address order, release fence after each store
    __m128i *__restrict dst128 = static_cast<__m128i *__restrict>(dest);
    const __m128i *__restrict src128 = static_cast<const __m128i *__restrict>(src);
    const __m128i * const endsrc128 = src128 + n / sizeof(__m128i);
    const uint8_t * const endsrc8 = static_cast<const uint8_t *>(src) + n;
    while (src128 < endsrc128) {
        _mm_storeu_si128(dst128, _mm_loadu_si128(src128));
        atomic_thread_fence(std::memory_order_release);
        dst128++; src128++;
    }
    // Hand the tail to the portable version
    if (reinterpret_cast<const uint8_t *>(endsrc128) != endsrc8)
        linear_memcpy_portable(dst128, src128, endsrc8 - reinterpret_cast<const uint8_t *>(endsrc128));
    return dest;
}
// ifunc resolver: picks an implementation once, at load time
static void *(*resolve_linear_memcpy(void))(void *, const void *, size_t)
{
    __builtin_cpu_init();
    // All x86-64 targets support at least SSE2
    return __builtin_cpu_supports("avx2") ? linear_memcpy_avx2 : linear_memcpy_sse2;
}

#ifdef __AVX2__
// If AVX2 is enabled at compile time, alias to the AVX2 implementation so it can be inlined
void * linear_memcpy(void *, const void *, size_t) __attribute__((alias("linear_memcpy_avx2")));
#else
void * linear_memcpy(void *, const void *, size_t) __attribute__((ifunc("resolve_linear_memcpy")));
#endif
#else
void * linear_memcpy(void *, const void *, size_t) __attribute__((alias("linear_memcpy_portable")));
#endif
} // extern "C"
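For completeness, here is a rough usage sketch; the Line layout and the reader side (in particular the acquire fence) are my assumptions about how the data would be consumed, not part of the primitive itself:

#include <atomic>
#include <cstring>
#include <stddef.h>

extern "C" void * linear_memcpy(void *dest, const void *src, size_t n); // defined above

// Hypothetical layout mirroring the question: K+1 cache-line-sized slots.
struct alignas(64) Line { unsigned char bytes[64]; };
static Line X[8]; // assumed to be shared with the reading thread

void writer(const Line (&update)[8]) {
    // Stores land in ascending address order, X[0] first and X[7] last,
    // with a release fence after each chunk.
    linear_memcpy(X, update, sizeof X);
}

void reader(Line &newest, Line &oldest) {
    // Read X[7] first, then X[0]; the acquire fence pairs (in practice) with
    // the release fences inside linear_memcpy, so X[0] is at least as current
    // as the X[7] we observed. (Formally these plain reads still race with the
    // writer; this only sketches the intended ordering.)
    std::memcpy(&newest, &X[7], sizeof newest);
    std::atomic_thread_fence(std::memory_order_acquire);
    std::memcpy(&oldest, &X[0], sizeof oldest);
}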
I welcome any feedback on the implementation. :)