Tags: c++, x86, arm, memcpy, lock-free

Order-preserving memcpy in C++


I'm developing a multicore, multithreaded software library in which I want to offer update-order-preserving, lock-free shared-memory objects that may span multiple cache lines.

Specifically, suppose I have a vector X of cache-line-sized objects X[0], …, X[K], each occupying exactly one cache line. I write to them in index order: X[0] first, then X[1], and so on. If a second thread reads X[K], will it also see a state for X[0] that is "at least as current" as what it sees for X[K]?

From within the writing thread itself, I will obviously observe memory semantics that respect the update order. But if some second thread reads X[K], the question arises: will it also observe the corresponding updates to X[0]...X[K-1]?
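
For concreteness, here is roughly the access pattern I have in mind (a minimal sketch; the names, the element count, and the 64-byte line size below are only illustrative):

    #include <cstddef>
    #include <cstdint>
    
    constexpr std::size_t CACHE_LINE = 64;  // assumed cache-line size
    constexpr std::size_t K = 7;            // illustrative: elements X[0] ... X[K]
    
    struct alignas(CACHE_LINE) Line { std::uint8_t bytes[CACHE_LINE]; };
    
    Line X[K + 1];   // the shared vector
    
    // Writer thread: update every element, in index order.
    void writer(const Line *fresh)
    {
       for (std::size_t i = 0; i <= K; ++i)
          X[i] = fresh[i];   // plain stores: nothing here orders them for other threads
    }
    
    // Reader thread: look at X[K] first, then the earlier elements. The question
    // is whether the X[0] ... X[K-1] read here are guaranteed to be at least as
    // current as the X[K] that was just observed.
    void reader(Line *out)
    {
       out[K] = X[K];
       for (std::size_t i = 0; i < K; ++i)
          out[i] = X[i];
    }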

With locking, we do get this guarantee. But if memcpy is used to copy data into the vector, we lose this property: the semantics of memcpy (as specified by C and POSIX) guarantee neither index-order updates nor memory-order updates nor any other ordering at all. You are only guaranteed that by the time memcpy returns, the entire update has been performed.

My question: is there already an order-preserving memcpy of comparable speed that provides this guarantee? And if not, can such a primitive be implemented without locking?

Assume my target platforms are x86 and ARM.

(Editor's note: originally said Intel, so the OP might not care about AMD.)


Solution

  • I found the answer by Peter Cordes to this question insightful, detailed, and very helpful. However, I didn't see his suggestions put into code, so for posterity and for future people who need a quick solution to this problem of requiring ordered writes for DMA or lockless algorithms, I'm including the code I wrote based on that answer. I built it with gcc 4.9 targeting x64 and armv7-a, though I only ran and tested it on x64.

    #include <atomic>
    #include <stdint.h>  // uint8_t, uint64_t, uintptr_t
    #include <stdlib.h>  // size_t
    #include <algorithm> // min
    
    extern "C" {
    
    // Portable fallback: copy 8 bytes at a time (byte-by-byte at a misaligned
    // head and any tail), issuing a release fence after every store so earlier
    // stores become visible to other observers no later than later ones.
    static void * linear_memcpy_portable(void *__restrict dest, const void *__restrict src, size_t n)
    {
       // Align dest if not already 8-byte aligned (test the low bits: mask with size - 1)
       if ((uintptr_t)dest & (sizeof(uint64_t) - 1)) {
          uint8_t *__restrict dst8 = reinterpret_cast<uint8_t *__restrict>(dest);
          const uint8_t *__restrict src8 = reinterpret_cast<const uint8_t *__restrict>(src);
          // Number of bytes to copy before dest reaches 8-byte alignment
          const size_t align_n = std::min(n, (size_t)(sizeof(uint64_t) - ((uintptr_t)dest & (sizeof(uint64_t) - 1))));
          const uint8_t * const endsrc8 = static_cast<const uint8_t * const>(src) + align_n;
          while (src8 < endsrc8) {
             *dst8 = *src8;
             atomic_thread_fence(std::memory_order_release);
             dst8++; src8++;
          }
          dest = dst8;
          src = src8;
          n = n - align_n;
       }
       // may_alias avoids strict-aliasing UB; aligned(1) on the source type permits unaligned loads
       typedef uint64_t __attribute__((may_alias,aligned(1))) aliasing_unaligned_uint64_t;
       typedef uint64_t __attribute__((may_alias)) aliasing_uint64_t; // may_alias for the destination stores as well
       aliasing_uint64_t *__restrict dst64 = static_cast<aliasing_uint64_t *__restrict>(dest);
       const aliasing_unaligned_uint64_t *__restrict src64 = static_cast<const aliasing_unaligned_uint64_t *__restrict>(src);
       const aliasing_unaligned_uint64_t * const endsrc64 = src64 + n / sizeof(uint64_t);
       const uint8_t * const endsrc8 = static_cast<const uint8_t * const>(src) + n;
       while (src64 < endsrc64) {
          *dst64 = *src64;
          atomic_thread_fence(std::memory_order_release);
          dst64++; src64++;
       }
       if (reinterpret_cast<const uint8_t * const>(endsrc64) != endsrc8) {
          uint8_t *__restrict dst8 = reinterpret_cast<uint8_t *__restrict>(dst64);
          const uint8_t *__restrict src8 = reinterpret_cast<const uint8_t *__restrict>(src64);
          while (src8 < endsrc8) {
             *dst8 = *src8;
             atomic_thread_fence(std::memory_order_release);
             dst8++; src8++;
          }
       }
       return dest;
    }
    
    #if (_M_AMD64 || __x86_64__)
    #include <immintrin.h>
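    // 32-byte chunks with a release fence after each store. The AVX2 store is
    // not guaranteed to be a single atomic access; only the order in which
    // successive chunks become visible to other observers is preserved here.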
    static void * linear_memcpy_avx2(void *dest, const void * src, size_t n) __attribute__((target("avx2")));
    static void * linear_memcpy_avx2(void *dest, const void * src, size_t n)
    {
       __m256i *__restrict dst256 = static_cast<__m256i *__restrict>(dest);
       const __m256i *__restrict src256 = static_cast<const __m256i *__restrict>(src);
       const __m256i * const endsrc256 = src256 + n / sizeof(__m256i);
       const uint8_t * const endsrc8 = static_cast<const uint8_t *>(src) + n;
       while (src256 < endsrc256) {
          _mm256_storeu_si256(dst256, _mm256_loadu_si256(src256));
          atomic_thread_fence(std::memory_order_release);
          dst256++; src256++;
       }
       if (reinterpret_cast<const uint8_t * const>(endsrc256) != endsrc8)
          linear_memcpy_portable(dst256, src256, endsrc8 - reinterpret_cast<const uint8_t * const>(endsrc256));
       return dest;
    }
    
    static void * linear_memcpy_sse2(void *dest, const void * src, size_t n) __attribute__((target("sse2")));
    static void * linear_memcpy_sse2(void *dest, const void * src, size_t n)
    {
       __m128i *__restrict dst128 = static_cast<__m128i *__restrict>(dest);
       const __m128i *__restrict src128 = static_cast<const __m128i *__restrict>(src);
       const __m128i * const endsrc128 = src128 + n / sizeof(__m128i);
       const uint8_t * const endsrc8 = static_cast<const uint8_t *>(src) + n;
       while (src128 < endsrc128) {
          _mm_storeu_si128(dst128, _mm_loadu_si128(src128));
          atomic_thread_fence(std::memory_order_release);
          dst128++; src128++;
       }
       if (reinterpret_cast<const uint8_t * const>(endsrc128) != endsrc8)
          linear_memcpy_portable(dst128, src128, endsrc8 - reinterpret_cast<const uint8_t * const>(endsrc128));
       return dest;
    }
    
    // GNU ifunc resolver: chooses an implementation once, before the first call,
    // based on the features the running CPU supports. Wider aligned stores mean
    // fewer fences per byte copied, while visibility order is still preserved.
    static void *(*resolve_linear_memcpy(void))(void *, const void *, size_t)
    {
       __builtin_cpu_init();
       // All x64 targets support a minimum of SSE2
       return __builtin_cpu_supports("avx2") ? linear_memcpy_avx2 : linear_memcpy_sse2;
    }
    #ifdef __AVX2__
    // If AVX2 is specified to the compiler, alias to the avx2 impl so it can be inlined
    void * linear_memcpy(void *, const void *, size_t) __attribute__((alias("linear_memcpy_avx2")));
    #else
    void * linear_memcpy(void *, const void *, size_t) __attribute__((ifunc("resolve_linear_memcpy")));
    #endif
    #else
    void * linear_memcpy(void *, const void *, size_t) __attribute__((alias("linear_memcpy_portable")));
    #endif
    
    } // extern "C"
    

I welcome any feedback on the implementation. :)