I'm developing a multicore, multithreaded software library in which I want to offer update-order preserving lock-free shared memory objects that might span multiple cache lines.
Specifically, suppose I have a vector X of cache-line-sized objects: X[0], …, X[K], each occupying exactly one cache line. I write to them in index order: X[0] first, then X[1], and so on.
From the writing thread itself I will obviously observe memory semantics that respect the update order. But if a second thread reads X[K], the question is: will it see states for X[0]…X[K-1] that are at least as current as the state it sees for X[K]?
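To make the property concrete, here is a minimal sketch of the guarantee I'm after, using plain std::atomic<int> elements purely for illustration (my real elements are whole cache lines, not single ints):

#include <atomic>

std::atomic<int> X[4]; // stand-ins for the cache-line-sized elements

void writer() {
    // Stores are issued in index order; each release store makes all earlier
    // stores visible to a reader that synchronizes on a later element.
    for (int i = 0; i < 4; ++i)
        X[i].store(i + 1, std::memory_order_release);
}

void reader() {
    // If this acquire load observes the writer's store to X[3], then the
    // stores to X[0]..X[2] are guaranteed to be visible as well.
    if (X[3].load(std::memory_order_acquire) == 4) {
        int x0 = X[0].load(std::memory_order_relaxed); // sees 1 (or something newer)
        (void)x0;
    }
}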
With locking, we do get this guarantee. But if memcpy is used to copy data into the vector, we lose it: the POSIX/C semantics of memcpy guarantee neither index-order updates nor memory-order updates nor any other ordering. You are only guaranteed that after memcpy returns, the entire update has been performed.
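For comparison, the lock-based version that does give the guarantee looks roughly like this (the mutex and buffer layout are just placeholders):

#include <cstring>
#include <mutex>
#include <stddef.h>

struct Shared {
    std::mutex m;            // placeholder lock guarding the vector
    unsigned char vec[4096]; // placeholder storage for X[0]..X[K]
};

void locked_update(Shared &s, const void *src, size_t n) {
    std::lock_guard<std::mutex> g(s.m);
    std::memcpy(s.vec, src, n); // readers take the same lock, so they never
                                // observe a partially completed copy
}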
My question: is there already an order-preserving memcpy with similar speed but with the desired guarantee? And if not, can such a primitive be implemented without locking?
Assume my target platforms are x86 and ARM.
I found the answer by Peter Cordes to this question insightful, detailed, and very helpful. However, I didn't see his suggestions put into code, so for posterity and for future people who need a quick solution to this problem of requiring ordered writes for DMA or lockless algorithms, I'm including the code I wrote based on that answer. I built it with gcc 4.9 on x64 and armv7-a, though I only ran and tested it on x64.
#include <atomic>
#include <stdint.h>   // uint8_t, uint64_t, uintptr_t
#include <stdlib.h>
#include <algorithm>  // std::min

extern "C" {
static void * linear_memcpy_portable(void *__restrict dest, const void *__restrict src, size_t n)
{
    void * const ret = dest; // memcpy returns the original dest pointer
    // Copy single bytes until dest is 8-byte aligned (or n is exhausted)
    if ((uintptr_t)dest & (sizeof(uint64_t) - 1)) {
        uint8_t *__restrict dst8 = reinterpret_cast<uint8_t *__restrict>(dest);
        const uint8_t *__restrict src8 = reinterpret_cast<const uint8_t *__restrict>(src);
        // Number of bytes until dest reaches the next 8-byte boundary, capped at n
        const size_t align_n = std::min(n, (size_t)(-(uintptr_t)dest & (sizeof(uint64_t) - 1)));
        const uint8_t * const endsrc8 = static_cast<const uint8_t *>(src) + align_n;
        while (src8 < endsrc8) {
            *dst8 = *src8;
            atomic_thread_fence(std::memory_order_release);
            dst8++; src8++;
        }
        dest = dst8;
        src = src8;
        n = n - align_n;
    }
    // Main loop: 64-bit chunks. The source may still be misaligned, so load
    // through a may_alias, aligned(1) type; a release fence follows each store.
    typedef uint64_t __attribute__((may_alias, aligned(1))) aliasing_unaligned_uint64_t;
    uint64_t *__restrict dst64 = static_cast<uint64_t *__restrict>(dest);
    const aliasing_unaligned_uint64_t *__restrict src64 = static_cast<const aliasing_unaligned_uint64_t *__restrict>(src);
    const uint64_t * const endsrc64 = src64 + n / sizeof(uint64_t);
    const uint8_t * const endsrc8 = static_cast<const uint8_t *>(src) + n;
    while (src64 < endsrc64) {
        *dst64 = *src64;
        atomic_thread_fence(std::memory_order_release);
        dst64++; src64++;
    }
    // Tail: copy any remaining bytes one at a time
    if (reinterpret_cast<const uint8_t *>(endsrc64) != endsrc8) {
        uint8_t *__restrict dst8 = reinterpret_cast<uint8_t *__restrict>(dst64);
        const uint8_t *__restrict src8 = reinterpret_cast<const uint8_t *__restrict>(src64);
        while (src8 < endsrc8) {
            *dst8 = *src8;
            atomic_thread_fence(std::memory_order_release);
            dst8++; src8++;
        }
    }
    return ret;
}
#if (_M_AMD64 || __x86_64__)
#include <immintrin.h>
static void * linear_memcpy_avx2(void *dest, const void *src, size_t n) __attribute__((target("avx2")));
static void * linear_memcpy_avx2(void *dest, const void *src, size_t n)
{
    // 32-byte chunks in ascending address order, release fence after each store
    __m256i *__restrict dst256 = static_cast<__m256i *__restrict>(dest);
    const __m256i *__restrict src256 = static_cast<const __m256i *__restrict>(src);
    const __m256i * const endsrc256 = src256 + n / sizeof(__m256i);
    const uint8_t * const endsrc8 = static_cast<const uint8_t *>(src) + n;
    while (src256 < endsrc256) {
        _mm256_storeu_si256(dst256, _mm256_loadu_si256(src256));
        atomic_thread_fence(std::memory_order_release);
        dst256++; src256++;
    }
    // Hand the tail to the portable version
    if (reinterpret_cast<const uint8_t *>(endsrc256) != endsrc8)
        linear_memcpy_portable(dst256, src256, endsrc8 - reinterpret_cast<const uint8_t *>(endsrc256));
    return dest;
}
static void * linear_memcpy_sse2(void *dest, const void *src, size_t n) __attribute__((target("sse2")));
static void * linear_memcpy_sse2(void *dest, const void *src, size_t n)
{
    // 16-byte chunks in ascending address order, release fence after each store
    __m128i *__restrict dst128 = static_cast<__m128i *__restrict>(dest);
    const __m128i *__restrict src128 = static_cast<const __m128i *__restrict>(src);
    const __m128i * const endsrc128 = src128 + n / sizeof(__m128i);
    const uint8_t * const endsrc8 = static_cast<const uint8_t *>(src) + n;
    while (src128 < endsrc128) {
        _mm_storeu_si128(dst128, _mm_loadu_si128(src128));
        atomic_thread_fence(std::memory_order_release);
        dst128++; src128++;
    }
    // Hand the tail to the portable version
    if (reinterpret_cast<const uint8_t *>(endsrc128) != endsrc8)
        linear_memcpy_portable(dst128, src128, endsrc8 - reinterpret_cast<const uint8_t *>(endsrc128));
    return dest;
}
// ifunc resolver: picks an implementation once, at load time
static void *(*resolve_linear_memcpy(void))(void *, const void *, size_t)
{
    __builtin_cpu_init();
    // All x86-64 targets support at least SSE2
    return __builtin_cpu_supports("avx2") ? linear_memcpy_avx2 : linear_memcpy_sse2;
}

#ifdef __AVX2__
// If AVX2 is enabled at compile time, alias to the AVX2 implementation so it can be inlined
void * linear_memcpy(void *, const void *, size_t) __attribute__((alias("linear_memcpy_avx2")));
#else
void * linear_memcpy(void *, const void *, size_t) __attribute__((ifunc("resolve_linear_memcpy")));
#endif
#else
void * linear_memcpy(void *, const void *, size_t) __attribute__((alias("linear_memcpy_portable")));
#endif
} // extern "C"
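For completeness, here is a rough usage sketch; the Line layout and the reader side (in particular the acquire fence) are my assumptions about how the data would be consumed, not part of the primitive itself:

#include <atomic>
#include <cstring>
#include <stddef.h>

extern "C" void * linear_memcpy(void *dest, const void *src, size_t n); // defined above

// Hypothetical layout mirroring the question: K+1 cache-line-sized slots.
struct alignas(64) Line { unsigned char bytes[64]; };
static Line X[8]; // assumed to be shared with the reading thread

void writer(const Line (&update)[8]) {
    // Stores land in ascending address order, X[0] first and X[7] last,
    // with a release fence after each chunk.
    linear_memcpy(X, update, sizeof X);
}

void reader(Line &newest, Line &oldest) {
    // Read X[7] first, then X[0]; the acquire fence pairs (in practice) with
    // the release fences inside linear_memcpy, so X[0] is at least as current
    // as the X[7] we observed. (Formally these plain reads still race with the
    // writer; this only sketches the intended ordering.)
    std::memcpy(&newest, &X[7], sizeof newest);
    std::atomic_thread_fence(std::memory_order_acquire);
    std::memcpy(&oldest, &X[0], sizeof oldest);
}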
I welcome any feedback on the implementation. :)