I'm replacing some OpenMP code with standard C++11/C++14 atomics/thread support. Here is the OpenMP minimal code example:
#include <vector>
#include <cstdint>
void omp_atomic_add(std::vector<std::int64_t> const& rows,
std::vector<std::int64_t> const& cols,
std::vector<double>& values,
std::size_t const row,
std::size_t const col,
double const value)
{
for (auto i = rows[row]; i < rows[row+1]; ++i)
{
if (cols[i] == col)
{
#pragma omp atomic
values[i] += value;
return;
}
}
}
The code updates a CSR matrix format and occurs in a hot path for scientific computation. It is technically possible to use a std::mutex
but the values
vector can have millions of elements and is accessed many times more than that so a std::mutex
is too heavy.
Checking the assembly https://godbolt.org/g/nPE9Dt, it seems to use CAS (with the disclaimer my atomic and assembly knowledge is severely limited so my comments are likely incorrect):
mov rax, qword ptr [rdi]
mov rdi, qword ptr [rax + 8*rcx]
mov rax, qword ptr [rax + 8*rcx + 8]
cmp rdi, rax
jge .LBB0_6
mov rcx, qword ptr [rsi]
.LBB0_2: # =>This Inner Loop Header: Depth=1
cmp qword ptr [rcx + 8*rdi], r8
je .LBB0_3
inc rdi
cmp rdi, rax
jl .LBB0_2
jmp .LBB0_6
#### Interesting stuff happens from here onwards
.LBB0_3:
mov rcx, qword ptr [rdx] # Load values pointer into register
mov rax, qword ptr [rcx + 8*rdi] # Offset to value[i]
.LBB0_4: # =>This Inner Loop Header: Depth=1
movq xmm1, rax # Move value into floating point register
addsd xmm1, xmm0 # Add function arg to the value from the vector<double>
movq rdx, xmm1 # Move result to register
lock # x86 lock
cmpxchg qword ptr [rcx + 8*rdi], rdx # Compare exchange on the value in the vector
jne .LBB0_4 # If failed, go back to the top and try again
.LBB0_6:
ret
Is this possible to do using C++ atomics? The examples I've seen only use std::atomic<double> value{}
and nothing in the context of accessing a value through a pointer.
You can create a std::vector<std::atomic<double>>
but you cannot change its size.
The first thing I'd do is get gsl::span
or write my own variant. Then gsl::span<std::atomic<double>>
is a better model for values
than std::vector<std::atomic<double>>
.
Once we have done that, simply remove the #pragma omp atomic
and your code is atomic in c++20. In c++17 and before you have to manually implement +=
.
double old = values[i];
while(!values[i].compare_exchange_weak(old, old+value))
{}
Clang 5 generates:
omp_atomic_add(std::vector<long, std::allocator<long> > const&, std::vector<long, std::allocator<long> > const&, std::vector<std::atomic<double>, std::allocator<std::atomic<double> > >&, unsigned long, unsigned long, double): # @omp_atomic_add(std::vector<long, std::allocator<long> > const&, std::vector<long, std::allocator<long> > const&, std::vector<std::atomic<double>, std::allocator<std::atomic<double> > >&, unsigned long, unsigned long, double)
mov rax, qword ptr [rdi]
mov rdi, qword ptr [rax + 8*rcx]
mov rax, qword ptr [rax + 8*rcx + 8]
cmp rdi, rax
jge .LBB0_6
mov rcx, qword ptr [rsi]
.LBB0_2: # =>This Inner Loop Header: Depth=1
cmp qword ptr [rcx + 8*rdi], r8
je .LBB0_3
inc rdi
cmp rdi, rax
jl .LBB0_2
jmp .LBB0_6
.LBB0_3:
mov rax, qword ptr [rdx]
mov rax, qword ptr [rax + 8*rdi]
.LBB0_4: # =>This Inner Loop Header: Depth=1
mov rcx, qword ptr [rdx]
movq xmm1, rax
addsd xmm1, xmm0
movq rsi, xmm1
lock
cmpxchg qword ptr [rcx + 8*rdi], rsi
jne .LBB0_4
.LBB0_6:
ret
which seems identical to my casual glance.
There is a proposal for atomic_view
that lets you manipulate a non-atomic value through an atomic view. In general, C++ only lets you operate atomically on atomic data.