Atomic increment does not work as expected in interrupt

I have a Google Coral Dev Micro with a RT1176 SoC (800MHz Cortex-m7 and 400MHz Cortex-m4) the m7 is running FreeRTOS and the m4 is running bare-metal, compiling using GCC none eabi 9.3.1 with the following flags:

-Wall -Wno-psabi -mthumb -fno-common -ffunction-sections -fdata-sections -ffreestanding -fno-builtin -mapcs-frame --specs=nano.specs --specs=nosys.specs -u _printf_float -std=gnu99 -g -Os -save-temps -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16 -DNDEBUG

I want to communicate data generated in a timer interrupt between the two cores using shared memory and I have created my own lock-less FIFO buffer in shared memory:

fifo.h:

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>

template <typename T, size_t num_elements>
class LockLessFifo {
   public:
    LockLessFifo() : read_idx(0), write_idx(0), buffer({0}) {}

    size_t size() {
        if (read_idx <= write_idx) {
            return write_idx - read_idx;
        } else {
            return num_elements - (read_idx - write_idx);
        }
    }

    constexpr std::size_t capacity() { return buffer.max_size(); }

    void put(const T& entry) {
        // Wait for space
        while ((num_elements - size()) < 2) {
            ;
        }

        size_t new_write_idx = (write_idx + 1) % num_elements;
        buffer[new_write_idx] = entry;

        // Update the write index AFTER writing the data
        write_idx = new_write_idx;
    }

    T get(const bool block = true) {

        if (!block && size() == 0)
            return -1;
        
        while (size() == 0) {
            ;
        }

        // Consume
        size_t new_read_idx = (read_idx + 1) % num_elements;
        T retrieved_value = buffer[new_read_idx];

        // Update the read index AFTER reading (to signal availability to the producer)
        read_idx = new_read_idx;
        return retrieved_value;
    }

   private:
    std::atomic<std::size_t> read_idx;
    std::atomic<std::size_t> write_idx;
    std::array<std::atomic<T>, num_elements> buffer;
};

shared_fifo.h

#include "fifo.h"

using shared_element_t = uint32_t;
constexpr std::size_t SHARED_MEMORY_SIZE = 0x1700;  // Manually calculated free space using map file
constexpr std::size_t SHARED_MEMORY_ELEMENTS = SHARED_MEMORY_SIZE / sizeof(shared_element_t);
LockLessFifo<shared_element_t, SHARED_MEMORY_ELEMENTS> shared_fifo __attribute__((section(".noinit.$rpmsg_sh_mem")));

I don't expect the FIFO code to be perfect, but it is functional. I have only shared it out of completion and to give all context.

The FIFO is read from the m7 as follows: main_m7.cpp

#include <cstdio>

#include "libs/base/ipc_m7.h"

#include "shared_fifo.h"

extern "C" [[noreturn]] void app_main(void* param) {
    (void)param;

    coralmicro::IpcM7::GetSingleton()->StartM4();
    uint32_t counter1, counter2;
    while (true) {
        counter1 = shared_fifo.get();  // Works
        counter2 = shared_fifo.get();  // Always 0
        
        const std::size_t fifo_size = shared_fifo.size();
        constexpr std::size_t fifo_capacity = shared_fifo.capacity();
        
        printf("[M7] counter: %lu/%lu size: %u/%u\r\n", counter1, counter2, fifo_size, fifo_capacity);
    }
}

The FIFO is filled on the m4 as follows:

main_m4.cpp

#include <atomic>
#include <cmath>
#include <cstdio>

#include "fsl_pit.h"

#include "shared_fifo.h"


static std::atomic<std::size_t> counter1 = 0;
static std::atomic<std::size_t> counter2 = 0;

void my_pit_irq() {
    // Clear IRQ flag
    PIT_ClearStatusFlags(PIT1, kPIT_Chnl_0, kPIT_TimerFlag);
    
    counter1.store(counter1.load() + 1);  // Works
    counter2.fetch_add(1);                // Always 0

    SDK_ISR_EXIT_BARRIER;
}


extern "C" [[noreturn]] void app_main(void* param) {
    (void)param;

    configure_and_start_timer(); // removed for clarity

    while (true) {
        shared_fifo.put(counter1.load());   // Works
        shared_fifo.put(counter2.load());   // Always 0
    }
}

The output of this code is:

(...)    
[M7] counter: 71666076/0 size: 1470/1472
[M7] counter: 71666076/0 size: 1470/1472
[M7] counter: 71666077/0 size: 1470/1472
[M7] counter: 71666077/0 size: 1470/1472
[M7] counter: 71666077/0 size: 1470/1472
(...)

Why does the incorrect increment (counter1) work, but the correct increment (counter2) not? I've taken a look at the disassembly:

The PIT interrupt:

_Z10my_pit_irqv:
    
    // Clear IRQ flag
    ldr r3, .L3
    movs r2, #1
    str r2, [r3, #268]

    ldr r2, .L3+4      // Load .LANCHOR0
    dmb ish
    ldr r3, [r2]       // Load value stored at address of .LANCHOR0 (.load())
    dmb ish
    adds r3, r3, #1    //  Increment by 1
    dmb ish
    str r3, [r2]       // Store incremented value (.store())
    

    ldr r3, .L3+8      // Load .LANCHOR1
    dmb ish            
    dmb ish            // Dubble barrier?
.L2:
    ldrex r2, [r3]      // load 
    adds r2, r2, #1     // increment
    strex r1, r2, [r3]  // store
    cmp r1, #0          // check
    bne .L2             // retry

    dmb ish
    dsb 0xF

    bx lr             
.L4:
    .align  2
.L3:
    .word   1074626560
    .word   .LANCHOR0  // counter1
    .word   .LANCHOR1  // counter2

And the main:

app_main:
    push {r0, r1, r2, lr}
    ldr r6, .L14            // Counter1
    ldr r4, .L14+4
    ldr r5, .L14+8          // Counter2
.L13:

    // This is the same as for counter2
    add r1, sp, #4
    ldr r3, [r6]            // Load the value of counter1
    dmb ish
    mov r0, r4
    str r3, [sp, #4]
    bl  _ZN5LockLessFifoImLj1472EE3putERKm  // Put it in the FIFO
    dmb ish

    // This is the same as for counter1
    add r1, sp, #4
    ldr r3, [r5]            // Load the value of counter2
    dmb ish
    mov r0, r4
    str r3, [sp, #4]
    bl _ZN5LockLessFifoImLj1472EE3putERKm  // Put it in the FIFO
    b .L13
.L15:
    .align  2
.L14:
    .word   .LANCHOR0
    .word   _ZN511shared_fifoE
    .word   .LANCHOR1

I don't see where it is going wrong. I know the FIFO is functional, since the increment is working and we see the number printed in the terminal going up. The generated assembly is very similar and looks correct to me. Unfortunately, I cannot connect a debugger (yet) since I would need to solder the JTAG header to the board, and we need to wait before making any modifications.

Thanks for taking the time to take a look.

Solution

With some help of @NateEldredge in the comments I've found the answer: C++ atomics do not compile correctly on this hardware platform. The compiler generates dbm ish instructions, which only synchronize the inner shareable domain.

I've verified this by incrementing a shared counter 1000000 times on each core, and the result did not equal 2000000.

This could possibly by fixed by manually replacing the dbm ish with dbm osh instructions, but a quick and dirty fix was not sufficient:

// the attempt that didn't work
for (size_t i = 0; i < 1000000; i++) {
    asm("dmb osh" : : : "memory");
    unica::shared_fifo.shared_counter++;  // not actually atomic wrt. other core
    asm("dmb osh" : : : "memory");
}

Unfortunately, I'll just drop using the std::atomics and go back to using volatiles and masking IRQ where necessary

Update after a discussion on Reddit: https://www.reddit.com/r/embedded/comments/1bv1hqw/comment/kxz7yz8/?context=3

It seems like the memory region was configured as not shareable. I will test if this is the issue if I have time.