I have a Google Coral Dev Micro with a RT1176 SoC (800MHz Cortex-m7 and 400MHz Cortex-m4) the m7 is running FreeRTOS and the m4 is running bare-metal, compiling using GCC none eabi 9.3.1 with the following flags:
-Wall -Wno-psabi -mthumb -fno-common -ffunction-sections -fdata-sections -ffreestanding -fno-builtin -mapcs-frame --specs=nano.specs --specs=nosys.specs -u _printf_float -std=gnu99 -g -Os -save-temps -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16 -DNDEBUG
I want to communicate data generated in a timer interrupt between the two cores using shared memory and I have created my own lock-less FIFO buffer in shared memory:
fifo.h:
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
template <typename T, size_t num_elements>
class LockLessFifo {
public:
LockLessFifo() : read_idx(0), write_idx(0), buffer({0}) {}
size_t size() {
if (read_idx <= write_idx) {
return write_idx - read_idx;
} else {
return num_elements - (read_idx - write_idx);
}
}
constexpr std::size_t capacity() { return buffer.max_size(); }
void put(const T& entry) {
// Wait for space
while ((num_elements - size()) < 2) {
;
}
size_t new_write_idx = (write_idx + 1) % num_elements;
buffer[new_write_idx] = entry;
// Update the write index AFTER writing the data
write_idx = new_write_idx;
}
T get(const bool block = true) {
if (!block && size() == 0)
return -1;
while (size() == 0) {
;
}
// Consume
size_t new_read_idx = (read_idx + 1) % num_elements;
T retrieved_value = buffer[new_read_idx];
// Update the read index AFTER reading (to signal availability to the producer)
read_idx = new_read_idx;
return retrieved_value;
}
private:
std::atomic<std::size_t> read_idx;
std::atomic<std::size_t> write_idx;
std::array<std::atomic<T>, num_elements> buffer;
};
shared_fifo.h
#include "fifo.h"
using shared_element_t = uint32_t;
constexpr std::size_t SHARED_MEMORY_SIZE = 0x1700; // Manually calculated free space using map file
constexpr std::size_t SHARED_MEMORY_ELEMENTS = SHARED_MEMORY_SIZE / sizeof(shared_element_t);
LockLessFifo<shared_element_t, SHARED_MEMORY_ELEMENTS> shared_fifo __attribute__((section(".noinit.$rpmsg_sh_mem")));
I don't expect the FIFO code to be perfect, but it is functional. I have only shared it out of completion and to give all context.
The FIFO is read from the m7 as follows: main_m7.cpp
#include <cstdio>
#include "libs/base/ipc_m7.h"
#include "shared_fifo.h"
extern "C" [[noreturn]] void app_main(void* param) {
(void)param;
coralmicro::IpcM7::GetSingleton()->StartM4();
uint32_t counter1, counter2;
while (true) {
counter1 = shared_fifo.get(); // Works
counter2 = shared_fifo.get(); // Always 0
const std::size_t fifo_size = shared_fifo.size();
constexpr std::size_t fifo_capacity = shared_fifo.capacity();
printf("[M7] counter: %lu/%lu size: %u/%u\r\n", counter1, counter2, fifo_size, fifo_capacity);
}
}
The FIFO is filled on the m4 as follows:
main_m4.cpp
#include <atomic>
#include <cmath>
#include <cstdio>
#include "fsl_pit.h"
#include "shared_fifo.h"
static std::atomic<std::size_t> counter1 = 0;
static std::atomic<std::size_t> counter2 = 0;
void my_pit_irq() {
// Clear IRQ flag
PIT_ClearStatusFlags(PIT1, kPIT_Chnl_0, kPIT_TimerFlag);
counter1.store(counter1.load() + 1); // Works
counter2.fetch_add(1); // Always 0
SDK_ISR_EXIT_BARRIER;
}
extern "C" [[noreturn]] void app_main(void* param) {
(void)param;
configure_and_start_timer(); // removed for clarity
while (true) {
shared_fifo.put(counter1.load()); // Works
shared_fifo.put(counter2.load()); // Always 0
}
}
The output of this code is:
(...)
[M7] counter: 71666076/0 size: 1470/1472
[M7] counter: 71666076/0 size: 1470/1472
[M7] counter: 71666077/0 size: 1470/1472
[M7] counter: 71666077/0 size: 1470/1472
[M7] counter: 71666077/0 size: 1470/1472
(...)
Why does the incorrect increment (counter1) work, but the correct increment (counter2) not? I've taken a look at the disassembly:
The PIT interrupt:
_Z10my_pit_irqv:
// Clear IRQ flag
ldr r3, .L3
movs r2, #1
str r2, [r3, #268]
ldr r2, .L3+4 // Load .LANCHOR0
dmb ish
ldr r3, [r2] // Load value stored at address of .LANCHOR0 (.load())
dmb ish
adds r3, r3, #1 // Increment by 1
dmb ish
str r3, [r2] // Store incremented value (.store())
ldr r3, .L3+8 // Load .LANCHOR1
dmb ish
dmb ish // Dubble barrier?
.L2:
ldrex r2, [r3] // load
adds r2, r2, #1 // increment
strex r1, r2, [r3] // store
cmp r1, #0 // check
bne .L2 // retry
dmb ish
dsb 0xF
bx lr
.L4:
.align 2
.L3:
.word 1074626560
.word .LANCHOR0 // counter1
.word .LANCHOR1 // counter2
And the main:
app_main:
push {r0, r1, r2, lr}
ldr r6, .L14 // Counter1
ldr r4, .L14+4
ldr r5, .L14+8 // Counter2
.L13:
// This is the same as for counter2
add r1, sp, #4
ldr r3, [r6] // Load the value of counter1
dmb ish
mov r0, r4
str r3, [sp, #4]
bl _ZN5LockLessFifoImLj1472EE3putERKm // Put it in the FIFO
dmb ish
// This is the same as for counter1
add r1, sp, #4
ldr r3, [r5] // Load the value of counter2
dmb ish
mov r0, r4
str r3, [sp, #4]
bl _ZN5LockLessFifoImLj1472EE3putERKm // Put it in the FIFO
b .L13
.L15:
.align 2
.L14:
.word .LANCHOR0
.word _ZN511shared_fifoE
.word .LANCHOR1
I don't see where it is going wrong. I know the FIFO is functional, since the increment is working and we see the number printed in the terminal going up. The generated assembly is very similar and looks correct to me. Unfortunately, I cannot connect a debugger (yet) since I would need to solder the JTAG header to the board, and we need to wait before making any modifications.
Thanks for taking the time to take a look.
With some help of @NateEldredge in the comments I've found the answer: C++ atomics do not compile correctly on this hardware platform. The compiler generates dbm ish
instructions, which only synchronize the inner shareable domain.
I've verified this by incrementing a shared counter 1000000 times on each core, and the result did not equal 2000000.
This could possibly by fixed by manually replacing the dbm ish
with dbm osh
instructions, but a quick and dirty fix was not sufficient:
// the attempt that didn't work
for (size_t i = 0; i < 1000000; i++) {
asm("dmb osh" : : : "memory");
unica::shared_fifo.shared_counter++; // not actually atomic wrt. other core
asm("dmb osh" : : : "memory");
}
Unfortunately, I'll just drop using the std::atomic
s and go back to using volatile
s and masking IRQ where necessary
Update after a discussion on Reddit: https://www.reddit.com/r/embedded/comments/1bv1hqw/comment/kxz7yz8/?context=3
It seems like the memory region was configured as not shareable. I will test if this is the issue if I have time.