On an STM32 (ARM Cortex-M4), I have a simple bare-metal blinky which strobes an LED on. It works fine when I use a spin wait:
/* Volatile 32-bit word; the macros below access memory through this type. */
typedef uint32_t volatile vuint32_t;

/* Read-modify-write: OR `bits` into the 32-bit word located at `addr`. */
#define SET(addr, bits) (*(vuint32_t *)(addr) |= (bits))

/* Read-modify-write: clear `bits` in the 32-bit word located at `addr`. */
#define CLR(addr, bits) (*(vuint32_t *)(addr) &= ~(bits))
for (;;) {
SET(GPIOA+_ODR, BIT(5));
// spin(100);
for (volatile int i = 100 * 1000; i; --i) asm("nop");
CLR(GPIOA+_ODR, BIT(5));
//spin(1000);
for (volatile int i = 1000 * 1000; i; --i) asm("nop");
}
The above code works perfectly: it strobes the LED on periodically. If I uncomment one of the spin() calls and comment out the corresponding for loop, it also works well. But if I use spin() for both, the light is reversed: it is mainly on, and blinks off.
Using gdb, it seems like it is switching around the order of the calls (or otherwise executing them differently than expected). I can't figure out why it does this, though, since I have everything marked volatile:
/*
 * Busy-wait on the SysTick timer for a duration derived from `ms`.
 *
 * Writes (CPU_HZ / 1000) * ms - 1 to the reload register, sets bits 2 and 0
 * of the control register, then polls until COUNTFLAG reads as set.
 *
 * ms: requested delay; scaled by CPU_HZ / 1000 ticks per unit.
 *
 * NOTE(review): nothing here resets COUNTFLAG or the current-value register
 * before polling, and the control register is never cleared on return, so
 * the counter presumably keeps running between calls. A second call may
 * then wait on the countdown started by the first one -- confirm against
 * the SysTick register documentation.
 */
void spin(uint32_t ms) {
/* Tick count for the requested delay. */
uint32_t ticks = (CPU_HZ / 1000) * ms;
(*(vuint32_t*) (STK+_LOAD)) = ticks - 1;
/* Read-modify-write that sets bits 2 and 0 of the control register. */
SET((STK+_CTRL), (BIT(2) | BIT(0)));
/*
 * NOTE(review): the cast binds tighter than '+', so this expression reads
 * the word at STK and then adds _CTRL to the value read, unlike the
 * parenthesised (STK+_CTRL) used above. It polls the intended register
 * only if _CTRL is 0 -- verify.
 */
while (! ((*(vuint32_t*) STK+_CTRL) & COUNTFLAG)) asm volatile("nop");
}
The macros like GPIOA, STK, and _LOAD are there because I'm doing this bare metal, with no external libs. But I've tested them and they all work well in isolation. The cause seems to be that the compiler is changing the order of the calls (or omitting some?). I was able to confirm this using objdump, at least some of the time.
I'm using arm-none-eabi-gcc 10.3.1. See also: Non-conforming optimizations of volatile in gcc 11.1.
With only one call to spin, spin(100) acts as expected and spin(1000) acts as expected, but spin(5000) seems to spin for only 1 second (as far as I can time it).
I'm using -Os. Switching to -O0 does not solve the problem.
As requested, here is the disassembly of spin. Note that, as per the suggestions, I've tried various changes, none of which behaves as expected; below is the current version.
void spin(uint32_t ms) {
80001ac: b480 push {r7}
80001ae: b085 sub sp, #20
80001b0: af00 add r7, sp, #0
80001b2: 6078 str r0, [r7, #4]
uint32_t ticks = (CPU_HZ / 1000) * ms;
80001b4: 687b ldr r3, [r7, #4]
80001b6: f44f 527a mov.w r2, #16000 ; 0x3e80
80001ba: fb02 f303 mul.w r3, r2, r3
80001be: 60fb str r3, [r7, #12]
(*(vuint32_t*) (STK+_LOAD)) = ticks - 1; // The -1 is necessary as per.
80001c0: 4a11 ldr r2, [pc, #68] ; (8000208 <spin+0x5c>)
80001c2: 68fb ldr r3, [r7, #12]
80001c4: 3b01 subs r3, #1
80001c6: 6013 str r3, [r2, #0]
(*(vuint32_t*)(STK+_VAL)) = 0;
80001c8: 4b10 ldr r3, [pc, #64] ; (800020c <spin+0x60>)
80001ca: 2200 movs r2, #0
80001cc: 601a str r2, [r3, #0]
SET(STK + _CTRL, BIT(2) | BIT(0)); // Enable SysTick @CPU_HZ
80001ce: 4b10 ldr r3, [pc, #64] ; (8000210 <spin+0x64>)
80001d0: 681b ldr r3, [r3, #0]
80001d2: 4a0f ldr r2, [pc, #60] ; (8000210 <spin+0x64>)
80001d4: f043 0305 orr.w r3, r3, #5
80001d8: 6013 str r3, [r2, #0]
CLR((STK+_CTRL), (BIT(16)));
80001da: 4b0d ldr r3, [pc, #52] ; (8000210 <spin+0x64>)
80001dc: 681b ldr r3, [r3, #0]
80001de: 4a0c ldr r2, [pc, #48] ; (8000210 <spin+0x64>)
80001e0: f423 3380 bic.w r3, r3, #65536 ; 0x10000
80001e4: 6013 str r3, [r2, #0]
while (! ((*(vuint32_t*) STK+_CTRL) & COUNTFLAG)) asm("");
80001e6: e7ff b.n 80001e8 <spin+0x3c>
80001e8: 4b09 ldr r3, [pc, #36] ; (8000210 <spin+0x64>)
80001ea: 681b ldr r3, [r3, #0]
80001ec: f403 3380 and.w r3, r3, #65536 ; 0x10000
80001f0: 2b00 cmp r3, #0
80001f2: d0f9 beq.n 80001e8 <spin+0x3c>
(*(vuint32_t*) (STK+_CTRL)) = 0;
80001f4: 4b06 ldr r3, [pc, #24] ; (8000210 <spin+0x64>)
80001f6: 2200 movs r2, #0
80001f8: 601a str r2, [r3, #0]
}
80001fa: bf00 nop
80001fc: 3714 adds r7, #20
80001fe: 46bd mov sp, r7
8000200: f85d 7b04 ldr.w r7, [sp], #4
8000204: 4770 bx lr
8000206: bf00 nop
8000208: e000e014 .word 0xe000e014
800020c: e000e018 .word 0xe000e018
8000210: e000e010 .word 0xe000e010
And here is the disassembly of the relevant portions of main:
08000214 <main>:
int main(void) {
...
SET((GPIOA + _ODR), BIT(5));
8000262: 4b0e ldr r3, [pc, #56] ; (800029c <main+0x88>)
8000264: 681b ldr r3, [r3, #0]
8000266: 4a0d ldr r2, [pc, #52] ; (800029c <main+0x88>)
8000268: f043 0320 orr.w r3, r3, #32
800026c: 6013 str r3, [r2, #0]
spin(100);
800026e: 2064 movs r0, #100 ; 0x64
8000270: f7ff ff9c bl 80001ac <spin>
//for (volatile int i = 1000 * 1000; i; --i) asm("nop");
CLR((GPIOA + _ODR), BIT(5));
8000274: 4b09 ldr r3, [pc, #36] ; (800029c <main+0x88>)
8000276: 681b ldr r3, [r3, #0]
8000278: 4a08 ldr r2, [pc, #32] ; (800029c <main+0x88>)
800027a: f023 0320 bic.w r3, r3, #32
800027e: 6013 str r3, [r2, #0]
spin(10000);
8000280: f242 7010 movw r0, #10000 ; 0x2710
8000284: f7ff ff92 bl 80001ac <spin>
SET((GPIOA + _ODR), BIT(5));
8000288: e7eb b.n 8000262 <main+0x4e>
800028a: bf00 nop
...
800029c: 40020014 .word 0x40020014
...
Do not blame the compiler! The spin function is wrong: you need to reset COUNTFLAG before your while loop (you can do this by writing to the VAL register too).
/* Access-permission qualifiers applied to register-block members. */
#define __IM  volatile const /* read-only member */
#define __OM  volatile       /* write-only member */
#define __IOM volatile       /* read/write member */

/* Register layout of the SysTick block; offsets are relative to SysTick_BASE. */
typedef struct
{
    __IOM uint32_t CTRL;  /* 0x000 (R/W) SysTick Control and Status Register */
    __IOM uint32_t LOAD;  /* 0x004 (R/W) SysTick Reload Value Register */
    __IOM uint32_t VAL;   /* 0x008 (R/W) SysTick Current Value Register */
    __IM  uint32_t CALIB; /* 0x00C (R/ ) SysTick Calibration Register */
} SysTick_Type;

/* Base addresses: the SysTick block sits 0x10 bytes above SCS_BASE. */
#define SCS_BASE     (0xE000E000UL)
#define SysTick_BASE (SCS_BASE + 0x0010UL)

/* The SysTick registers, viewed as a SysTick_Type at SysTick_BASE. */
#define SysTick      ((SysTick_Type *) SysTick_BASE)
/*
 * Busy-wait for `ms` milliseconds by polling the SysTick timer.
 *
 * Programs a one-shot countdown of (CPU_HZ / 1000) * ms ticks, waits for the
 * count flag (bit 16 of CTRL) to be raised, then switches the timer off so
 * that each call starts from a known state.
 *
 * ms: delay in milliseconds.
 *
 * NOTE(review): the SysTick reload value is understood to be 24 bits wide,
 * so (CPU_HZ / 1000) * ms presumably has to stay at or below 0x1000000;
 * larger requests (and ms == 0, whose "- 1" wraps) would then wait for a
 * truncated count -- confirm against the SysTick register documentation.
 */
void spin(uint32_t ms)
{
    /* Reload value: N ticks are counted by loading N - 1. */
    SysTick->LOAD = (CPU_HZ / 1000) * ms - 1;

    /* Writing VAL resets COUNTFLAG, so a stale flag cannot end the wait early. */
    SysTick->VAL = 0UL;

    /* Bit 0 enables the counter; bit 2 clocks it at CPU_HZ. */
    SysTick->CTRL = (1 << 0) | (1 << 2);

    /* Poll COUNTFLAG (bit 16) until the countdown has completed. */
    while (!(SysTick->CTRL & (1 << 16))) {
    }

    /* Stop the timer so it does not keep running between calls. */
    SysTick->CTRL = 0;
}
Generated code:
/* Compiler output for spin(ms); r0 carries ms into the multiply below. */
spin:
mov r3, #8000 /* CPU_HZ / 1000 as compiled for this listing */
mov r2, #-536813568 /* 0xE000E000, i.e. SCS_BASE */
mul r0, r3, r0 /* r0 = (CPU_HZ / 1000) * ms */
subs r0, r0, #1 /* ... - 1 */
movs r1, #0 /* value for VAL */
movs r3, #5 /* (1 << 0) | (1 << 2) */
str r0, [r2, #20] /* SysTick->LOAD (SCS_BASE + 0x14) */
str r1, [r2, #24] /* SysTick->VAL = 0 (SCS_BASE + 0x18) */
str r3, [r2, #16] /* SysTick->CTRL = 5 (SCS_BASE + 0x10) */
.L2:
ldr r3, [r2, #16] /* read SysTick->CTRL */
lsls r3, r3, #15 /* shift bit 16 up into bit 31, the sign bit */
bpl .L2 /* result non-negative: bit 16 was clear, keep polling */
movs r3, #0
str r3, [r2, #16] /* SysTick->CTRL = 0 */
bx lr