Search code examples
Tags: c, gcc, embedded, inline-assembly, volatile

Is gcc optimizing out my wait code, despite marking it volatile?


On an STM32 (ARM Cortex-M4), I have a simple bare-metal blinky that strobes an LED on. It works fine when I use a spin wait:

// Volatile 32-bit word; SET/CLR below access hardware registers through it.
typedef volatile uint32_t vuint32_t;
// Read-modify-write: OR `bits` into the 32-bit word at address `addr`.
#define SET(addr, bits) (*((vuint32_t*) (addr)) |= (bits))
// Read-modify-write: clear `bits` in the 32-bit word at address `addr`.
#define CLR(addr, bits) (*((vuint32_t*) (addr)) &= ~(bits))

// Blink forever: set BIT(5) in GPIOA's ODR, wait, clear it, wait ten times longer.
for (;;) {
  SET(GPIOA+_ODR, BIT(5));
  // spin(100);
  // Software delay: `i` is volatile, so every decrement and test is a real
  // memory access and the countdown is not removed by the optimizer.
  for (volatile int i = 100 * 1000; i; --i) asm("nop");
  CLR(GPIOA+_ODR, BIT(5));
  //spin(1000);
  // Same software delay with a 10x larger starting count.
  for (volatile int i = 1000 * 1000; i; --i) asm("nop");
}

The above code works perfectly - it strobes the LED on periodically. And, if I uncomment one of the spin() calls, and comment out the corresponding for loop, it also works well. But if I use spin() for both, the light is reversed: it is mainly on, and blinks off.

Using gdb, it seems like it is switching around the order of the calls (or otherwise executing them differently than expected). I can't figure out why it does this, though, since I have everything marked volatile:

// Intended to busy-wait for `ms` milliseconds by polling COUNTFLAG in the
// SysTick control register.
// NOTE(review): the current-value register is never written and the counter
// is never stopped here, so a countdown left running by an earlier call is
// not restarted - verify against the SysTick documentation.
void spin(uint32_t ms) {
  // Timer ticks for the requested duration: ticks per millisecond times ms.
  uint32_t ticks = (CPU_HZ / 1000) * ms;
  // Write the tick count (minus one) to the register at STK+_LOAD.
  (*(vuint32_t*) (STK+_LOAD)) = ticks - 1;
  // Read-modify-write of the control register: set bits 2 and 0.
  SET((STK+_CTRL), (BIT(2) | BIT(0)));
  // NOTE(review): the cast binds tighter than `+`, so the condition parses as
  // (*(vuint32_t*)STK) + _CTRL rather than *(vuint32_t*)(STK + _CTRL) as in
  // the lines above; the two only agree if _CTRL is 0 - confirm.
  while (! ((*(vuint32_t*) STK+_CTRL) & COUNTFLAG)) asm volatile("nop");
}

The macros like GPIOA, STK, and _LOAD are because I'm doing this bare metal, no external libs. But I've tested them and they all work well under isolation. The cause seems to be that the compiler is changing the order (or omitting?). I was able to confirm this using objdump, at least some of the time.

I'm using arm-none-eabi-gcc 10.3.1. See also: Non-conforming optimizations of volatile in gcc 11.1.


Updates:

With only one call to spin, spin(100) acts as expected, spin(1000) acts as expected, but spin(5000) seems to only spin for 1 second (as far as I can time).

I'm using -Os. Switching to -O0 does not solve the problem.


As requested, here is the disassembly of spin. Note that, as per the suggestions, I've tried various changes, none of which behave as expected - below is the current one.

void spin(uint32_t ms) {
 80001ac:       b480            push    {r7}
 80001ae:       b085            sub     sp, #20
 80001b0:       af00            add     r7, sp, #0
 80001b2:       6078            str     r0, [r7, #4]
        uint32_t ticks = (CPU_HZ / 1000) * ms;
 80001b4:       687b            ldr     r3, [r7, #4]
 80001b6:       f44f 527a       mov.w   r2, #16000      ; 0x3e80
 80001ba:       fb02 f303       mul.w   r3, r2, r3
 80001be:       60fb            str     r3, [r7, #12]
        (*(vuint32_t*) (STK+_LOAD)) = ticks - 1;  // The -1 is necessary as per.
 80001c0:       4a11            ldr     r2, [pc, #68]   ; (8000208 <spin+0x5c>)
 80001c2:       68fb            ldr     r3, [r7, #12]
 80001c4:       3b01            subs    r3, #1
 80001c6:       6013            str     r3, [r2, #0]
    (*(vuint32_t*)(STK+_VAL)) = 0;
 80001c8:       4b10            ldr     r3, [pc, #64]   ; (800020c <spin+0x60>)
 80001ca:       2200            movs    r2, #0
 80001cc:       601a            str     r2, [r3, #0]
        SET(STK + _CTRL, BIT(2) | BIT(0));             // Enable SysTick @CPU_HZ
 80001ce:       4b10            ldr     r3, [pc, #64]   ; (8000210 <spin+0x64>)
 80001d0:       681b            ldr     r3, [r3, #0]
 80001d2:       4a0f            ldr     r2, [pc, #60]   ; (8000210 <spin+0x64>)
 80001d4:       f043 0305       orr.w   r3, r3, #5
 80001d8:       6013            str     r3, [r2, #0]
        CLR((STK+_CTRL), (BIT(16)));
 80001da:       4b0d            ldr     r3, [pc, #52]   ; (8000210 <spin+0x64>)
 80001dc:       681b            ldr     r3, [r3, #0]
 80001de:       4a0c            ldr     r2, [pc, #48]   ; (8000210 <spin+0x64>)
 80001e0:       f423 3380       bic.w   r3, r3, #65536  ; 0x10000
 80001e4:       6013            str     r3, [r2, #0]
        while (! ((*(vuint32_t*) STK+_CTRL) & COUNTFLAG)) asm("");
 80001e6:       e7ff            b.n     80001e8 <spin+0x3c>
 80001e8:       4b09            ldr     r3, [pc, #36]   ; (8000210 <spin+0x64>)
 80001ea:       681b            ldr     r3, [r3, #0]
 80001ec:       f403 3380       and.w   r3, r3, #65536  ; 0x10000
 80001f0:       2b00            cmp     r3, #0
 80001f2:       d0f9            beq.n   80001e8 <spin+0x3c>
        (*(vuint32_t*) (STK+_CTRL)) = 0;
 80001f4:       4b06            ldr     r3, [pc, #24]   ; (8000210 <spin+0x64>)
 80001f6:       2200            movs    r2, #0
 80001f8:       601a            str     r2, [r3, #0]
}
 80001fa:       bf00            nop
 80001fc:       3714            adds    r7, #20
 80001fe:       46bd            mov     sp, r7
 8000200:       f85d 7b04       ldr.w   r7, [sp], #4
 8000204:       4770            bx      lr
 8000206:       bf00            nop
 8000208:       e000e014        .word   0xe000e014
 800020c:       e000e018        .word   0xe000e018
 8000210:       e000e010        .word   0xe000e010

and of relevant portions of main:

08000214 <main>:

int main(void) {
...
    SET((GPIOA + _ODR), BIT(5));
 8000262:       4b0e            ldr     r3, [pc, #56]   ; (800029c <main+0x88>)
 8000264:       681b            ldr     r3, [r3, #0]
 8000266:       4a0d            ldr     r2, [pc, #52]   ; (800029c <main+0x88>)
 8000268:       f043 0320       orr.w   r3, r3, #32
 800026c:       6013            str     r3, [r2, #0]
    spin(100);
 800026e:       2064            movs    r0, #100        ; 0x64
 8000270:       f7ff ff9c       bl      80001ac <spin>
    //for (volatile int i = 1000 * 1000; i; --i) asm("nop");
    CLR((GPIOA + _ODR), BIT(5));
 8000274:       4b09            ldr     r3, [pc, #36]   ; (800029c <main+0x88>)
 8000276:       681b            ldr     r3, [r3, #0]
 8000278:       4a08            ldr     r2, [pc, #32]   ; (800029c <main+0x88>)
 800027a:       f023 0320       bic.w   r3, r3, #32
 800027e:       6013            str     r3, [r2, #0]
    spin(10000);
 8000280:       f242 7010       movw    r0, #10000      ; 0x2710
 8000284:       f7ff ff92       bl      80001ac <spin>
    SET((GPIOA + _ODR), BIT(5));
 8000288:       e7eb            b.n     8000262 <main+0x4e>
 800028a:       bf00            nop
...
 800029c:       40020014        .word   0x40020014
...

Solution

  • Do not blame the compiler! The spin function is wrong. You need to reset the COUNTFLAG before your while loop (writing to the VAL register also does this).

    /* Base address of the SCS block (System Control Space in CMSIS naming). */
    #define SCS_BASE            (0xE000E000UL)                            
    /* SysTick registers start 0x10 bytes into the SCS block. */
    #define SysTick_BASE        (SCS_BASE +  0x0010UL) 
    /* Typed pointer to the SysTick registers, used as SysTick->CTRL etc. */
    #define SysTick             ((SysTick_Type   *)     SysTick_BASE  ) 
    
    #define     __IM     volatile const      /*! Defines 'read only' structure member permissions */
    #define     __OM     volatile            /*! Defines 'write only' structure member permissions */
    #define     __IOM    volatile            /*! Defines 'read / write' structure member permissions */
    
    
    /*
     * Register layout of the SysTick timer: four consecutive 32-bit registers
     * at offsets 0x000-0x00C. Every member is volatile-qualified through the
     * __IOM / __IM macros; CALIB is additionally const (read-only).
     */
    typedef struct
    {
      __IOM uint32_t CTRL;                   /*!< Offset: 0x000 (R/W)  SysTick Control and Status Register */
      __IOM uint32_t LOAD;                   /*!< Offset: 0x004 (R/W)  SysTick Reload Value Register */
      __IOM uint32_t VAL;                    /*!< Offset: 0x008 (R/W)  SysTick Current Value Register */
      __IM  uint32_t CALIB;                  /*!< Offset: 0x00C (R/ )  SysTick Calibration Register */
    } SysTick_Type;
    
    
    /*
     * Busy-wait for approximately `ms` milliseconds using the SysTick timer.
     *
     * The timer is fully reprogrammed on every call and switched off again
     * before returning, so back-to-back calls do not inherit a countdown that
     * is still running from the previous one.
     *
     * NOTE: per the ARMv7-M reference manual the reload value is a 24-bit
     * field, so (CPU_HZ / 1000) * ms must not exceed 0x01000000 ticks; longer
     * delays have to be built from repeated calls.
     */
    void spin(uint32_t ms)
    {
        /* Reload value: the counter runs from LOAD down to 0, hence the -1. */
        SysTick->LOAD = (CPU_HZ / 1000) * ms - 1;

        /* Writing VAL restarts the count and clears a stale COUNTFLAG. */
        SysTick->VAL  = 0UL;

        /* Bit 0 enables the counter, bit 2 clocks it from the CPU clock. */
        SysTick->CTRL = (1 << 0) | (1 << 2);

        /* Bit 16 is COUNTFLAG; poll until the counter has reached zero. */
        while (!(SysTick->CTRL & (1 << 16))) {
            /* spin */
        }

        /* Stop the timer so it does not keep running after the delay. */
        SysTick->CTRL = 0;
    }
    
    

    Generated code:

    spin:
            mov     r3, #8000
            mov     r2, #-536813568
            mul     r0, r3, r0
            subs    r0, r0, #1
            movs    r1, #0
            movs    r3, #5
            str     r0, [r2, #20]
            str     r1, [r2, #24]
            str     r3, [r2, #16]
    .L2:
            ldr     r3, [r2, #16]
            lsls    r3, r3, #15
            bpl     .L2
            movs    r3, #0
            str     r3, [r2, #16]
            bx      lr
    

    https://godbolt.org/z/5rbndcv1e