Tags: c, optimization, embedded, avr, avr-gcc

Saturating addition optimization in avr-gcc


I am programming for the ATtiny13 and I have to do a lot of saturating additions.

While trying to optimize them, I found that avr-gcc seemingly doesn't know how to optimize this at all. All of this was tried with avr-gcc 14.1.0 at -O3. Here's what I tried so far:

uint8_t saturating_add_1(uint8_t a, uint8_t b) {
    uint8_t temp = a + b;
    if(temp < a)
        return 0xFF;
    return temp;
}

This optimizes successfully on x86; avr-gcc, however, gives us this:

saturating_add_1:
.L__stack_usage = 0
        mov r25,r24
        add r24,r22
        cp r24,r25
        brlo .L1
        ret
.L1:
        ldi r24,lo8(-1)
        ret

Not great, not terrible: it does what we told it to. Let's try another version that is known to optimize correctly on other architectures:

uint8_t saturating_add_2(uint8_t a, uint8_t b) {
    if(b > 255 - a)
        return 255;
    else return a + b;
}

No, that is even worse: the integer promotion in 255 - a turns the comparison into 16-bit arithmetic:

saturating_add_2:
.L__stack_usage = 0
        ldi r18,lo8(-1)
        ldi r19,0
        sub r18,r24
        sbc r19,__zero_reg__
        cp r18,r22
        cpc r19,__zero_reg__
        brlt .L1
        add r24,r22
        ret
.L1:
        ldi r24,lo8(-1)
        ret
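
For what it's worth, the promotion can be kept out of the comparison by narrowing the subtraction back to 8 bits, since 255 - a always fits in a uint8_t. A sketch (I have not verified that avr-gcc actually emits a single 8-bit cp for this):

uint8_t saturating_add_2b(uint8_t a, uint8_t b) {
    /* 255 - a is always in [0, 255], so the cast loses nothing */
    if(b > (uint8_t)(255 - a))
        return 255;
    else return a + b;
}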

Fine, I guess we're trying compiler builtins.

uint8_t saturating_add_builtin(uint8_t a, uint8_t b) {
    if(__builtin_add_overflow(a, b, &a))
        return 255;
    else return a;
}

saturating_add_builtin:
.L__stack_usage = 0
        add r22,r24
        cp r22,r24
        brlo .L1
        mov r24,r22
        ret
.L1:
        ldi r24,lo8(-1)
        ret

It generates more or less the same assembly as our first try. I expected it not to compare at all, but to use the brcs or brcc instruction (branch if carry set/cleared). Maybe we can force it?

uint8_t saturating_add_reg(uint8_t a, uint8_t b) {
    uint8_t temp = a + b;
    if(SREG & 1)
        return 255;
    return temp;
}

saturating_add_reg:
.L__stack_usage = 0
        add r24,r22
        in __tmp_reg__,__SREG__
        sbrs __tmp_reg__,0
        ret
        ldi r24,lo8(-1)
        ret

This is somewhat better: from 7 instructions down to 6. But avr-gcc trips me up again: why does it use sbrs to skip the ret instead of using sbrc to skip the ldi? Am I missing something?

Anyway, I also tried to fix it with inline assembly; however, it is a bit unwieldy:

uint8_t saturating_add_asm_1(uint8_t a, uint8_t b) {
    asm (
        "add %[a], %[b]\n\t"
        "brcc no_overflow_%=\n\t"
        "ldi %[a], 255\n\t"
        "no_overflow_%=:"
        : [a] "+d" (a)
        : [b] "r" (b)
        : "cc"
    );
    return a;
}

This works fine, but the compiler cannot optimize the constant case (with subi), which, after all the time I have spent on this, hurts on an emotional level. My other try is:

uint8_t saturating_add_asm_2(uint8_t a, uint8_t b) {
    uint8_t temp = a + b;
    asm (
        "brcc no_overflow_%=\n\t"
        "ldi %[temp], 255\n\t"
        "no_overflow_%=:"
        : [temp] "+d" (temp)
        :
        :
    );
    return temp;
}

But this seems like it could break due to compiler code reordering, since nothing ties the asm block to the addition that sets the carry flag. And we cannot make the asm block volatile, because that disables even more optimizations.
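
The closest workaround I can think of is to dispatch on __builtin_constant_p so the constant case can use subi directly. This is only a sketch (the function name is mine): it needs optimization enabled so that the dead branch with its "n" constraint is discarded, and note that the flag sense of subi is reversed, because adding b is implemented as subtracting -b.

static inline __attribute__((always_inline))
uint8_t saturating_add_asm_3(uint8_t a, uint8_t b) {
    if (__builtin_constant_p(b) && b != 0) {
        asm (
            "subi %[a], %[nb]\n\t"      /* a += b, done as a -= -b        */
            "brcs no_overflow_%=\n\t"   /* a borrow here means no overflow */
            "ldi %[a], 255\n\t"
            "no_overflow_%=:"
            : [a] "+d" (a)
            : [nb] "n" ((uint8_t)-b)
            : "cc"
        );
    } else {
        asm (
            "add %[a], %[b]\n\t"
            "brcc no_overflow_%=\n\t"
            "ldi %[a], 255\n\t"
            "no_overflow_%=:"
            : [a] "+d" (a)
            : [b] "r" (b)
            : "cc"
        );
    }
    return a;
}

After inlining with a constant b, the first branch should collapse to a single subi/brcs/ldi sequence. The b != 0 guard matters: adding 0 never overflows, but the subi form would wrongly saturate in that case.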

Thus, my questions are these:

Has anyone been able to get avr-gcc to optimize this correctly without inline assembly?
Is there a correct way to optimize it with inline assembly so it optimizes for constants?


Solution

  • Has anyone been able to get avr-gcc to optimize this correctly without inline assembly?

    Yes. It seems you get the best (and easiest) code by means of the fixed-point support as of ISO/IEC TR 18037 "Embedded C":

    godbolt

    #include <stdint.h>
    #include <stdfix.h>
    
    static inline __attribute__((always_inline))
    uint8_t addu8_sat (uint8_t a, uint8_t b)
    {
        unsigned short sat fract ha = uhrbits (a);
        unsigned short sat fract hb = uhrbits (b);
        return bitsuhr (ha + hb);
    }
    
    static inline __attribute__((always_inline))
    int8_t addi8_sat (int8_t a, int8_t b)
    {
        short sat fract ha = hrbits (a);
        short sat fract hb = hrbits (b);
        return bitshr (ha + hb);
    }
    
    uint8_t test_u8 (uint8_t a, uint8_t b)
    {
        return addu8_sat (addu8_sat (a, b), 10);
    }
    
    int8_t test_i8 (int8_t a, int8_t b)
    {
        return addi8_sat (addi8_sat (a, b), 10);
    }
    

    Let's have a look at the assembly code as generated with

    $ avr-gcc -Os sat.c -std=gnu99 -S -dp

    test_u8:
        add r24,r22  ;  9   [c=4 l=3]  usadduqq3/0
        brcc 0f 
        sbc r24,r24
        0:  
        subi r24,-10     ;  18  [c=4 l=3]  usadduqq3/1
        brcs 0f 
        ldi r24,0xff
        0:  
        ret      ;  24  [c=0 l=1]  return
    
    test_i8:
        add r24,r22  ;  9   [c=4 l=5]  ssaddqq3/0
        brvc 0f 
        ldi r24,0x80
        sbrs r22,7
        dec r24
        0:  
        subi r24,-10     ;  18  [c=4 l=3]  ssaddqq3/1
        brvc 0f 
        ldi r24,0x7f
        0:  
        ret      ;  24  [c=0 l=1]  return
    

    So it uses 3 instructions per addition, except in the signed case where neither addend is known at compile time. The tricky part there is that the saturation value depends on the sign of the addend, which is not known.
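
    For reference, the unsigned 3-instruction sequence corresponds to the following inline-asm sketch (the function name is mine; the sbc trick materializes 0xFF without an ldi, so no upper register is needed):

    static inline __attribute__((always_inline))
    uint8_t addu8_sat_asm (uint8_t a, uint8_t b)
    {
        /* The sbc only executes when C is set, yielding a - a - 1 = 0xFF. */
        asm ("add %[a], %[b]" "\n\t"
             "brcc 0f"        "\n\t"
             "sbc %[a], %[a]" "\n"
             "0:"
             : [a] "+r" (a)
             : [b] "r" (b));
        return a;
    }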

    This uses the fact that Q-format additions are bit-identical to integer additions, so the implementation mostly deals with translating between the two worlds. Notice that this won't work for multiplication or division. And mixing signed with unsigned is more complicated, as it doesn't commute.
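
    As a small illustration of that bit-identity (the concrete values are mine): in the unsigned Q0.8 format the integer n and the fract n/256 share the same bit pattern, so a saturated fract addition is exactly a saturated integer addition:

    unsigned short sat fract x = uhrbits (200);  /* bits 200, value 200/256 */
    unsigned short sat fract y = uhrbits (100);  /* bits 100, value 100/256 */
    uint8_t r = bitsuhr (x + y);  /* 300/256 saturates to 255/256, so r is 0xFF,
                                     where a plain uint8_t sum would wrap to 44 */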

    Fixed-point support was added to avr-gcc v4.8 as a GNU C extension (hence -std=gnu99 above).


    FYI, here is the respective code for subtraction:

    static inline __attribute__((always_inline))
    uint8_t subu8_sat (uint8_t a, uint8_t b)
    {
        unsigned short sat fract ha = uhrbits (a);
        unsigned short sat fract hb = uhrbits (b);
        return bitsuhr (ha - hb);
    }
    
    static inline __attribute__((always_inline))
    int8_t subi8_sat (int8_t a, int8_t b)
    {
        short sat fract ha = hrbits (a);
        short sat fract hb = hrbits (b);
        return bitshr (ha - hb);
    }
    
    uint8_t test_sub_u8 (uint8_t a, uint8_t b)
    {
        return subu8_sat (subu8_sat (a, b), 10);
    }
    
    int8_t test_sub_i8 (int8_t a, int8_t b)
    {
        return subi8_sat (subi8_sat (a, b), 10);
    }
    
    test_sub_u8:
        sub r24,r22  ;  9   [c=4 l=3]  ussubuqq3/0
        brcc 0f 
        clr r24 
        0:  
        subi r24,10  ;  18  [c=4 l=3]  ussubuqq3/1
        brcc 0f 
        clr r24 
        0:  
        ret
    
    test_sub_i8:
        sub r24,r22  ;  9   [c=4 l=5]  sssubqq3/0
        brvc 0f 
        ldi r24,0x7f
        sbrs r22,7
        inc r24
        0:  
        subi r24,10  ;  18  [c=4 l=3]  sssubqq3/1
        brvc 0f 
        ldi r24,0x80
        0:  
        ret
    

    As a final note, the constant case is a bit tricky because adding 128 is not the same as subtracting -128, so extra care must be taken.

    Also, the unsigned constant case uses the reverse sense of the C flag: the addition of a positive constant is implemented as the subtraction of a negative one (subi r24,-10), and for subi a set carry means a borrow occurred, i.e. that the addition did not overflow.