Search code examples
cavr-gcc

C compose unsigned 32-bit integer from four 8-bit integers


On an 8-bit platform, I am composing an unsigned 32-bit integer from 4 8-bit integers like this:

/* Compose a 32-bit value from four bytes, buf[0] being the most significant.
   buf is presumably filled elsewhere; this excerpt does not show it. */
uint8_t buf[4];
uint32_t large = 0;
/* The casts widen each byte to 32 bits before shifting; without them the
   operand is only promoted to int, which is narrower than the shift count
   on this target (see the -Wshift-count-overflow warning). */
large |= ((uint32_t)buf[0]) << 24;
large |= ((uint32_t)buf[1]) << 16;
/* NOTE(review): no cast here, so buf[2] is promoted to (signed) int before
   the shift. Where int is 16 bits, a value >= 0x80 shifts into the sign bit
   and is then sign-extended when converted to uint32_t. */
large |= buf[2] << 8;
large |= buf[3] << 0;

Without the casts the compiler understandably complains:

bmp.c:100:23: warning: left shift count >= width of type [-Wshift-count-overflow]
  100 |     large |= (buf[1]) << 16;
      |                       ^~

Are these casts expensive (I would guess yes) and can this be done more efficiently?

Here is what I think is the relevant disassembly from avr-gcc (GCC) 13.2.0:

000060ee <.L29>:
        large |= ((uint32_t)buf[1]) << 16;
    60ee:       91 2c           mov     r9, r1
    60f0:       a1 2c           mov     r10, r1         
    60f2:       b1 2c           mov     r11, r1

000060f4 <.Loc.91>:
        large |= buf[3] << 0;   
    60f4:       a9 2a           or      r10, r25
    
000060f6 <.Loc.92>:
        large |= buf[2] << 8;
    60f6:       50 e0           ldi     r21, 0x00       ; 0
    
000060f8 <.Loc.93>:
    60f8:       54 2f           mov     r21, r20
    60fa:       44 27           eor     r20, r20
    60fc:       05 2e           mov     r0, r21 
    60fe:       00 0c           add     r0, r0
    6100:       66 0b           sbc     r22, r22
    6102:       77 0b           sbc     r23, r23

00006104 <.Loc.94>:
        large |= buf[3] << 0;
    6104:       84 2a           or      r8, r20 
    6106:       95 2a           or      r9, r21 
    6108:       a6 2a           or      r10, r22
    610a:       b7 2a           or      r11, r23
    610c:       b8 2a           or      r11, r24
    610e:       80 92 04 01     sts     0x0104, r8      ; 0x800104 <large>
    6112:       90 92 05 01     sts     0x0105, r9      ; 0x800105 <large+0x1>
    6116:       a0 92 06 01     sts     0x0106, r10     ; 0x800106 <large+0x2>
    611a:       b0 92 07 01     sts     0x0107, r11     ; 0x800107 <large+0x3>

Solution

  • The single expression you suggest yields 15 instead of 20 instructions if I am interpreting the disassembly correctly - nice!

    No - the problem is the undefined/implementation-defined behaviour in the code; when it is written correctly, it does not matter. I would also suggest using pointer notation for the parameter (C passes arrays as pointers) and declaring parameters as const if the function does not change them. This helps the compiler with optimizations (quite apart from const correctness).

    /*
     * Assemble a 32-bit value from four bytes, most significant byte first:
     * buf[0] supplies bits 31..24 and buf[3] supplies bits 7..0.
     * buf must point to at least four readable bytes.
     */
    uint32_t foo(const uint8_t *buf)
    {
        /* Widen every byte up front so each shift is carried out in 32 bits. */
        const uint32_t b0 = buf[0];
        const uint32_t b1 = buf[1];
        const uint32_t b2 = buf[2];
        const uint32_t b3 = buf[3];

        return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
    }
    
    
    /*
     * Same result as composing the bytes with explicit shifts and ORs:
     * buf[0] ends up as the most significant byte, buf[3] as the least.
     * buf must point to at least four readable bytes.
     */
    uint32_t bar(const uint8_t *buf)
    {
        /* Shift-and-accumulate: each step makes room for the next byte. */
        uint32_t value = buf[0];

        value = (value << 8) | buf[1];
        value = (value << 8) | buf[2];
        value = (value << 8) | buf[3];
        return value;
    }
    

    Both generate the same machine code:

    foo:
    .L__stack_usage = 0
            mov r30,r24
            mov r31,r25
            ld r22,Z
            ldd r23,Z+1
            ldd r24,Z+2
            ldd r25,Z+3
            rcall __bswapsi2
            ret
    bar:
    .L__stack_usage = 0
            mov r30,r24
            mov r31,r25
            ld r22,Z
            ldd r23,Z+1
            ldd r24,Z+2
            ldd r25,Z+3
            rcall __bswapsi2
            ret
    

    https://godbolt.org/z/b7o4114EP

    Also, the AVR compiler assumes little-endian, whereas you are composing the uint32_t number from a big-endian representation.

    If the endianness matches, then I would suggest using memcpy:

    memcpy(&large, buff, sizeof(large));
    

    An optimizing compiler will not emit an actual call to memcpy here; the copy is inlined:

    /*
     * Reinterpret the four bytes at buf as a uint32_t in the byte order of
     * the machine the code runs on (no byte swapping is performed).
     *
     * buf must point to at least sizeof(uint32_t) readable bytes. memcpy is
     * used instead of a pointer cast so the read does not depend on the
     * alignment of buf or violate aliasing rules.
     */
    uint32_t bar(uint8_t buf[4])
    {
        uint32_t large;
        memcpy(&large, buf, sizeof(large));
        return large;
    }
    
    bar:
    .L__stack_usage = 0
            mov r30,r24
            mov r31,r25
            ld r22,Z
            ldd r23,Z+1
            ldd r24,Z+2
            ldd r25,Z+3
            ret
    

    More interestingly, using a union makes the code much more efficient even when the buf data is big-endian (foo below keeps the byte order of buf, bar reverses it):

    /*
     * Overlay the four bytes of buf onto a uint32_t, keeping their order:
     * buf[i] becomes the byte at offset i of the result's storage.
     * The numeric value therefore depends on the byte order of the machine.
     * buf must point to at least four readable bytes.
     */
    uint32_t foo(const uint8_t *buf)
    {
        union
        {
            uint32_t large;
            uint8_t small[4];
        } view;

        /* Fill the byte view in order, then read it back as one integer. */
        for (int i = 0; i < 4; i++)
        {
            view.small[i] = buf[i];
        }
        return view.large;
    }
    
    /*
     * Overlay the four bytes of buf onto a uint32_t in reversed order:
     * buf[3] becomes the byte at offset 0 of the result's storage and
     * buf[0] the byte at offset 3. On a little-endian machine this makes
     * buf[0] the most significant byte of the result.
     * buf must point to at least four readable bytes.
     */
    uint32_t bar(const uint8_t *buf)
    {
        union
        {
            uint32_t large;
            uint8_t small[4];
        } view;

        /* Walk the destination forwards while reading the source backwards. */
        for (int i = 0; i < 4; i++)
        {
            view.small[i] = buf[3 - i];
        }
        return view.large;
    }
    

    and the resulting code:

    foo:
    .L__stack_usage = 0
            mov r30,r24
            mov r31,r25
            ldd r23,Z+1
            ld r22,Z
            ldd r24,Z+2
            ldd r25,Z+3
    ret
    bar:
    .L__stack_usage = 0
            mov r30,r24
            mov r31,r25
            ldd r23,Z+2
            ldd r22,Z+3
            ldd r24,Z+1
            ld r25,Z
    ret