On an 8-bit platform, I am composing an unsigned 32-bit integer from four 8-bit integers like this:
uint8_t buf[4];
uint32_t large = 0;
large |= ((uint32_t)buf[0]) << 24;
large |= ((uint32_t)buf[1]) << 16;
large |= buf[2] << 8;
large |= buf[3] << 0;
Without the casts the compiler understandably complains:
bmp.c:100:23: warning: left shift count >= width of type [-Wshift-count-overflow]
100 | large |= (buf[1]) << 16;
| ^~
Are these casts expensive (I would guess yes) and can this be done more efficiently?
Here is what I think is the relevant disassembly from avr-gcc (GCC) 13.2.0:
000060ee <.L29>:
large |= ((uint32_t)buf[1]) << 16;
60ee: 91 2c mov r9, r1
60f0: a1 2c mov r10, r1
60f2: b1 2c mov r11, r1
000060f4 <.Loc.91>:
large |= buf[3] << 0;
60f4: a9 2a or r10, r25
000060f6 <.Loc.92>:
large |= buf[2] << 8;
60f6: 50 e0 ldi r21, 0x00 ; 0
000060f8 <.Loc.93>:
60f8: 54 2f mov r21, r20
60fa: 44 27 eor r20, r20
60fc: 05 2e mov r0, r21
60fe: 00 0c add r0, r0
6100: 66 0b sbc r22, r22
6102: 77 0b sbc r23, r23
00006104 <.Loc.94>:
large |= buf[3] << 0;
6104: 84 2a or r8, r20
6106: 95 2a or r9, r21
6108: a6 2a or r10, r22
610a: b7 2a or r11, r23
610c: b8 2a or r11, r24
610e: 80 92 04 01 sts 0x0104, r8 ; 0x800104 <large>
6112: 90 92 05 01 sts 0x0105, r9 ; 0x800105 <large+0x1>
6116: a0 92 06 01 sts 0x0106, r10 ; 0x800106 <large+0x2>
611a: b0 92 07 01 sts 0x0107, r11 ; 0x800107 <large+0x3>
The single expression you suggest yields 15 instead of 20 instructions if I am interpreting the disassembly correctly - nice!
No - the casts are not expensive. The warning points at undefined/implementation-defined behaviour in the code: on AVR, int is only 16 bits wide, so shifting a promoted uint8_t left by 16 overflows the type. When the code is written correctly, the casts cost nothing extra. I would also suggest using pointer notation for the parameter (C passes arrays as pointers anyway) and declaring it const if the function does not modify it. That helps the compiler with optimizations (quite apart from const correctness):
uint32_t foo(const uint8_t *buf)
{
    uint32_t large = 0;
    large |= ((uint32_t)buf[0]) << 24;
    large |= ((uint32_t)buf[1]) << 16;
    large |= (uint32_t)buf[2] << 8;
    large |= buf[3] << 0;
    return large;
}
uint32_t bar(const uint8_t *buf)
{
    return (uint32_t) buf[0] << 24 | (uint32_t) buf[1] << 16 | (uint32_t) buf[2] << 8 | buf[3];
}
Both generate the same machine code:
foo:
.L__stack_usage = 0
mov r30,r24
mov r31,r25
ld r22,Z
ldd r23,Z+1
ldd r24,Z+2
ldd r25,Z+3
rcall __bswapsi2
ret
bar:
.L__stack_usage = 0
mov r30,r24
mov r31,r25
ld r22,Z
ldd r23,Z+1
ldd r24,Z+2
ldd r25,Z+3
rcall __bswapsi2
ret
https://godbolt.org/z/b7o4114EP
Also, the AVR compiler assumes little endian, and you are "composing" the uint32_t value from a big-endian representation.
If the endianness matches, I would suggest using memcpy:
memcpy(&large, buf, sizeof(large));
Optimizing compilers will not actually emit a call to memcpy:
#include <string.h>

uint32_t bar(const uint8_t *buf)
{
    uint32_t large;
    memcpy(&large, buf, sizeof(large));
    return large;
}
bar:
.L__stack_usage = 0
mov r30,r24
mov r31,r25
ld r22,Z
ldd r23,Z+1
ldd r24,Z+2
ldd r25,Z+3
ret
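As an aside (this is mine, not part of the original answer), you can make the "endianness matches" precondition explicit. A minimal sketch, assuming GCC's predefined byte-order macros (__BYTE_ORDER__, __ORDER_LITTLE_ENDIAN__), which avr-gcc provides; the function name baz is just a placeholder:
#include <stdint.h>
#include <string.h>

/* Hypothetical helper: only composes the intended value when the buffer
   bytes are already stored in the target's native (little-endian) order. */
uint32_t baz(const uint8_t *buf)
{
    /* Compile-time guard documenting the assumption from the text above. */
    _Static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__,
                   "memcpy composition assumes matching (little-endian) byte order");

    uint32_t large;
    memcpy(&large, buf, sizeof(large));
    return large;
}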
More interestingly, using unions makes the code much more efficient if the buf data is big endian:
uint32_t foo(const uint8_t *buf)
{
    union
    {
        uint32_t large;
        uint8_t small[4];
    } d = {.small = {[0] = buf[0], [1] = buf[1], [2] = buf[2], [3] = buf[3]}};
    return d.large;
}
uint32_t bar(const uint8_t *buf)
{
    union
    {
        uint32_t large;
        uint8_t small[4];
    } d = {.small = {[0] = buf[3], [1] = buf[2], [2] = buf[1], [3] = buf[0]}};
    return d.large;
}
and the resulting code:
foo:
.L__stack_usage = 0
mov r30,r24
mov r31,r25
ldd r23,Z+1
ld r22,Z
ldd r24,Z+2
ldd r25,Z+3
ret
bar:
.L__stack_usage = 0
mov r30,r24
mov r31,r25
ldd r23,Z+2
ldd r22,Z+3
ldd r24,Z+1
ld r25,Z
ret
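For completeness (this test is mine, not part of the answer), here is how the two union versions above behave with the example bytes 0x12 0x34 0x56 0x78 on a little-endian target such as AVR; bar is the one that reproduces the original big-endian shift-and-or composition from the question:
#include <assert.h>
#include <stdint.h>

/* The union-based functions shown above. */
uint32_t foo(const uint8_t *buf); /* straight byte copy */
uint32_t bar(const uint8_t *buf); /* byte-reversing copy */

int main(void)
{
    const uint8_t buf[4] = {0x12, 0x34, 0x56, 0x78};

    /* bar reverses the bytes, so it matches large = buf[0]<<24 | buf[1]<<16 | ... */
    assert(bar(buf) == UINT32_C(0x12345678));

    /* foo copies the bytes straight through, giving the little-endian
       reinterpretation of the buffer. */
    assert(foo(buf) == UINT32_C(0x78563412));

    return 0;
}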