Why is GCC emitting larger output for a bytewise copy vs memcpy?

The following C11 program extracts the bit representation of a float into a uint32_t in two different ways.

#include <stdint.h>
#include <string.h>

/* Both functions below copy exactly sizeof(uint32_t) bytes out of a float, so
 * the two types must have the same size.  C11 requires the message argument;
 * the single-argument form is only standard from C23 onwards. */
_Static_assert(sizeof(float) == sizeof(uint32_t),
               "float and uint32_t must have the same size");

/*
 * Return the object representation of `f` as a uint32_t.
 *
 * The four bytes are transferred one at a time through character pointers,
 * with the copy written out by hand instead of in a loop or via memcpy().
 */
uint32_t f2i_char(float f) {
  uint32_t bits;
  char *out = (char *)&bits;
  char const *in = (char const *)&f;

  *out++ = *in++; /* byte 0 */
  *out++ = *in++; /* byte 1 */
  *out++ = *in++; /* byte 2 */
  *out++ = *in++; /* byte 3 */

  return bits;
}

/*
 * Return the object representation of `f` as a uint32_t, obtained by copying
 * the bytes of `f` into the result with memcpy().
 */
uint32_t f2i_memcpy(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof bits);
  return bits;
}

The output assembly for the two functions, compiled with armgcc 10.2.1 (none eabi), is very different, even with the -Os or -O3 optimizations applied:

I'm compiling with: -mcpu=cortex-m4 -std=c11 -mfpu=fpv4-sp-d16 -mfloat-abi=hard

f2i_char:
  sub sp, sp, #16
  vstr.32 s0, [sp, #4]
  ldr r3, [sp, #4]
  strb r3, [sp, #12]
  ubfx r2, r3, #8, #8
  strb r2, [sp, #13]
  ubfx r2, r3, #16, #8
  ubfx r3, r3, #24, #8
  strb r2, [sp, #14]
  strb r3, [sp, #15]
  ldr r0, [sp, #12]
  add sp, sp, #16
  bx lr
f2i_memcpy:
  sub sp, sp, #8
  vstr.32 s0, [sp, #4]
  ldr r0, [sp, #4]
  add sp, sp, #8
  bx lr

Why isn't gcc generating the same assembly for both functions?

Godbolt example

Solution

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104344

GCC does not recognize the unrolled byte-by-byte copy as a bswap or store-merging pattern, so the four single-byte stores are not combined into one 32-bit access.

GCC does recognize the pattern when the same copy is written as a loop.