Search code examples
c gcc libc memset

Why does gcc choose the most basic memset() implementation?


My bare-metal program manually calls memset() to zero entire, aligned 4k pages like this (I'm not actually using uint64_t, but another 8-byte type):

  uint64_t something[512] __attribute__((aligned(4096)));
  memset(something, 0x0, 4096);

I'm compiling similar to this...

%> /path/to/gcc-11.1.0/bin/aarch64-unknown-elf-gcc \
     -O3 \
     -std=gnu99 \
     -nostartfiles \
     --specs=nano.specs \
     -march=armv8.1-a \
     -Wl,--gc-sections \
     -Tlinker_script.lds \
     my_code.c \
     -o my_code.elf

When I disassemble and look at the memset() that is linked in, it's this basic, generic, one-byte-at-a-time implementation:

000000004010399c <memset>:
      s = (char*)aligned_addr;
    }

#endif /* not PREFER_SIZE_OVER_SPEED */

  while (n--)
    4010399c:   d2800003    mov x3, #0x0                    // #0
    401039a0:   eb03005f    cmp x2, x3
    401039a4:   54000041    b.ne    401039ac <memset+0x10>  // b.any
    *s++ = (char) c;

  return m;
}
    401039a8:   d65f03c0    ret
    *s++ = (char) c;
    401039ac:   38236801    strb    w1, [x0, x3]
    401039b0:   91000463    add x3, x3, #0x1
    401039b4:   17fffffb    b   401039a0 <memset+0x4>

I'm expecting an aarch64 optimized version that uses stp or vector instructions. My compiler has a /path/to/gcc-11.1.0/newlib-nano subdirectory.

I've removed --specs=nano.specs and fiddled around with a variety of options, but I'm not sure what I can do here...

HOW CAN I GET THE OPTIMIZED memset() IMPLEMENTATION?

Note that I used /path/to/gcc-11.1.0/bin/aarch64-unknown-elf-ar x to extract the libc_a.memset.o file from lots of different *.a files, but they were all empty: /path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libc.a, /path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libc_nano.a, /path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libg.a, /path/to/gcc-11.1.0/aarch64-unknown-elf/lib/libc.a, etc... Not the byte-by-byte implementation, just empty. I did this to look for a good implementation, but I clearly don't understand what is going on here...


Solution

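  • You can see the code at the link below. Your library was compiled with PREFER_SIZE_OVER_SPEED defined (or with __OPTIMIZE_SIZE__ defined, which GCC does under -Os). Either definition compiles out the word-at-a-time fast path and leaves only the final byte-by-byte loop, which is exactly what your disassembly shows. You need to recompile your library without those definitions.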

    https://github.com/eblot/newlib/blob/master/newlib/libc/string/memset.c

    /*
    FUNCTION
        <<memset>>---set an area of memory
    INDEX
        memset
    ANSI_SYNOPSIS
        #include <string.h>
        void *memset(void *<[dst]>, int <[c]>, size_t <[length]>);
    TRAD_SYNOPSIS
        #include <string.h>
        void *memset(<[dst]>, <[c]>, <[length]>)
        void *<[dst]>;
        int <[c]>;
        size_t <[length]>;
    DESCRIPTION
        This function converts the argument <[c]> into an unsigned
        char and fills the first <[length]> characters of the array
        pointed to by <[dst]> to the value.
    RETURNS
        <<memset>> returns the value of <[dst]>.
    PORTABILITY
    <<memset>> is ANSI C.
        <<memset>> requires no supporting OS subroutines.
    QUICKREF
        memset ansi pure
    */
    
    #include <string.h>
    
    /* Size in bytes of a long, the unit used for word-sized stores. */
    #define LBLOCKSIZE (sizeof(long))
    /* X converted to long and masked with LBLOCKSIZE - 1; a nonzero result
       means X is not a multiple of LBLOCKSIZE (assumes LBLOCKSIZE is a
       power of two).
       NOTE(review): X is not parenthesized in the expansion, so only pass
       a simple expression such as a variable name.  */
    #define UNALIGNED(X)   ((long)X & (LBLOCKSIZE - 1))
    /* True when LEN is smaller than one long.  */
    #define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
    
    /* memset: store the byte value of C into each of the first N bytes
       starting at M, and return M.

       _PTR, _DEFUN and _AND are macros defined outside this excerpt;
       presumably they expand to the return type and a parameter list that
       works for both ANSI and K&R compilers -- confirm against the
       library's headers.

       When neither PREFER_SIZE_OVER_SPEED nor __OPTIMIZE_SIZE__ is
       defined, a word-at-a-time fast path is compiled in.  Otherwise only
       the final byte-by-byte loop remains.  */
    _PTR
    _DEFUN (memset, (m, c, n),
        _PTR m _AND
        int c _AND
        size_t n)
    {
      /* Byte cursor into the destination.  */
      char *s = (char *) m;
    
    /* Fast path: compiled only when the library is not built for size.  */
    #if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
      int i;
      unsigned long buffer;
      unsigned long *aligned_addr;
      unsigned int d = c & 0xff;    /* To avoid sign extension, copy C to an
                       unsigned variable.  */
    
      /* Store single bytes until S is aligned to sizeof(long).  If N runs
         out before that happens, everything has been written already.  */
      while (UNALIGNED (s))
        {
          if (n--)
            *s++ = (char) c;
          else
            return m;
        }
    
      /* Only use word stores when at least one full long remains.  */
      if (!TOO_SMALL (n))
        {
          /* If we get this far, we know that n is large and s is word-aligned. */
          aligned_addr = (unsigned long *) s;
    
          /* Store D into each char sized location in BUFFER so that
             we can set large blocks quickly.  */
          /* Two copies of D fill 16 bits, then 32 bits; the loop keeps
             doubling the filled width until it reaches LBLOCKSIZE * 8 bits,
             so it does not iterate at all when that product is 32.  */
          buffer = (d << 8) | d;
          buffer |= (buffer << 16);
          for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
            buffer = (buffer << i) | buffer;
    
          /* Unroll the loop.  */
          /* Four word stores per iteration while at least four words remain.  */
          while (n >= LBLOCKSIZE*4)
            {
              *aligned_addr++ = buffer;
              *aligned_addr++ = buffer;
              *aligned_addr++ = buffer;
              *aligned_addr++ = buffer;
              n -= 4*LBLOCKSIZE;
            }
    
          /* Then one word at a time for what is left.  */
          while (n >= LBLOCKSIZE)
            {
              *aligned_addr++ = buffer;
              n -= LBLOCKSIZE;
            }
          /* Pick up the remainder with a bytewise loop.  */
          s = (char*)aligned_addr;
        }
    
    #endif /* not PREFER_SIZE_OVER_SPEED */
    
      /* Bytewise loop: handles the tail after the fast path, or the whole
         request when the fast path above is compiled out.  */
      while (n--)
        *s++ = (char) c;
    
      return m;
    }