My bare-metal program manually calls memset()
to zero entire/aligned 4k pages like this (I'm not using uint64_t, but another 8-byte thing):
uint64_t something[512] __attribute__((aligned(4096)));
memset(something, 0x0, 4096);
I'm compiling similar to this...
%> /path/to/gcc-11.1.0/bin/aarch64-unknown-elf-gcc \
-O3 \
-std=gnu99 \
-nostartfiles \
--specs=nano.specs \
-march=armv8.1-a \
-Wl,--gc-sections \
-Tlinker_script.lds \
my_code.c \
-o my_code.elf
When I disassemble and look at the memset()
that is linked in, it's this basic/generic/one-byte-at-a-time implementation:
000000004010399c <memset>:
s = (char*)aligned_addr;
}
#endif /* not PREFER_SIZE_OVER_SPEED */
while (n--)
4010399c: d2800003 mov x3, #0x0 // #0
401039a0: eb03005f cmp x2, x3
401039a4: 54000041 b.ne 401039ac <memset+0x10> // b.any
*s++ = (char) c;
return m;
}
401039a8: d65f03c0 ret
*s++ = (char) c;
401039ac: 38236801 strb w1, [x0, x3]
401039b0: 91000463 add x3, x3, #0x1
401039b4: 17fffffb b 401039a0 <memset+0x4>
I'm expecting an aarch64 optimized version that uses stp
or vector instructions. My compiler has a /path/to/gcc-11.1.0/newlib-nano
subdirectory.
I've removed --specs=nano.specs
and fiddled around with a variety of options, but I'm not sure what I can do here...
HOW CAN I GET THE OPTIMIZED memset()
IMPLEMENTATION?
Note that I used
/path/to/gcc-11.1.0/bin/aarch64-unknown-elf-ar x
to extract the libc_a.memset.o
file from lots of different *.a
files, but they were all empty:
/path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libc.a,
/path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libc_nano.a,
/path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libg.a,
/path/to/gcc-11.1.0/aarch64-unknown-elf/lib/libc.a,
etc... Not the byte-by-byte implementation, just empty. I did this to look for a good implementation, but I clearly don't understand what is going on here...
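You can see the code here. Your library was compiled with PREFER_SIZE_OVER_SPEED (or with __OPTIMIZE_SIZE__, which GCC defines under -Os) defined.
That compiles out the word-at-a-time path and leaves only the final byte loop, which is what your disassembly shows. You need to recompile your library without that definition.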
https://github.com/eblot/newlib/blob/master/newlib/libc/string/memset.c
/*
FUNCTION
<<memset>>---set an area of memory
INDEX
memset
ANSI_SYNOPSIS
#include <string.h>
void *memset(void *<[dst]>, int <[c]>, size_t <[length]>);
TRAD_SYNOPSIS
#include <string.h>
void *memset(<[dst]>, <[c]>, <[length]>)
void *<[dst]>;
int <[c]>;
size_t <[length]>;
DESCRIPTION
This function converts the argument <[c]> into an unsigned
char and fills the first <[length]> characters of the array
pointed to by <[dst]> with that value.
RETURNS
<<memset>> returns the value of <[dst]>.
PORTABILITY
<<memset>> is ANSI C.
<<memset>> requires no supporting OS subroutines.
QUICKREF
memset ansi pure
*/
#include <string.h>
/* Size in bytes of the word (`long') used for the bulk fill.  */
#define LBLOCKSIZE (sizeof(long))
/* Nonzero if address X is not a multiple of LBLOCKSIZE.  X is
   parenthesized so that an expression argument such as `p + 1' is
   converted as a whole instead of having the cast bind to `p' only.  */
#define UNALIGNED(X) ((long)(X) & (LBLOCKSIZE - 1))
/* Nonzero if LEN is shorter than a single word.  */
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
_PTR
_DEFUN (memset, (m, c, n),
        _PTR m _AND
        int c _AND
        size_t n)
{
  /* Byte cursor over the destination; M itself is kept untouched so it
     can be handed back to the caller.  */
  char *dst = (char *) m;

#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
  /* Masked copy of C held in an unsigned, so that no sign extension
     creeps in while the word pattern is assembled.  */
  unsigned int fill = c & 0xff;
  unsigned long pattern;
  unsigned long *wp;
  int shift;

  /* Store single bytes until DST reaches a word boundary; leave at once
     if the request runs out before that happens.  */
  for (; UNALIGNED (dst); n--)
    {
      if (n == 0)
        return m;
      *dst++ = (char) c;
    }

  if (n >= LBLOCKSIZE)
    {
      /* DST is word-aligned and at least one whole word remains.  */
      wp = (unsigned long *) dst;

      /* Replicate the fill byte into every byte lane of PATTERN:
         2 bytes, then 4, then keep doubling up to the width of a long.  */
      pattern = fill | (fill << 8);
      pattern |= pattern << 16;
      shift = 32;
      while (shift < LBLOCKSIZE * 8)
        {
          pattern |= pattern << shift;
          shift <<= 1;
        }

      /* Bulk phase: four words per iteration.  */
      while (n >= LBLOCKSIZE * 4)
        {
          wp[0] = pattern;
          wp[1] = pattern;
          wp[2] = pattern;
          wp[3] = pattern;
          wp += 4;
          n -= 4 * LBLOCKSIZE;
        }

      /* Then one word at a time.  */
      for (; n >= LBLOCKSIZE; n -= LBLOCKSIZE)
        *wp++ = pattern;

      /* Hand the position back to the byte cursor for the tail.  */
      dst = (char *) wp;
    }
#endif /* not PREFER_SIZE_OVER_SPEED */

  /* Tail bytes -- or the entire fill when the fast path above is
     compiled out.  */
  for (; n != 0; n--)
    *dst++ = (char) c;

  return m;
}