I am studying AVX by writing AVX code with inline assembly. In this case, I tried to implement AVX in a simple function. The function I wrote is called lower_all_chars_base.
Its behavior is: apply a logical OR with 0x20 to every single char in a std::string.
Let c be every single char in the input, with 'A' <= c && c <= 'Z'. So the function will make the characters lower case.
I tried to make the AVX version of the function; the store instruction was unaligned, and there was no speedup at all.
Then I thought: if the memory access is aligned, then it must be faster. After that, I tried to make the AVX version with an aligned store, but gcc's base optimization (-O3) still beats my hand-vectorized code. What am I doing wrong here?
The functions:
- lower_all_chars_base: the simple baseline function.
- lower_all_chars_avx_aligned: the AVX2 aligned-move version.
- lower_all_chars_avx_unaligned: the AVX2 unaligned-move version.
Why does the gcc base optimization (-O3) beat my optimization?

root@esteh:/tmp# g++ --version
g++ (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
root@esteh:/tmp# g++ -Wall -Wextra -std=c++2a -O3 test.cpp -o test
root@esteh:/tmp# nice -n -20 ./test
lower_all_chars_base
Min = 0.00662300
Max = 0.00793100
Avg = 0.00717280
Total = 0.07172800
lower_all_chars_avx_aligned
Min = 0.00650200
Max = 0.00785100
Avg = 0.00726220
Total = 0.07262200
lower_all_chars_avx_unaligned
Min = 0.00623600
Max = 0.00835000
Avg = 0.00701360
Total = 0.07013600
Edit: use N - 1 as the size for the memset.
Godbolt link: https://godbolt.org/z/a16cGK
#include <ctime>
#include <cstdio>
#include <cassert>
#include <cstring>
#include <iostream>
using std::string;
/* The three implementations being benchmarked (defined below). */
void lower_all_chars_base(string &str);
void lower_all_chars_avx_aligned(string &str);
void lower_all_chars_avx_unaligned(string &str);
/* Benchmark harness: times `fx` over `x` and prints min/max/avg/total. */
void do_benchmark(std::string &x, void (*fx)(string &));
/* Evicts the buffer from the CPU caches between benchmark rounds. */
void mem_flush(const void *p, unsigned int allocation_size);
/* Test buffer size: 40 MiB. */
#define N (size_t)(1024u * 1024 * 40)
/* Print the function name, then run the benchmark on it. */
#define BENCHMARK(STR, FX) do { \
puts(#FX); \
do_benchmark(STR, FX); \
} while(0)
int main() {
static char x[N];
memset(x, 'A', N - 1);
string a(x), b(x), c(x);
BENCHMARK(a, lower_all_chars_base);
BENCHMARK(b, lower_all_chars_avx_aligned);
BENCHMARK(c, lower_all_chars_avx_unaligned);
assert(a == b);
assert(b == c);
memset(x, 'a', N - 1);
assert(memcmp(c.c_str(), x, N - 1) == 0);
}
/*
 * Time fx() over x for n rounds and print Min/Max/Avg/Total seconds.
 *
 * FIX: the cache is now flushed BEFORE each timed call.  The original
 * flushed after the call, so the first iteration was measured with a
 * warm cache (the string had just been constructed) while every later
 * iteration ran cold — the rounds were not comparable.  This matches
 * the behavior of the revised benchmark later in this file.
 * min/max are also zero-initialized to silence -Wextra
 * maybe-uninitialized warnings (they are overwritten at i == 0).
 */
void do_benchmark(std::string &x, void (*fx)(string &)) {
    const size_t n = 10;
    double min = 0, max = 0, avg, c, total = 0;
    for (size_t i = 0; i < n; i++) {
        /* Flush first so every round starts from a cold cache. */
        mem_flush(x.c_str(), x.size());
        clock_t time0 = clock();
        fx(x);
        clock_t time1 = clock();
        c = (double)(time1 - time0) / CLOCKS_PER_SEC;
        total += c;
        if (i == 0) {
            min = max = c;
        } else {
            if (c > max) max = c;
            if (c < min) min = c;
        }
    }
    avg = total / (double)n;
    printf("Min = %.8f\n", min);
    printf("Max = %.8f\n", max);
    printf("Avg = %.8f\n", avg);
    printf("Total = %.8f\n\n", total);
}
/* Scalar baseline: OR every byte with 0x20, so 'A'..'Z' become
   'a'..'z' (bytes that already have bit 5 set are unchanged). */
__attribute__((noinline))
void lower_all_chars_base(std::string &str) {
    char *p = (char *)str.c_str();
    const size_t n = str.size();
    for (size_t i = 0; i < n; i++) {
        p[i] |= 0x20;
    }
}
/* 32-byte OR mask (0x20 in every byte); 32-byte aligned so it can be
   loaded with vmovdqa. */
static const uint64_t mask[] __attribute__((aligned(32))) = {
    0x2020202020202020ull, 0x2020202020202020ull,
    0x2020202020202020ull, 0x2020202020202020ull
};

/*
 * OR every byte of `str` with 0x20 ('A'..'Z' -> 'a'..'z') using AVX2
 * with aligned 32-byte stores for the bulk of the buffer.
 * Strings of 4096 bytes or less take the scalar path only.
 */
__attribute__((noinline))
void lower_all_chars_avx_aligned(std::string &str) {
    char *cs = (char *)str.c_str();
    size_t len = str.size();
    /* Only use AVX for data bigger than 4K. */
    if (len > 4096) {
        /*
         * Scalar loop up to the next 32-byte boundary.
         *
         * BUGFIX: to reach the boundary the head loop must consume
         * (32 - addr % 32) % 32 bytes, not (addr % 32) bytes as the
         * original did.  The original only landed on a boundary when
         * the address was already 0 or 16 mod 32 (which glibc malloc
         * happens to return), and the aligned vmovdqa below would
         * fault for any other alignment.
         */
        uint8_t n = (uint8_t)(-(uintptr_t)cs & 0b11111u);
        for (uint8_t i = 0; i < n; i++) {
            *cs++ |= 0x20;
        }
        len -= n;
        /* Stop at least 288 bytes early: each iteration touches
           9 x 32 = 288 bytes, so the loop never runs past the end. */
        size_t vlen = len - 288;
        size_t j;
        /* Load the mask once, outside the loop. */
        asm volatile("vmovdqa %[mask], %%ymm0"::[mask]"m"(mask):"ymm0");
        for (j = 0; j < vlen; j += 288) {
            asm volatile(
                "vpor\t(%[cs],%[j]), %%ymm0, %%ymm1\n\t"
                "vpor\t32(%[cs],%[j]), %%ymm0, %%ymm2\n\t"
                "vpor\t64(%[cs],%[j]), %%ymm0, %%ymm3\n\t"
                "vpor\t96(%[cs],%[j]), %%ymm0, %%ymm4\n\t"
                "vpor\t128(%[cs],%[j]), %%ymm0, %%ymm5\n\t"
                "vpor\t160(%[cs],%[j]), %%ymm0, %%ymm6\n\t"
                "vpor\t192(%[cs],%[j]), %%ymm0, %%ymm7\n\t"
                "vpor\t224(%[cs],%[j]), %%ymm0, %%ymm8\n\t"
                "vpor\t256(%[cs],%[j]), %%ymm0, %%ymm9\n\t"
                "vmovdqa\t%%ymm1, (%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm2, 32(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm3, 64(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm4, 96(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm5, 128(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm6, 160(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm7, 192(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm8, 224(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm9, 256(%[cs],%[j])"
                :
                /* FIX: "r" (plain register) instead of "p"; cs and j
                   are base/index registers in the addressing modes,
                   not standalone address operands. */
                : [cs]"r"(cs), [j]"r"(j)
                : "memory", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
                  "ymm6", "ymm7", "ymm8", "ymm9"
            );
        }
        /* Clear upper YMM state before returning to non-AVX code. */
        asm volatile("vzeroupper":::
            "ymm0", "ymm1", "ymm2", "ymm3",
            "ymm4", "ymm5", "ymm6", "ymm7",
            "ymm8", "ymm9", "ymm10", "ymm11",
            "ymm12","ymm13","ymm14","ymm15"
        );
        cs += j;
        len -= j;
    }
    /* Scalar tail for whatever the vector loop did not cover. */
    for (size_t i = 0; i < len; i++) {
        *cs++ |= 0x20;
    }
}
/*
 * OR every byte of `str` with 0x20 ('A'..'Z' -> 'a'..'z') using AVX2
 * with UNALIGNED stores (vmovdqu) for the bulk of the buffer.
 * Strings of 4096 bytes or less take the scalar tail loop only.
 */
__attribute__((noinline))
void lower_all_chars_avx_unaligned(string &str) {
char *cs = (char *)str.c_str();
size_t len = str.size();
/* Only use AVX for data bigger than 4K. */
if (len > 4096) {
size_t j;
/* Stop at least 288 bytes before the end: each iteration below
   touches 9 x 32 = 288 bytes, so this bound keeps the vector loop
   from reading/writing past the buffer. */
size_t vlen = len - 288;
/* Load the 0x20 byte mask once into ymm0 (mask is 32-byte aligned,
   so vmovdqa is safe here). */
asm volatile("vmovdqa %[mask], %%ymm0"::[mask]"m"(mask):"ymm0");
for (j = 0; j < vlen; j += 288) {
asm volatile(
"vpor\t(%[cs],%[j]), %%ymm0, %%ymm1\n\t"
"vpor\t32(%[cs],%[j]), %%ymm0, %%ymm2\n\t"
"vpor\t64(%[cs],%[j]), %%ymm0, %%ymm3\n\t"
"vpor\t96(%[cs],%[j]), %%ymm0, %%ymm4\n\t"
"vpor\t128(%[cs],%[j]), %%ymm0, %%ymm5\n\t"
"vpor\t160(%[cs],%[j]), %%ymm0, %%ymm6\n\t"
"vpor\t192(%[cs],%[j]), %%ymm0, %%ymm7\n\t"
"vpor\t224(%[cs],%[j]), %%ymm0, %%ymm8\n\t"
"vpor\t256(%[cs],%[j]), %%ymm0, %%ymm9\n\t"
"vmovdqu\t%%ymm1, (%[cs],%[j])\n\t"
"vmovdqu\t%%ymm2, 32(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm3, 64(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm4, 96(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm5, 128(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm6, 160(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm7, 192(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm8, 224(%[cs],%[j])\n\t"
"vmovdqu\t%%ymm9, 256(%[cs],%[j])"
:
/* NOTE(review): "p" is a memory-address constraint; a plain "r"
   register constraint is the conventional choice for a pointer
   used as a base register in an addressing mode — confirm. */
: [cs]"p"(cs), [j]"r"(j)
: "memory", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
"ymm6", "ymm7", "ymm8", "ymm9"
);
}
/* Clear the upper YMM state before leaving the AVX region. */
asm volatile("vzeroupper":::
"ymm0", "ymm1", "ymm2", "ymm3",
"ymm4", "ymm5", "ymm6", "ymm7",
"ymm8", "ymm9", "ymm10", "ymm11",
"ymm12","ymm13","ymm14","ymm15"
);
cs += j;
len -= j;
}
/* Scalar loop for the remaining tail bytes (and for len <= 4096). */
for (size_t i = 0; i < len; i++) {
*cs++ |= 0x20;
}
}
/* Evict every cache line the buffer touches with clflush, then order
   the flushes with a store fence (x86 only).
   https://stackoverflow.com/a/43694725/7275114 */
void mem_flush(const void *p, unsigned int allocation_size) {
    const size_t line = 64;
    if (p == NULL || allocation_size <= 0)
        return;
    const char *base = (const char *)p;
    for (size_t off = 0; off < allocation_size; off += line) {
        asm volatile("clflush (%0)"::"r"(base + off):"memory");
    }
    asm volatile("sfence"::: "memory");
}
I tried to apply some suggestions from the comments.
I still do not use intrinsics. Now the hand-coded assembly version is approximately 1.02 times faster than the gcc optimization with the -O3 -mavx2 flags. That is not a significant speedup, but I learned a lot about inline assembly. I am still waiting for other answers; I hope there is a better answer than this.
Summary of changes to the lower_all_chars_avx_aligned function:
- Use vbroadcastsd to load the mask, like clang does.
- Removed volatile and the "memory" clobber from the inline assembly.
- Use the ymm0 to ymm15 registers.
- Added __AVX__ and __AVX2__ constant checking to prevent vzeroupper from being emitted twice.
- Added -mavx2 to improve the gcc base optimization.
- lower_all_chars_avx_unaligned is removed.
- Use malloc instead of a static char[] variable to handle bigger data.

Benchmark notes:
- Min is the minimum time from 30 function calls.
- Max is the maximum time from 30 function calls.
- Avg is the average time from 30 function calls.
- Total is the total time from 30 function calls.

root@yukii-hpc2:/tmp# g++ -Wall -Wextra -std=c++2a -O3 -mavx2 test.cpp -o test
root@yukii-hpc2:/tmp# nice -n -20 ./test
lower_all_chars_avx_aligned
Min = 0.31086600
Max = 0.31319800
Avg = 0.31159833
Total = 9.34795000
lower_all_chars_base
Min = 0.31823400
Max = 0.32902100
Avg = 0.31904893
Total = 9.57146800
root@yukii-hpc2:/tmp# g++ --version
g++ (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
root@yukii-hpc2:/tmp#
/*
https://stackoverflow.com/questions/65404362/avx2-code-cannot-be-faster-than-gcc-base-optmization
*/
#include <ctime>
#include <cstdio>
#include <cassert>
#include <cstring>
#include <cstdlib>
#include <iostream>
using std::string;
/* Implementations under test (defined below). */
void lower_all_chars_base(string &str);
void lower_all_chars_avx_aligned(string &str);
/* Benchmark harness: times `fx` over `x` and prints min/max/avg/total. */
void do_benchmark(string &x, void (*fx)(string &));
/* Evicts the buffer from the CPU caches between benchmark rounds. */
void mem_flush(const void *p, unsigned int allocation_size);
/* Size helpers: _M(N) = N MiB, _G(N) = N GiB. */
#define _M(N) (size_t)(1024ull * 1024 * (N))
#define _G(N) (size_t)(1024ull * 1024 * 1024 * (N))
#define N (_G(3) + _M(512) + 1234) /* 3.5 G + 1234 */
/*
1234 is just to make it odd,
so it likely will jump to the tail of AVX aligned loop.
*/
/* Print the function name, then run the benchmark on it. */
#define BENCHMARK(STR, FX) do { \
puts(#FX); \
do_benchmark(STR, FX); \
} while(0)
int main() {
    /* N is 3.5 GiB + 1234 bytes; an allocation this large can
       realistically fail, so check the malloc result before
       writing through it (the original dereferenced it unchecked). */
    char *x = (char *)malloc(N + 1);
    if (x == NULL) {
        fputs("malloc failed\n", stderr);
        return 1;
    }
    memset(x, 'A', N);
    x[N] = '\0';
    {
        string a(x);
        /* Build the expected all-lowercase image in x while `a`
           holds the uppercase copy. */
        memset(x, 'a', N);
        BENCHMARK(a, lower_all_chars_avx_aligned);
        assert(memcmp(a.c_str(), x, N) == 0);
    }
    /* Restore value for the next benchmark. */
    memset(x, 'A', N);
    {
        string a(x);
        memset(x, 'a', N);
        BENCHMARK(a, lower_all_chars_base);
        assert(memcmp(a.c_str(), x, N) == 0);
    }
    free(x);
    return 0;
}
/* Scalar fallback for short buffers: OR each of `len` bytes with
   0x20 ('A'..'Z' -> 'a'..'z'). */
inline static void lower_all_chars_b1024(char *cs, uint16_t len) {
    for (uint16_t i = 0; i < len; i++) {
        cs[i] |= 0x20;
    }
}
/* 8-byte OR mask (0x20 replicated into every byte); broadcast into
   ymm0 with vbroadcastsd inside the asm below.  Kept 32-byte aligned
   for performance. */
static const uint64_t mask[] __attribute__((aligned(32))) = {
0x2020202020202020ull
};
/*
 * OR every byte of `str` with 0x20 ('A'..'Z' -> 'a'..'z').
 *
 * For len >= 1024: one unaligned 32-byte vmovdqu head step, an
 * aligned 480-byte-per-iteration AVX2 main loop (15 ymm registers x
 * 32 bytes), then a 32-byte vmovdqu tail loop that may step back and
 * re-process a few bytes — safe because ORing 0x20 is idempotent.
 * For len < 1024: plain scalar loop via lower_all_chars_b1024.
 */
__attribute__((noinline))
void lower_all_chars_avx_aligned(string &str) {
char *cs = (char *)str.c_str();
size_t len = str.size();
/* Only use AVX for data bigger than or equal to 1K. */
if (len >= 1024) {
size_t avx_size = 0x1e0; /* Bytes per AVX main iteration. */
char *end = &(cs[len]);
/* End of aligned process iteration. */
char *end_avx = &(end[-avx_size]);
/* Dummy variable, to let the compiler choose the best GP register. */
uintptr_t rem_bytes;
asm(
/* Prepare %[rem_bytes] initial value. */
"movq\t%[end], %[rem_bytes]\n\t"
/* Load the mask. */
"vbroadcastsd\t%[mask], %%ymm0\n\t"
/* Handle unaligned memory from the head. */
"vpor\t(%[cs]), %%ymm0, %%ymm1\n\t"
"vmovdqu\t%%ymm1, (%[cs])\n\t"
"addq\t$0x20, %[cs]\n\t" /* Move to the next 32 bytes. */
/* Handle aligned memory part.
Use `vmovdqa` to make sure that the memory is
aligned properly.
Note that ORing is idempotent: you can OR the same
byte multiple times without changing it further. So
%[cs] can partially overlap with `vmovdqu` operation
before this point.
https://stackoverflow.com/questions/65404362/avx2-code-cannot-be-faster-than-gcc-base-optmization#comment115632279_65404362
*/
"andq\t$~0b11111ull, %[cs]\n\t" /* Clear 5-bit LSB. */
"1:\n\t"
"vpor\t0x000(%[cs]), %%ymm0, %%ymm1\n\t"
"vpor\t0x020(%[cs]), %%ymm0, %%ymm2\n\t"
"vpor\t0x040(%[cs]), %%ymm0, %%ymm3\n\t"
"vpor\t0x060(%[cs]), %%ymm0, %%ymm4\n\t"
"vpor\t0x080(%[cs]), %%ymm0, %%ymm5\n\t"
"vpor\t0x0a0(%[cs]), %%ymm0, %%ymm6\n\t"
"vpor\t0x0c0(%[cs]), %%ymm0, %%ymm7\n\t"
"vpor\t0x0e0(%[cs]), %%ymm0, %%ymm8\n\t"
"vpor\t0x100(%[cs]), %%ymm0, %%ymm9\n\t"
"vpor\t0x120(%[cs]), %%ymm0, %%ymm10\n\t"
"vpor\t0x140(%[cs]), %%ymm0, %%ymm11\n\t"
"vpor\t0x160(%[cs]), %%ymm0, %%ymm12\n\t"
"vpor\t0x180(%[cs]), %%ymm0, %%ymm13\n\t"
"vpor\t0x1a0(%[cs]), %%ymm0, %%ymm14\n\t"
"vpor\t0x1c0(%[cs]), %%ymm0, %%ymm15\n\t"
/* Plug the result to aligned memory. */
"vmovdqa\t%%ymm1, 0x000(%[cs])\n\t"
"vmovdqa\t%%ymm2, 0x020(%[cs])\n\t"
"vmovdqa\t%%ymm3, 0x040(%[cs])\n\t"
"vmovdqa\t%%ymm4, 0x060(%[cs])\n\t"
"vmovdqa\t%%ymm5, 0x080(%[cs])\n\t"
"vmovdqa\t%%ymm6, 0x0a0(%[cs])\n\t"
"vmovdqa\t%%ymm7, 0x0c0(%[cs])\n\t"
"vmovdqa\t%%ymm8, 0x0e0(%[cs])\n\t"
"vmovdqa\t%%ymm9, 0x100(%[cs])\n\t"
"vmovdqa\t%%ymm10, 0x120(%[cs])\n\t"
"vmovdqa\t%%ymm11, 0x140(%[cs])\n\t"
"vmovdqa\t%%ymm12, 0x160(%[cs])\n\t"
"vmovdqa\t%%ymm13, 0x180(%[cs])\n\t"
"vmovdqa\t%%ymm14, 0x1a0(%[cs])\n\t"
"vmovdqa\t%%ymm15, 0x1c0(%[cs])\n\t"
"addq\t%[avx_size], %[cs]\n\t"
"cmpq\t%[end_avx], %[cs]\n\t"
"jb\t1b\n\t"
"subq\t%[cs], %[rem_bytes]\n\t"
/* Now, %[rem_bytes] contains the remaining bytes. */
"testq\t%[rem_bytes], %[rem_bytes]\n\t"
"jz\t3f\n\t"
/* There's no remaining bytes if `jz` is taken. */
/* Handle the tail, may be back off several bytes
to make the remaining bytes to be multiple of 32.
*/
"leaq\t0b11111(%[rem_bytes]), %[dec_avx]\n\t"
"andq\t$~0b11111ull, %[dec_avx]\n\t"
"subq\t%[rem_bytes], %[dec_avx]\n\t"
"subq\t%[dec_avx], %[cs]\n\t"
"2:\n\t"
"vpor\t(%[cs]), %%ymm0, %%ymm1\n\t"
"vmovdqu\t%%ymm1, (%[cs])\n\t"
"addq\t$0x20, %[cs]\n\t"
"cmpq\t%[end], %[cs]\n\t"
"jb\t2b\n\t"
"3:\n\t"
/* NOTE(review): this relies on the compiler emitting vzeroupper
   itself when __AVX__/__AVX2__ are defined; gcc cannot see YMM
   usage inside inline asm, so verify that assumption holds. */
#if !defined(__AVX__) && !defined(__AVX2__)
"vzeroupper"
#endif
/* Output */
: [cs]"+r"(cs),
[end]"+r"(end),
[end_avx]"+r"(end_avx),
/* NOTE(review): [end_avx] and [dec_avx] are two asm operands bound
   to the same C variable; this relies on GCC allocating them
   distinct registers — confirm this is well-defined here. */
[dec_avx]"=r"(end_avx), /* May reuse end_avx if needed. */
[rem_bytes]"=r"(rem_bytes),
/* Tell the compiler that this inline assembly is
going to read/write `len` bytes from `cs`. */
[dummy_mem_output]"+m"(*(char (*)[len])cs)
/* Input */
: [mask]"m"(mask),
/* NOTE(review): "n" requires an immediate operand; this only
   compiles when the optimizer folds `avx_size` to a constant
   (i.e. it may fail at -O0) — confirm. */
[avx_size]"n"(avx_size)
/* Clobbers */
: "ymm0", "ymm1", "ymm2", "ymm3",
"ymm4", "ymm5", "ymm6", "ymm7",
"ymm8", "ymm9", "ymm10", "ymm11",
"ymm12", "ymm13", "ymm14", "ymm15"
);
} else {
/* Let the compiler use its own optimization here. */
lower_all_chars_b1024(cs, len);
}
}
/* Scalar baseline: set bit 5 of every byte, so 'A'..'Z' turn into
   'a'..'z' (bytes with bit 5 already set are unchanged). */
__attribute__((noinline))
void lower_all_chars_base(std::string &str) {
    char *p = (char *)str.c_str();
    char *const end = p + str.size();
    while (p != end) {
        *p |= 0x20;
        p++;
    }
}
/* Run fx() over x for 30 rounds, flushing the cache before each timed
   call, and print the Min/Max/Avg/Total seconds. */
void do_benchmark(string &x, void (*fx)(string &)) {
    const size_t rounds = 30;
    double lo = 0, hi = 0, elapsed, sum = 0;
    for (size_t i = 0; i < rounds; i++) {
        /* Cold-cache the buffer before starting the clock. */
        mem_flush(x.c_str(), x.size());
        const clock_t begin = clock();
        fx(x);
        const clock_t finish = clock();
        elapsed = (double)(finish - begin) / CLOCKS_PER_SEC;
        sum += elapsed;
        if (i == 0) {
            lo = hi = elapsed;
        } else {
            if (elapsed > hi) hi = elapsed;
            if (elapsed < lo) lo = elapsed;
        }
    }
    const double mean = sum / (double)rounds;
    printf("Min = %.8f\n", lo);
    printf("Max = %.8f\n", hi);
    printf("Avg = %.8f\n", mean);
    printf("Total = %.8f\n\n", sum);
}
/* Evict every cache line of the buffer with clflush, then order the
   flushes with a store fence (x86 only).
   https://stackoverflow.com/a/43694725/7275114 */
void mem_flush(const void *p, unsigned int allocation_size) {
    const size_t line = 64;
    if (p == NULL || allocation_size <= 0)
        return;
    const char *base = (const char *)p;
    for (size_t off = 0; off < allocation_size; off += line) {
        asm volatile("clflush (%0)"::"r"(base + off):"memory");
    }
    asm volatile("sfence"::: "memory");
}