uint8_t data[] = "mykeyxyz:1234\nky:123\n...";
.
My lines of string has format key:value
, where each line has len(key) <= 16
guaranteed. I want to load mykeyxyz
into a __m128i
, but fill out the higher position with 0.
The easiest way is to have an array of 255 or 0 masks, but that requires another memory load. Is there anyway to do this faster?
The accepted answer gives ~2% faster total program time. To compare, test 1brc_valid13.cpp
against 1brc_valid14.cpp
(which uses the accepted answer). Hardware: AMD 2950X, Ubuntu 18.04, g++ 11.4, compile command: g++ -o main 1brc_final_valid.cpp -O3 -std=c++17 -march=native -m64 -lpthread
Edit: preferably without AVX512
Edit 2: I need the variable len
so I can start parsing the value part.
Edit 3: the function will be used in a loop (for example to parse 1 million lines of text). But strcmp_mask
will basically always be inside L1 cache
Edit 4: I benchmark the functions by parsing 1 billion lines of (key,value)
and process them. You can download the code/data and replicate the results in my repo: https://github.com/lehuyduc/1brc-simd . Also the discussion post will contain more info
Edit 5: I tested maskafterc256
and found that it caused my code to be 50x slower!!! If I replace _mm256_set_epi8
with _mm256_setr_epi8
, then it becomes 500+x slower (took so long that I just Ctrl-C). I'm not sure what _mm256_set_epi8
does, because it's translated into a sequence of instructions instead of a single one.
perf stat -d ./main
result for maskafterc
14,470.46 msec task-clock # 20.785 CPUs utilized
3,032 context-switches # 0.210 K/sec
5 cpu-migrations # 0.000 K/sec
341,894 page-faults # 0.024 M/sec
55,073,525,723 cycles # 3.806 GHz (37.19%)
1,714,679,575 stalled-cycles-frontend # 3.11% frontend cycles idle (36.71%)
11,442,758,125 stalled-cycles-backend # 20.78% backend cycles idle (36.92%)
80,739,874,133 instructions # 1.47 insn per cycle
# 0.14 stalled cycles per insn (37.39%)
8,661,529,181 branches # 598.566 M/sec (38.22%)
39,299,214 branch-misses # 0.45% of all branches (38.13%)
17,784,400,757 L1-dcache-loads # 1229.015 M/sec (37.93%)
827,509,870 L1-dcache-load-misses # 4.65% of all L1-dcache hits (37.52%)
<not supported> LLC-loads
<not supported> LLC-load-misses
0.696207306 seconds time elapsed
12.918590000 seconds user
1.546535000 seconds sys
perf stat -d ./main
result for maskafterc256
Performance counter stats for './main':
1,047,414.73 msec task-clock # 29.982 CPUs utilized
125,296 context-switches # 0.120 K/sec
211 cpu-migrations # 0.000 K/sec
341,889 page-faults # 0.326 K/sec
4,229,832,527,830 cycles # 4.038 GHz (37.50%)
10,965,796,240 stalled-cycles-frontend # 0.26% frontend cycles idle (37.50%)
167,711,051,408 stalled-cycles-backend # 3.96% backend cycles idle (37.49%)
296,573,918,148 instructions # 0.07 insn per cycle
# 0.57 stalled cycles per insn (37.50%)
44,843,867,352 branches # 42.814 M/sec (37.50%)
56,509,334 branch-misses # 0.13% of all branches (37.51%)
91,621,829,978 L1-dcache-loads # 87.474 M/sec (37.50%)
18,776,996,709 L1-dcache-load-misses # 20.49% of all L1-dcache hits (37.51%)
<not supported> LLC-loads
<not supported> LLC-load-misses
34.935225940 seconds time elapsed
1039.609651000 seconds user
6.774492000 seconds sys
#include <iostream>
#include <immintrin.h>
#include <string>
#include <cstring>
using namespace std;
alignas(4096) const uint8_t strcmp_mask[32] = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
int main()
{
uint8_t data[] = "mykeyxyz:1234\naaaaaaaaaaa";
__m128i chars = _mm_loadu_si128((__m128i*)data);
__m128i separators = _mm_set1_epi8(':');
__m128i compared = _mm_cmpeq_epi8(chars, separators);
uint32_t separator_mask = _mm_movemask_epi8(compared);
uint32_t len = __builtin_ctz(separator_mask);
cout << "len = " << len << "\n";
__m128i mask = _mm_loadu_si128((__m128i*)(strcmp_mask + 16 - len));
__m128i key_chars = _mm_and_si128(chars, mask);
uint8_t res[16];
memcpy(res, (char*)&key_chars, 16);
for (int i = 0; i < 16; i++) cout << int(res[i]) << " ";
cout << "\n";
}
// len = 8
// 109 121 107 101 121 120 121 122 0 0 0 0 0 0 0 0
I often find it interesting to see how others approach a problem, so here's my version. It only requires SSE2, but benefits from SSSE3, and BMI1 for the trailing zeros calculation.
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <immintrin.h>
// gcc maskafterc.c -o maskafterc.bin -O3 -march=native -Wall
__m128i maskafterc(__m128i input, uint8_t c, uint8_t* restrict pos) {
// Finds first occurance of c in input and takes its position pos.
// Returns mask of 255s before pos, 0s on and after.
// e.g. maskafterc([5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], 9, uint8_t *pos)
// sets pos = 4 and returns [255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0].
__m128i index = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
__m128i cmp = _mm_cmpeq_epi8(input, _mm_set1_epi8(c));
uint32_t mmask = _mm_movemask_epi8(cmp);
*pos = (mmask ? __builtin_ctz(mmask) : 16); // Return all -1s if c not found
return _mm_cmplt_epi8(index, _mm_set1_epi8(*pos));
}
int main(int argc, char **argv)
{
unsigned char data[] = "mykeyxyz:98765211234\naaaaaaaaaaa";
uint8_t pos;
__m128i chars = _mm_loadu_si128((__m128i*)data);
__m128i res =_mm_and_si128(maskafterc(chars, ':', &pos), chars);
if (pos < 16) puts((char*) &res);
printf("keylen = %i\n", pos);
}
//mykeyxyz
//keylen = 8
EDIT: AVX2 version.
__m256i maskafterc256(__m256i input, uint8_t c, uint8_t* restrict pos) {
// Finds first occurance of c in input and takes its position pos.
// Returns mask of 255s before pos, 0s on and after.
__m256i index = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31);
__m256i cmp = _mm256_cmpeq_epi8(input, _mm256_set1_epi8(c));
uint32_t mmask = _mm256_movemask_epi8(cmp);
*pos = (mmask ? __builtin_ctz(mmask) : 32); // Return all -1s if c not found
return _mm256_cmpgt_epi8(_mm256_set1_epi8(*pos), index);
}