Search code examples
c++assemblyoptimizationsimdavx

Fastest way to mask out bytes higher than separator position with SIMD


uint8_t data[] = "mykeyxyz:1234\nky:123\n...";. My lines of string has format key:value, where each line has len(key) <= 16 guaranteed. I want to load mykeyxyz into a __m128i, but fill out the higher position with 0.

The easiest way is to have an array of 255 or 0 masks, but that requires another memory load. Is there anyway to do this faster?

The accepted answer gives ~2% faster total program time. To compare, test 1brc_valid13.cpp against 1brc_valid14.cpp (which uses the accepted answer). Hardware: AMD 2950X, Ubuntu 18.04, g++ 11.4, compile command: g++ -o main 1brc_final_valid.cpp -O3 -std=c++17 -march=native -m64 -lpthread

Edit: preferably without AVX512

Edit 2: I need the variable len so I can start parsing the value part.

Edit 3: the function will be used in a loop (for example to parse 1 million lines of text). But strcmp_mask will basically always be inside L1 cache

Edit 4: I benchmark the functions by parsing 1 billion lines of (key,value) and process them. You can download the code/data and replicate the results in my repo: https://github.com/lehuyduc/1brc-simd . Also the discussion post will contain more info

Edit 5: I tested maskafterc256 and found that it caused my code to be 50x slower!!! If I replace _mm256_set_epi8 with _mm256_setr_epi8, then it becomes 500+x slower (took so long that I just Ctrl-C). I'm not sure what _mm256_set_epi8 does, because it's translated into a sequence of instructions instead of a single one.

perf stat -d ./main result for maskafterc

         14,470.46 msec task-clock                #   20.785 CPUs utilized
             3,032      context-switches          #    0.210 K/sec
                 5      cpu-migrations            #    0.000 K/sec
           341,894      page-faults               #    0.024 M/sec
    55,073,525,723      cycles                    #    3.806 GHz                      (37.19%)
     1,714,679,575      stalled-cycles-frontend   #    3.11% frontend cycles idle     (36.71%)
    11,442,758,125      stalled-cycles-backend    #   20.78% backend cycles idle      (36.92%)
    80,739,874,133      instructions              #    1.47  insn per cycle
                                                  #    0.14  stalled cycles per insn  (37.39%)
     8,661,529,181      branches                  #  598.566 M/sec                    (38.22%)
        39,299,214      branch-misses             #    0.45% of all branches          (38.13%)
    17,784,400,757      L1-dcache-loads           # 1229.015 M/sec                    (37.93%)
       827,509,870      L1-dcache-load-misses     #    4.65% of all L1-dcache hits    (37.52%)
   <not supported>      LLC-loads
   <not supported>      LLC-load-misses

       0.696207306 seconds time elapsed

      12.918590000 seconds user
       1.546535000 seconds sys

perf stat -d ./main result for maskafterc256

Performance counter stats for './main':

      1,047,414.73 msec task-clock                #   29.982 CPUs utilized
           125,296      context-switches          #    0.120 K/sec
               211      cpu-migrations            #    0.000 K/sec
           341,889      page-faults               #    0.326 K/sec
 4,229,832,527,830      cycles                    #    4.038 GHz                      (37.50%)
    10,965,796,240      stalled-cycles-frontend   #    0.26% frontend cycles idle     (37.50%)
   167,711,051,408      stalled-cycles-backend    #    3.96% backend cycles idle      (37.49%)
   296,573,918,148      instructions              #    0.07  insn per cycle
                                                  #    0.57  stalled cycles per insn  (37.50%)
    44,843,867,352      branches                  #   42.814 M/sec                    (37.50%)
        56,509,334      branch-misses             #    0.13% of all branches          (37.51%)
    91,621,829,978      L1-dcache-loads           #   87.474 M/sec                    (37.50%)
    18,776,996,709      L1-dcache-load-misses     #   20.49% of all L1-dcache hits    (37.51%)
   <not supported>      LLC-loads
   <not supported>      LLC-load-misses

      34.935225940 seconds time elapsed

    1039.609651000 seconds user
       6.774492000 seconds sys
#include <iostream>
#include <immintrin.h>
#include <string>
#include <cstring>
using namespace std;

alignas(4096) const uint8_t strcmp_mask[32] = {
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

int main()
{
  uint8_t data[] = "mykeyxyz:1234\naaaaaaaaaaa";
  __m128i chars = _mm_loadu_si128((__m128i*)data);
  __m128i separators = _mm_set1_epi8(':');
  __m128i compared = _mm_cmpeq_epi8(chars, separators);
  uint32_t separator_mask = _mm_movemask_epi8(compared);
  uint32_t len = __builtin_ctz(separator_mask);
  
  cout << "len = " << len << "\n";
  __m128i mask = _mm_loadu_si128((__m128i*)(strcmp_mask + 16 - len));
  __m128i key_chars = _mm_and_si128(chars, mask);
  
  uint8_t res[16];
  memcpy(res, (char*)&key_chars, 16);
  for (int i = 0; i < 16; i++) cout << int(res[i]) << " ";
  cout << "\n";
}
// len = 8
// 109 121 107 101 121 120 121 122 0 0 0 0 0 0 0 0

Solution

  • I often find it interesting to see how others approach a problem, so here's my version. It only requires SSE2, but benefits from SSSE3, and BMI1 for the trailing zeros calculation.

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <immintrin.h>
    
    // gcc maskafterc.c -o maskafterc.bin -O3 -march=native -Wall
    
    __m128i maskafterc(__m128i input, uint8_t c, uint8_t* restrict pos) {
      // Finds first occurance of c in input and takes its position pos. 
      // Returns mask of 255s before pos, 0s on and after.
      // e.g. maskafterc([5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], 9, uint8_t *pos)
      // sets pos = 4 and returns [255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0].
      __m128i index = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
      __m128i cmp = _mm_cmpeq_epi8(input, _mm_set1_epi8(c));
      uint32_t mmask = _mm_movemask_epi8(cmp);
      *pos = (mmask ? __builtin_ctz(mmask) : 16); // Return all -1s if c not found
      return _mm_cmplt_epi8(index, _mm_set1_epi8(*pos));
    }
    
    int main(int argc, char **argv)
    {
      unsigned char data[] = "mykeyxyz:98765211234\naaaaaaaaaaa";
      uint8_t pos;
      __m128i chars = _mm_loadu_si128((__m128i*)data);
      __m128i res =_mm_and_si128(maskafterc(chars, ':', &pos), chars);
      if (pos < 16) puts((char*) &res);
      printf("keylen = %i\n", pos);
    }
    //mykeyxyz
    //keylen = 8
    

    EDIT: AVX2 version.

    __m256i maskafterc256(__m256i input, uint8_t c, uint8_t* restrict pos) {
      // Finds first occurance of c in input and takes its position pos. 
      // Returns mask of 255s before pos, 0s on and after.
      __m256i index = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                      10, 11, 12, 13, 14, 15, 16,
                                      17, 18, 19, 20, 21, 22, 23, 24,
                                      25, 26, 27, 28, 29, 30, 31);
      __m256i cmp = _mm256_cmpeq_epi8(input, _mm256_set1_epi8(c));
      uint32_t mmask = _mm256_movemask_epi8(cmp);
      *pos = (mmask ? __builtin_ctz(mmask) : 32); // Return all -1s if c not found
      return _mm256_cmpgt_epi8(_mm256_set1_epi8(*pos), index);
    }