
Count leading zeros in __m256i word


I'm tinkering around with AVX2 instructions and I'm looking for a fast way to count the number of leading zeros in a __m256i word (which has 256 bits).

So far, I have figured out the following way:

// Computes the number of leading zero bits.
// Here, avx_word is of type __m256i.

if (!_mm256_testz_si256(avx_word, avx_word)) {
  uint64_t word = _mm256_extract_epi64(avx_word, 0);
  if (word > 0)
    return (__builtin_clzll(word));

  word = _mm256_extract_epi64(avx_word, 1);
  if (word > 0)
    return (__builtin_clzll(word) + 64);

  word = _mm256_extract_epi64(avx_word, 2);
  if (word > 0)
    return (__builtin_clzll(word) + 128);

  word = _mm256_extract_epi64(avx_word, 3);
  return (__builtin_clzll(word) + 192);
} else
  return 256; // word is entirely zero

However, I find it rather clumsy to figure out the exact non-zero word within the 256-bit register.

Does anybody know if there is a more elegant (or faster) way to do this?

Just as additional information: I actually want to compute the index of the first set bit for arbitrarily long vectors created by logical ANDs, and I am comparing the performance of standard 64-bit operations with SSE and AVX2 code. Here is my entire test code:

#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include <sys/time.h>
#include <stdalign.h>

#define ALL  0xFFFFFFFF
#define NONE 0x0


#define BV_SHIFTBITS ((size_t)    6)
#define BV_MOD_WORD  ((size_t)   63)
#define BV_ONE       ((uint64_t)  1)
#define BV_ZERO      ((uint64_t)  0)
#define BV_WORDSIZE  ((uint64_t) 64)


uint64_t*
Vector_new(
    size_t num_bits) {

  assert ((num_bits % 256) == 0);
  size_t num_words = num_bits >> BV_SHIFTBITS;
  size_t mod = num_bits & BV_MOD_WORD;
  if (mod > 0)
    assert (0);
  uint64_t* words;
  posix_memalign((void**) &(words), 32, sizeof(uint64_t) * num_words);
  for (size_t i = 0; i < num_words; ++i)
    words[i] = 0;
  return words;
}


void
Vector_set(
    uint64_t* vector,
    size_t pos) {

  const size_t word_index = pos >> BV_SHIFTBITS;
  const size_t offset     = pos & BV_MOD_WORD;
  vector[word_index] |= (BV_ONE << (BV_MOD_WORD - offset));
}


size_t
Vector_and_first_bit(
    uint64_t** vectors,
    const size_t num_vectors,
    const size_t num_words) {

  for (size_t i = 0; i < num_words; ++i) {
    uint64_t word = vectors[0][i];
    for (size_t j = 1; j < num_vectors; ++j)
      word &= vectors[j][i];
    if (word > 0)
      return (1 + i * BV_WORDSIZE + __builtin_clzll(word));
  }
  return 0;
}


size_t
Vector_and_first_bit_256(
    uint64_t** vectors,
    const size_t num_vectors,
    const size_t num_avx_words) {

  for (size_t i = 0; i < num_avx_words; ++i) {
    const size_t addr_offset = i << 2;
    __m256i avx_word = _mm256_load_si256(
        (__m256i const*) (vectors[0] + addr_offset));

    // AND the AVX words
    for (size_t j = 1; j < num_vectors; ++j) {
      avx_word = _mm256_and_si256(
        avx_word,
        _mm256_load_si256((__m256i const*) (vectors[j] + addr_offset))
      );
    }

    // test whether resulting AVX word is not zero
    if (!_mm256_testz_si256(avx_word, avx_word)) {
      uint64_t word = _mm256_extract_epi64(avx_word, 0);
      const size_t shift = i << 8;
      if (word > 0)
        return (1 + shift + __builtin_clzll(word));

      word = _mm256_extract_epi64(avx_word, 1);
      if (word > 0)
        return (1 + shift + __builtin_clzll(word) + 64);

      word = _mm256_extract_epi64(avx_word, 2);
      if (word > 0)
        return (1 + shift + __builtin_clzll(word) + 128);

      word = _mm256_extract_epi64(avx_word, 3);
      return (1 + shift + __builtin_clzll(word) + 192);
    }
  }
  return 0;
}


size_t
Vector_and_first_bit_128(
    uint64_t** vectors,
    const size_t num_vectors,
    const size_t num_avx_words) {

  for (size_t i = 0; i < num_avx_words; ++i) {
    const size_t addr_offset = i << 1;
    __m128i avx_word = _mm_load_si128(
        (__m128i const*) (vectors[0] + addr_offset));

    // AND the AVX words
    for (size_t j = 1; j < num_vectors; ++j) {
      avx_word = _mm_and_si128(
        avx_word,
        _mm_load_si128((__m128i const*) (vectors[j] + addr_offset))
      );
    }

    // test whether resulting AVX word is not zero
    if (!_mm_test_all_zeros(avx_word, avx_word)) {
      uint64_t word = _mm_extract_epi64(avx_word, 0);
      if (word > 0)
        return (1 + (i << 7) + __builtin_clzll(word));

      word = _mm_extract_epi64(avx_word, 1);
      return (1 + (i << 7) + __builtin_clzll(word) + 64);
    }
  }
  return 0;
}


uint64_t*
make_random_vector(
    const size_t num_bits,
    const size_t probability) {

  uint64_t* vector = Vector_new(num_bits);
  for (size_t i = 0; i < num_bits; ++i) {
    const int x = rand() % 10;
    if (x >= (int) probability)
      Vector_set(vector, i);
  }
  return vector;
}


size_t
millis(
    const struct timeval* end,
    const struct timeval* start) {

  struct timeval e = *end;
  struct timeval s = *start;
  return (1000 * (e.tv_sec - s.tv_sec) + (e.tv_usec - s.tv_usec) / 1000);
}


int
main(
    int argc,
    char** argv) {

  if (argc != 6) {
    printf("Usage: %s num_vectors size num_iterations num_dimensions probability\n",
        argv[0]);
    return 1;
  }

  srand(time(NULL));

  const size_t num_vectors = atoi(argv[1]);
  const size_t size = atoi(argv[2]);
  const size_t num_iterations = atoi(argv[3]);
  const size_t num_dimensions = atoi(argv[4]);
  const size_t probability = atoi(argv[5]);
  const size_t num_words = size / 64;
  const size_t num_sse_words = num_words / 2;
  const size_t num_avx_words = num_words / 4;

  assert(num_vectors > 0);
  assert(size > 0);
  assert(num_iterations > 0);
  assert(num_dimensions > 0);

  struct timeval t1;
  gettimeofday(&t1, NULL);

  uint64_t*** vectors = (uint64_t***) malloc(sizeof(uint64_t**) * num_vectors);
  for (size_t j = 0; j < num_vectors; ++j) {
    vectors[j] = (uint64_t**) malloc(sizeof(uint64_t*) * num_dimensions);
    for (size_t i = 0; i < num_dimensions; ++i)
      vectors[j][i] = make_random_vector(size, probability);
  }

  struct timeval t2;
  gettimeofday(&t2, NULL);
  printf("Creation: %zu ms\n", millis(&t2, &t1));



  size_t* results_64    = (size_t*) malloc(sizeof(size_t) * num_vectors);
  size_t* results_128   = (size_t*) malloc(sizeof(size_t) * num_vectors);
  size_t* results_256   = (size_t*) malloc(sizeof(size_t) * num_vectors);


  gettimeofday(&t1, NULL);
  for (size_t j = 0; j < num_iterations; ++j)
    for (size_t i = 0; i < num_vectors; ++i)
      results_64[i] = Vector_and_first_bit(vectors[i], num_dimensions,
          num_words);
  gettimeofday(&t2, NULL);
  const size_t millis_64 = millis(&t2, &t1);
  printf("64            : %zu ms\n", millis_64);


  gettimeofday(&t1, NULL);
  for (size_t j = 0; j < num_iterations; ++j)
    for (size_t i = 0; i < num_vectors; ++i)
      results_128[i] = Vector_and_first_bit_128(vectors[i],
          num_dimensions, num_sse_words);
  gettimeofday(&t2, NULL);
  const size_t millis_128 = millis(&t2, &t1);
  const double factor_128 = (double) millis_64 / (double) millis_128;
  printf("128           : %zu ms (factor: %.2f)\n", millis_128, factor_128);

  gettimeofday(&t1, NULL);
  for (size_t j = 0; j < num_iterations; ++j)
    for (size_t i = 0; i < num_vectors; ++i)
      results_256[i] = Vector_and_first_bit_256(vectors[i],
          num_dimensions, num_avx_words);
  gettimeofday(&t2, NULL);
  const size_t millis_256 = millis(&t2, &t1);
  const double factor_256 = (double) millis_64 / (double) millis_256;
  printf("256           : %zu ms (factor: %.2f)\n", millis_256, factor_256);


  for (size_t i = 0; i < num_vectors; ++i) {
    if (results_64[i] != results_256[i])
      printf("ERROR: %zu (64) != %zu (256) with i = %zu\n", results_64[i],
          results_256[i], i);
    if (results_64[i] != results_128[i])
      printf("ERROR: %zu (64) != %zu (128) with i = %zu\n", results_64[i],
          results_128[i], i);
  }


  free(results_64);
  free(results_128);
  free(results_256);

  for (size_t j = 0; j < num_vectors; ++j) {
    for (size_t i = 0; i < num_dimensions; ++i)
      free(vectors[j][i]);
    free(vectors[j]);
  }
  free(vectors);
  return 0;
}

To compile:

gcc -o main main.c -O3 -Wall -Wextra -pedantic-errors -Werror -march=native -std=c99 -fno-tree-vectorize

To execute:

./main 1000 8192 50000 5 9

The parameters mean: 1000 test cases, vectors of length 8192 bits, 50000 test repetitions (the last two parameters are minor tweaks).

Sample output for the above call on my machine:

Creation: 363 ms
64            : 15000 ms
128           : 10070 ms (factor: 1.49)
256           : 6784 ms (factor: 2.21)

Solution

  • If your input values are uniformly distributed, almost all of the time the highest set bit will be in the top 64 bits of the vector (there's only a 1 in 2^64 chance that the top 64 bits are all zero). A branch on this condition will predict very well. @Nejc's answer is good for that case; a minimal sketch of that approach follows below.
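
    As a baseline for that easy case, here is a minimal sketch of the "check the high half first" idea (my code, not taken from either answer; the function name is made up). It treats qword 3 as the most significant 64 bits, i.e. the vector as a little-endian 256-bit integer, and the non-zero checks keep __builtin_clzll out of undefined-behaviour territory. It assumes <immintrin.h> and <stdint.h> as in the question's code:

    static inline
    int lzcnt_si256_scalar_extract(__m256i vec)
    {
        // Check the top qword first: for uniformly-random data this branch is
        // almost always taken, so it predicts very well.
        uint64_t w = _mm256_extract_epi64(vec, 3);
        if (w) return __builtin_clzll(w);
        w = _mm256_extract_epi64(vec, 2);
        if (w) return 64 + __builtin_clzll(w);
        w = _mm256_extract_epi64(vec, 1);
        if (w) return 128 + __builtin_clzll(w);
        w = _mm256_extract_epi64(vec, 0);
        if (w) return 192 + __builtin_clzll(w);
        return 256;   // whole vector is zero
    }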

    But many problems where lzcnt is part of the solution have more uniformly distributed output, where it's common for the highest set bit to be somewhere other than the highest 64 bits.


    Wim's idea of lzcnt on a compare bitmap to find the right element is a very good approach.

    However, runtime-variable indexing of the vector with a store/reload is probably better than a shuffle. Store-forwarding latency is low (maybe 5 to 7 cycles on Skylake), and some of that latency is in parallel with the index generation (compare / movemask / lzcnt). The movd/vpermd/movd lane-crossing shuffle strategy takes 7 cycles on Skylake to get the right element into an integer register, only starting after the index is known. Load-use latency for an L1d cache hit is only 5 cycles from address to data, probably similar when data comes from the store buffer. (4c only in pointer-chasing scenarios). See http://agner.org/optimize/, and especially https://uops.info/ which has latency measurements for movd. If you look at the details, one of the measurements is for a round-trip chain of movd to/from xmm0. On Haswell it's 2 cycles per round trip, so 5 total with a vpermd in between. On Skylake, a movd round trip has 4 cycle latency, presumably 3 cycles one way, 1 the other, so 7 cycles with vpermd.

    I think this store/reload version should be better or equal latency on Haswell/Skylake, and also better throughput. Much better on Ryzen. (vpermd is slower on Zen than on Intel before Zen 4. And a movd round-trip on Zen 4 is about 7 cycles, with one of the movd instructions being 2 uops; it's a 6-cycle round trip on Zen 2 and 3 with single-uop movd.)

    Some scalar math is needed on an lzcnt result to get a load index, which eats into the latency and throughput advantage some, depending on how clever the compiler is. lzcnt on a vmovmskps result works directly as a shuffle index for vpermd.
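
    For reference, here is a hedged sketch of the shuffle strategy being compared against (a compare bitmap via vmovmskps, then a runtime-variable vpermd, then vmovd back to an integer register). This is my own spelling of it, not code from the answer: it does the 31 - lzcnt index math explicitly rather than any cheaper trick, and the function name is made up.

    static inline
    int lzcnt_si256_vpermd(__m256i vec)
    {
        __m256i  vzero  = _mm256_cmpeq_epi32(vec, _mm256_setzero_si256());
        unsigned nzmask = 0xFFu ^ (unsigned)_mm256_movemask_ps(_mm256_castsi256_ps(vzero)); // bit i = dword i non-zero
        if (nzmask == 0)
            return 256;
        unsigned idx = 31 - _lzcnt_u32(nzmask);            // index of the highest non-zero dword
        // set1 of a runtime value = vmovd + vpbroadcastd, then vpermd: the lane-crossing chain discussed above
        __m256i  perm = _mm256_permutevar8x32_epi32(vec, _mm256_set1_epi32((int)idx));
        uint32_t top_dword = (uint32_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(perm));     // vmovd back to integer
        return 32*(7 - idx) + _lzcnt_u32(top_dword);
    }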

    Aligning the stack by 32 avoids cache-line splits on a 32-byte store, but it takes extra instructions. So this is best if it can inline into a function that uses it multiple times, or already needs that much alignment for some other __m256i. GCC will align the stack whether you ask for it or not, but MSVC and Clang won't. Store-forwarding from a dword that's aligned relative to the store works even if the store itself is a cache-line split, on most modern CPUs I think.

    static inline
    int lzcnt_si256(__m256i vec)
    {
        // or just lzcnt_si256_memsrc(&vec) optimizes away store/reload
        __m256i   vnonzero = _mm256_cmpeq_epi32(vec, _mm256_setzero_si256());
        uint32_t  nzmask = ~ _mm256_movemask_epi8(vnonzero);   //  1 for bytes that are part of non-zero dwords
        // nzmask |= 0xf;  // branchless clamp to last elem
        if (nzmask == 0)     // all 32 bits came from vpmovmskb, so NOT didn't introduce any constant 1s
            return 256;      // don't access outside the array
        alignas(32) uint32_t elems[8];
        _mm256_storeu_si256((__m256i*)elems, vec);
    
        unsigned  lzbytes = _lzcnt_u32(nzmask);   // bytes above the dword containing the nonzero bit.
        unsigned char *end_elem = 28 + (unsigned char*)elems;
        uint32_t *nz_elem = (uint32_t*)(end_elem - lzbytes);  // idx = 31-(lzcnt+3) = 28-lzcnt
        return    8*lzbytes + _lzcnt_u32(*nz_elem);  // this is an aligned load, memcpy not needed
    }
    
    // unaligned and strict-aliasing safe
    static inline
    int lzcnt_si256_memsrc(const void *m256)
    {
        __m256i  vec = _mm256_loadu_si256((const __m256i*)m256);
        __m256i  vnonzero = _mm256_cmpeq_epi32(vec, _mm256_setzero_si256());
        uint32_t nzmask = ~ _mm256_movemask_epi8(vnonzero);
        // nzmask |= 0xf;  // branchless clamp to last elem
        if (nzmask == 0)    // all 32 bits came from vpmovmskb, so NOT didn't introduce any constant 1s
            return 256;     // don't access outside the array
    
        unsigned  lzbytes = _lzcnt_u32(nzmask);
        unsigned char *end_elem = 28 + (unsigned char*)m256;  // can be done as part of the addressing mode after sub
        uint32_t *nz_elem = (uint32_t*)(end_elem - lzbytes);  // idx = MSB_idx-3 = 31-(lzcnt+3) = 28-lzcnt
        uint32_t nz_dword;
        memcpy(&nz_dword, nz_elem, sizeof(nz_dword));  // For strict-aliasing safety,  and/or if m256 wasn't aligned by 4.  __attribute__((aligned(1),may_alias)) on the pointer would work in GNU C
        return    8*lzbytes + _lzcnt_u32(nz_dword);
    }
    

    GCC and clang don't optimize away the storeu copy if the vector was just loaded, so I made a separate version, unfortunately. (If your data isn't 4-byte aligned and you're getting cache-line splits on the dword load as well as the vector load, consider using the byte version that loads with movzx.)
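
    As a quick sanity check (my own test harness, not part of the answer), both helpers above can be pasted into one file and compared against a plain scalar reference; compile with something like -O3 -march=haswell:

    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>    // memcpy used by lzcnt_si256_memsrc
    #include <stdalign.h>  // alignas used by lzcnt_si256
    // ... lzcnt_si256() and lzcnt_si256_memsrc() from above go here ...

    static int lzcnt_si256_ref(__m256i v)   // scalar reference: qword 3 is most significant
    {
        uint64_t q[4];
        _mm256_storeu_si256((__m256i*)q, v);
        for (int i = 3; i >= 0; --i)
            if (q[i])
                return (3 - i) * 64 + __builtin_clzll(q[i]);
        return 256;
    }

    int main(void)
    {
        // highest set bit is bit 40 of qword 1, so 64 + 64 + 23 = 151 leading zeros
        __m256i v = _mm256_set_epi64x(0, 0, 1ULL << 40, 123);
        printf("%d %d %d\n", lzcnt_si256(v), lzcnt_si256_memsrc(&v), lzcnt_si256_ref(v));
        // all three should print 151
        return 0;
    }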

    On Godbolt with GCC13 -O3 -march=x86-64-v3, we get asm like this to count ymm0 into esi (inlined into main's loop.)

    # GCC13.2 -O3 -march=x86-64-v3     (or -march=haswell)
    # lzcnt_si256_memsrc inside a loop in main, after printf
       vmovdqa ymm0, YMMWORD PTR [rsp]
       vpxor   xmm1, xmm1, xmm1
       mov     esi, 256
       vpcmpeqd        ymm0, ymm0, ymm1  # could have used a memory source operand
       vpmovmskb       eax, ymm0
       xor     eax, -1                   # ~mask and set FLAGS, unlike the NOT instruction
       je      .L33
       xor     ecx, ecx            # dep-breaking for lzcnt
       mov     edx, 28
       lzcnt   ecx, eax
       sub     rdx, rcx
       lzcnt   edx, DWORD PTR [rsp+32+rdx]  # Un-laminates into 2 uops with an indexed addr mode.  mov rdx, rsp ; sub would have allowed [rdx+32+28] here
       lea     esi, [rdx+rcx*8]          # lzbytes*8 + lzcnt(nz chunk)
    .L33:   # jump target for mask==0
    

    Clang likes to do vptest ymm0, ymm0 / jne for the early-out, costing more uops than just waiting to test the movemask result. Perhaps an [[unlikely]] annotation or non-portable equivalent could help.

    GCC's non-inline version is better in some ways (sub against the pointer arg and +28 as part of the addr mode for lzcnt edx, [rdi+28] which can stay micro-fused on Intel since it only uses one reg.) But GCC wastes a mov reg copy and two dep-breaking xor-zeroing instructions even though both lzcnt instructions can overwrite their inputs (or a reg holding a pointer for the mem-src version). Sometimes it's possible to rearrange your C in ways that help, but it depends on the surrounding code where this inlines.


    bsr instead of 31 - lzcnt on the mask could reduce critical path latency on Intel: no SUB or NEG, just adding something as part of the addressing mode for the scalar load. GCC8 and earlier would emit it for 31-__builtin_clz(), but current GCC just uses 31-lzcnt or 31^lzcnt even with -march=haswell where both have identical performance characteristics (including the output dependency.)

    If you're tuning specifically for Intel, BSR might still be a good idea. But for portable software, BSR is significantly slower on AMD than LZCNT, which is relevant everywhere except x86-64 macOS. But good luck getting compilers other than MSVC to emit it.

    #ifndef _MSC_VER
    #include <stdalign.h>  //MSVC is missing this?
    #else
    #include <intrin.h>
    #pragma intrinsic(_BitScanReverse)  // https://msdn.microsoft.com/en-us/library/fbxyd7zd.aspx suggests this
    #endif
    
    // undefined result for mask=0, like BSR
    static inline uint32_t bsr_nonzero(uint32_t mask)
    {
    // on Intel, bsr has a minor advantage for the first step
    // for AMD, BSR is slow so you should use 31-LZCNT.
    
     // Intel's docs say there should be a _bit_scan_reverse(x), maybe try that with ICC
     #ifdef _MSC_VER
        unsigned long tmp;
        _BitScanReverse(&tmp, mask);
        return tmp;
     #else
        return 31 - __builtin_clz(mask);  // GCC 9 and later use lzcnt even with -march=haswell implying tune for Intel, leading to worse code.  Same for clang
     #endif
    }
    

    In C++20 there's 31 - std::countl_zero(mask) - all CPUs with AVX2 also have BMI1/BMI2, so compilers will be able to use lzcnt for it. There's no portable C equivalent. (On CPUs without BMI1, countl_zero would be slightly slower because it guarantees 32 for mask=0, unlike the bsr instruction or intrinsic, or GNU __builtin_clz. So it would branch or CMOV on the input being zero.)

    BSR version using byte elements

    Not recommended in general, but I didn't want to throw this away. It compiles to a BSR on MSVC but is worse with GCC and Clang. Change the #if to 1 to use lzcnt instead of BSR; either way, this byte-source version is useful for unaligned data, to avoid cache-line and page splits on the scalar load (but not on the vector load).

    int lzcnt_si256_byte_memsrc(const void *m256)
    {
        __m256i  vec = _mm256_loadu_si256((const __m256i*)m256);
        __m256i   nonzero_elem = _mm256_cmpeq_epi8(vec, _mm256_setzero_si256());
        uint32_t  mask = ~_mm256_movemask_epi8(nonzero_elem);   //  1 where there are non-zero bytes
    
        // alternative: mask |= 1 to lzcnt the low byte in that case, giving 32-24 = 8
        if (mask == 0)
            return 256;
        unsigned char *elems = (unsigned char*)m256;
    
    #if 0  // GCC13 compiles both the same, clang's __builtin_clz version is worse (xor twice)
    // Unless you're specifically tuning for Intel, use this version
        uint8_t *end_elem = elems + 31;
        unsigned   lz_msk   = _lzcnt_u32(mask);
        size_t   idx_from_end = -(size_t)lz_msk;    // idx = 31 - lz_msk;  // relative to start
        unsigned   highest_nonzero_byte = end_elem[idx_from_end];
    #else
        unsigned   idx = bsr_nonzero(mask);   // use bsr to get the 31-x for free, because it's known to be non-zero
        unsigned   lz_msk = 31 - idx;    // off the critical path, if compilers actually use BSR
    // MSVC notices that (31-idx)*8 - 24 + x  is (28-idx)*8 + x,  allowing a 2-component LEA
    // GCC and clang miss that, in older versions that actually use BSR
        unsigned   highest_nonzero_byte = elems[idx];
    #endif
        return     lz_msk * 8 + _lzcnt_u32(highest_nonzero_byte) - 24;
        // lzcnt(byte)-24, because we don't want to count the leading 24 bits of padding.
    }
    

    On AMD CPUs, bsr is significantly slower than lzcnt. On Intel CPUs, they're the same performance, except for minor variations in output-dependency details. (lzcnt has a false dependency on its output register before Skylake. bsr has a true dependency on its output register on all CPUs.)

    bsr with input=0 leaves the destination register unmodified, but intrinsics don't provide a way to take advantage of that to get CMOV-like behaviour for free. (Intel only documents it as undefined output, but AMD documents the actual behaviour of Intel / AMD CPUs as producing the old value in the destination register).

    bsr sets ZF if the input was zero, rather than based on the output like most instructions. (This and the output dependency may be why it's slow on AMD.) Branching on the BSR flags is not particularly better than branching on ZF as set by xor eax,-1 to invert the mask, which is what gcc does. Anyway, Intel does document a _BitScanReverse(&idx, mask) intrinsic that returns a bool, but gcc doesn't support it (not even with x86intrin.h). The GNU C builtin doesn't return a boolean to let you use the flag result, and doesn't normally use the FLAGS result even when you check the input C variable for non-zero.


    Wim's version needs lz_msk-24 because the high 24 bits are always zero with an 8-bit mask. But a 32-bit mask fills a 32-bit reg.

    This version with 8 bit elements and a 32-bit mask is the reverse: we need to lzcnt the selected byte, not including the 24 leading zero bits in the register. So our -24 moves to a different spot, not part of the critical path for indexing the array.

    GCC chooses to do it as part of a single 3-component LEA (reg + reg*scale - const), which is great for throughput, but puts it on the critical path after the final lzcnt. (3-component LEA has extra latency vs. reg + reg*scale on Intel before Ice Lake. See Agner Fog's instruction tables, and https://uops.info/ for testing of different LEA addr modes on Ice Lake and Alder Lake; any scaled index is now a slow-LEA, but still 1c latency until Alder, and more ports.)

    Clang with some -march options makes a mess, favouring latency over throughput by using more instructions instead of a complex LEA. But sometimes it breaks things up more than necessary.

    A multiply by 8 can be done as part of an lea, but a multiply by 32 would need a shift (or be folded into two separate LEAs).


    Intel's optimization manual says (Table 2-24) even Sandybridge can forward from a 256-bit store to single-byte loads without a problem, so I think it's fine on AVX2 CPUs, the same as forwarding to 32-bit loads of 4-byte-aligned chunks of the store.


    AVX-512 versions, maybe not faster, just an experiment

    Store/reload might still be a better strategy, especially with data already in memory to avoid the store part. We can make the low element special, always true or always false, with vptestnmb against a vector of all-ones except the low dword, getting the nz |= 0xf for free.

    AVX-512 has vplzcntq to do per-element 64-bit bit-scan. Doing that before a store could shorten critical-path latency by overlapping the lzcnt with the element-index calculation. But it takes a vector ALU instead of the port-1 ALU on Intel CPUs which can't run 512-bit vector uops (or any vector uops when there are any 512-bit uops in flight.)
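
    One possible reading of that idea, as a hedged sketch (my own code and function name, not from the answer; it needs AVX-512VL + AVX-512CD, e.g. -march=icelake-client): do the per-qword lzcnt and the store up front, so the scalar side only has to pick an element.

    static inline
    int lzcnt_si256_avx512_vplzcnt(__m256i vec)
    {
        __m256i  perq = _mm256_lzcnt_epi64(vec);           // per-qword lzcnt, 64 for all-zero qwords
        __mmask8 nz   = _mm256_test_epi64_mask(vec, vec);  // 1 per non-zero qword
        uint64_t lz[4];
        _mm256_storeu_si256((__m256i*)lz, perq);           // store can overlap with the scalar mask work
        if (nz == 0)
            return 256;
        unsigned idx = 31 - _lzcnt_u32(nz);                // highest non-zero qword (0..3)
        return (3 - (int)idx) * 64 + (int)lz[idx];
    }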

    The fun way is to shuffle and blend using the mask to pick one of four elements. I was hoping I could use vpcompressq to left-pack, but that gets the lowest element, not the highest. A horizontal-sum type of shuffle + blend pattern can grab the lower element only if the high element's mask bit was zero. Shuffles themselves can use merge-masking, including into the same reg for single-uop reduction steps. But the mask instructions to prepare the mask aren't fast.

    // See Godbolt for the first 2 attempts at this
    
    // this isn't great either
    int lzcnt_si256_avx512_v2(__m256i vec)
    {
        __mmask8 nz = _mm256_test_epi64_mask(vec, vec); // 1 where input is non-zero, useful for merging higher into lower elements
        __m256i vclz = _mm256_lzcnt_epi64(vec);
        vclz = _mm256_add_epi64(vclz, _mm256_set_epi64x(0, 64, 128, 192));  // add the number of bits to the left of each element
        int nz_shr = nz>>1;
        vclz = _mm256_mask_unpackhi_epi64(vclz, nz_shr, vclz, vclz); //  even elements = higher elem if it was non-zero (nz_mask=1 merge masking), else keep the lower element
        nz_shr |= nz;  // kor (or scalar or): 1 cycle latency on port 0
        nz_shr >>= 2;  // kshiftr: 4 cycle latency on port 5 (Intel).  Zen 4 is 1c latency on 2 ports
        __m128i low = _mm256_mask_extracti32x4_epi32(_mm256_castsi256_si128(vclz), nz_shr, vclz, 1);
        return _mm_cvtsi128_si32(low);
    }
    

    Trailing zero count via vpcompressd to left-pack based on non-zero mask

    // Just for fun, use vpcompressd to grab the first non-zero element
    // Worth comparing for throughput and latency on Intel and Zen 4, vs. store/reload
    static inline
    int tzcnt_si256_avx512(__m256i vec)
    {
        __mmask8 mask = _mm256_test_epi32_mask(vec, vec);
        __m256i pack_nz = _mm256_maskz_compress_epi32(mask, vec);
        uint32_t first_nz = _mm256_extract_epi32(pack_nz, 0);  // optimizes to vmovd
        uint32_t mask32 = mask | (-1U<<7); // produce 256 = 7*32 + 32 for an all-zero vector where tzcnt(first_nz) = 32.
          // -1U<<7 fits in signed imm8.  With 1<<7, compilers were going nuts trying to optimize
          // alternative: if (mask==0) return 256;
        return _tzcnt_u32(first_nz) + 32*_tzcnt_u32(mask32);
    }
    

    On Godbolt I included a version that uses VBMI2 (Ice Lake) vpcompressb so the final x + 32*y can be *8 instead, allowing LEA. But that makes it harder to handle an all-zero mask; see the code comments on Godbolt.

    Another tzcnt strategy could involve vplzcntq( v & -v ), and subtracting that from a vector of set_epi64x(192+63, 128+63, 64+63, 63) to get tzcnt=63-lzcnt(blsi(vec)). Then select the right non-zero element? That's more vector ops, but runs the lzcnt dep chain in parallel with the compress shuffle. vpcompress* is 2 uops, and the first only needs the mask as input, not the vector being shuffled. (Presumably it processes a mask into a shuffle-control for vperm*). That would optimize for latency but not throughput. More vector uops is even more of a downside if using 512-bit vectors.

    Handling the all-zero input case might need a branch with the compress strategy, unless we wanted to do a merge-masked sub into a vector of set1(256). 63-lzcnt only works for lzcnt in [0..63]; it would need -1 instead of 64 to get the result we want for the input=0 case. That wouldn't hurt instruction-level parallelism badly: the vplzcntq could still run in parallel with the compare-into-mask, and vpsubq would have to wait for both to be ready. And that's also separate from the compress reading the mask from the compare.
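
    A hedged sketch of that vplzcntq(v & -v) idea, using a plain store/reload plus a branch to pick the lowest non-zero element (those last two are my choices, not necessarily what's meant above; needs AVX2 + AVX-512VL + AVX-512CD, and the function name is made up):

    static inline
    int tzcnt_si256_avx512_blsi(__m256i vec)
    {
        __m256i lowbit = _mm256_and_si256(vec, _mm256_sub_epi64(_mm256_setzero_si256(), vec)); // v & -v per qword
        __m256i lz     = _mm256_lzcnt_epi64(lowbit);       // 63 - tzcnt for non-zero qwords, 64 for zero ones
        __m256i tz     = _mm256_sub_epi64(_mm256_set_epi64x(192+63, 128+63, 64+63, 63), lz);   // qword i: i*64 + tzcnt
        __mmask8 nz    = _mm256_test_epi64_mask(vec, vec);
        if (nz == 0)
            return 256;
        uint64_t elems[4];
        _mm256_storeu_si256((__m256i*)elems, tz);
        return (int)elems[_tzcnt_u32(nz)];                 // pick the lowest non-zero qword
    }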