I want to create a wrapper around the x86 instructions PDEP (Parallel Bit Deposit) and PEXT (Parallel Bit Extract). On architectures where these aren't available (and the corresponding intrinsics aren't either), I need a fast fallback implementation.
The naive algorithms for 32-bit integers looks as follows:
constexpr std::uint32_t bit_deposit(std::uint32_t src, std::uint32_t mask) {
std::uint32_t result = 0;
for (std::uint32_t src_pos = 0, mask_pos = 0; mask_pos != 32; ++mask_pos) {
if (mask >> mask_pos & 1) {
result |= (src >> src_pos++ & 1) << mask_pos;
}
}
return result;
}
static_assert(bit_deposit(0b000, 0b000000) == 0b000000);
static_assert(bit_deposit(0b101, 0b101010) == 0b100010);
static_assert(bit_deposit(0b111, 0b101010) == 0b101010);
constexpr std::uint32_t bit_extract(std::uint32_t src, std::uint32_t mask) {
std::uint32_t result = 0;
for (std::uint32_t src_pos = 0, mask_pos = 0; mask_pos != 32; ++mask_pos) {
if (mask >> mask_pos & 1) {
result |= (src >> mask_pos & 1) << src_pos++;
}
}
return result;
}
static_assert(bit_extract(0b000000, 0b000000) == 0b000);
static_assert(bit_extract(0b100010, 0b101010) == 0b101);
static_assert(bit_extract(0b101010, 0b101010) == 0b111);
Is there a way that you can significantly improve this? The current algorithm is O(n), and I suspect there is a O(log n) solution.
For a lot of other bit manipulation ops, there is logarithmic algorithm (multi-bit shift from single-bit shift, populatin count, round to next power of two, etc.). Therefore, I suspect that something similar exists for bit-deposit and bit-extract.
In Hacker's Delight chapter 7, rearranging bits and bytes, there is this implementation for expand:
unsigned expand(unsigned x, unsigned m) {
unsigned m0, mk, mp, mv, t;
unsigned array[5];
int i;
m0 = m; // Save original mask.
mk = ~m << 1; // We will count 0's to right.
for (i = 0; i < 5; i++) {
mp = mk ^ (mk << 1); // Parallel suffix.
mp = mp ^ (mp << 2);
mp = mp ^ (mp << 4);
mp = mp ^ (mp << 8);
mp = mp ^ (mp << 16);
mv = mp & m; // Bits to move.
array[i] = mv;
m = (m ^ mv) | (mv >> (1 << i)); // Compress m.
mk = mk & ~mp;
}
for (i = 4; i >= 0; i--) {
mv = array[i];
t = x << (1 << i);
x = (x & ~mv) | (t & mv);
}
return x & m0; // Clear out extraneous bits.
}
And this implementation for compress:
unsigned compress(unsigned x, unsigned m) {
unsigned mk, mp, mv, t;
int i;
x = x & m; // Clear irrelevant bits.
mk = ~m << 1; // We will count 0's to right.
for (i = 0; i < 5; i++) {
mp = mk ^ (mk << 1); // Parallel suffix.
mp = mp ^ (mp << 2);
mp = mp ^ (mp << 4);
mp = mp ^ (mp << 8);
mp = mp ^ (mp << 16);
mv = mp & m; // Bits to move.
m = m ^ mv | (mv >> (1 << i)); // Compress m.
t = x & mv;
x = x ^ t | (t >> (1 << i)); // Compress x.
mk = mk & ~mp;
}
return x;
}
These doesn't make k
iterations for a k
-bit integer, but I don't know whether these are actually a good way to do it. I've heard a rumour that you can do better, but I haven't heard how, maybe it's not even true.