Interleave bits efficiently
Would a short, precalculated array lookup count as a "mathematical trick"?
Precalculate an array of 256 uint16_t values, each holding one byte's bits spread out to the even positions:

static const uint16_t lookup[256] = {0x0000, 0x0001, 0x0004, 0x0005, ..., 0x5555};
We can then interleave two eight-bit values into a 16-bit value easily:
uint16_t interleave(uint8_t a, uint8_t b)
{
    // a's bits end up in the odd positions, b's in the even positions
    return (lookup[a] << 1) | lookup[b];
}
How to extend this to interleave two 32-bit values into a 64-bit value should be obvious: call this four times, once for each of the four bytes that make up a uint32_t, then shift (<<) and OR (|) the partial results together. Bribe the compiler into inlining the whole thing, and the end result should be fairly quick and cheap.
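For illustration, here is a minimal sketch of that extension, assuming the lookup[] table and interleave() helper above (the name interleave32 is mine, not from the original answer):

uint64_t interleave32(uint32_t a, uint32_t b)
{
    uint64_t result = 0;
    // Process byte pairs from most significant to least significant,
    // shifting earlier partial results up by 16 bits each round
    for (int i = 3; i >= 0; --i) {
        result = (result << 16)
               | interleave((uint8_t)(a >> (8 * i)), (uint8_t)(b >> (8 * i)));
    }
    return result;
}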
Since RAM is cheap these days, you might also want to consider a precalculated table of 65536 uint32_t values.
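Typing out 65536 entries is impractical, but the table can be generated at startup. A sketch, under the assumption that a one-time init call is acceptable (the names init_lookup32 and interleave32_wide are mine):

static uint32_t lookup32[65536];

static void init_lookup32(void)
{
    for (uint32_t i = 0; i < 65536; ++i) {
        uint32_t v = 0;
        for (int k = 0; k < 16; ++k)
            v |= ((i >> k) & 1u) << (2 * k);  // bit k of i -> bit 2k of v
        lookup32[i] = v;
    }
}

uint64_t interleave32_wide(uint32_t a, uint32_t b)
{
    // Two table lookups per input instead of four; a's bits go to the
    // odd positions, b's to the even, as in interleave() above
    return ((uint64_t)lookup32[a >> 16] << 33) | ((uint64_t)lookup32[b >> 16] << 32)
         | ((uint64_t)lookup32[a & 0xFFFF] << 1) | (uint64_t)lookup32[b & 0xFFFF];
}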
To make saolof's answer concrete, the following is an implementation using the CLMUL instruction set, interleaving two pairs of uint32_t values per call:
#include <immintrin.h>
#include <stdint.h>

typedef struct {
    uint32_t x;
    uint32_t y;
} uint32_2;

static inline void interleave_clmul(uint32_2 *input, uint64_t *out) {
    __m128i xy = _mm_load_si128((const __m128i *)input);
    // Gather the x's into the low qword and the y's into the high qword
    xy = _mm_shuffle_epi32(xy, 0b11011000);

    // Two carryless multiplies: carry-lessly squaring a value spreads its
    // bits out to the even positions, and the cross terms between the two
    // 32-bit halves of each qword cancel in GF(2)
    __m128i p2 = _mm_clmulepi64_si128(xy, xy, 0x11);
    __m128i p1 = _mm_clmulepi64_si128(xy, xy, 0x00);

    // Bitwise interleave the results; the 16-bit shift loses nothing
    // because all odd bit positions are zero
    p2 = _mm_slli_epi16(p2, 1);
    __m128i p3 = _mm_or_si128(p1, p2);
    _mm_storeu_si128((__m128i *)out, p3);
}
That compiles down to the following:
interleave_clmul(uint32_2*, unsigned long*):
vpshufd xmm0, xmmword ptr [rdi], 216 # xmm0 = mem[0,2,1,3]
vpclmulqdq xmm1, xmm0, xmm0, 17
vpclmulqdq xmm0, xmm0, xmm0, 0
vpaddw xmm1, xmm1, xmm1
vpor xmm0, xmm0, xmm1
vmovdqu xmmword ptr [rsi], xmm0
ret
Replace _mm_load_si128 with _mm_loadu_si128 if your data is not aligned; unaligned loads aren't much slower on x86 anyway. This approach beats the corresponding implementation with pdep instructions in throughput.
Naive
Total rdtsc: 1295559857, iterations: 1000, count: 10000
clmul-based
Total rdtsc: 17751716, iterations: 1000, count: 10000
pdep-based
Total rdtsc: 26123417, iterations: 1000, count: 10000
pdep-based unrolled
Total rdtsc: 24281811, iterations: 1000, count: 10000
Turbo boost was disabled; the CPU is a Kaby Lake with a 1.60 GHz base clock. Results seem consistent across runs. (Results on other architectures would be nice.) The (somewhat messy) testing code:
#include <stdio.h>
#include <inttypes.h>
#include <string.h>
#include <stdlib.h>
#include <immintrin.h>
// rdtscp
#include <x86intrin.h>

typedef struct uint32_2 {
    uint32_t x;
    uint32_t y;
} uint32_2;

uint32_2* generate_pairs(const int count) {
    uint32_2* p = malloc(count * sizeof(uint32_2));
    uint32_t r = 401923091;
#define RNG r *= 52308420; r += 2304;
    for (int i = 0; i < count; ++i) {
        p[i].x = r;
        RNG RNG
        p[i].y = r;
        RNG RNG
    }
    return p;
}

void interleave_naive(uint64_t* dst, uint32_2* src, int count) {
    for (int i = 0; i < count; ++i) {
        struct uint32_2 s = src[i];
        uint32_t x = s.x, y = s.y;
        uint64_t result = 0;
        for (int k = 0; k < 32; ++k) {
            if (x & ((uint32_t)1 << k)) {
                result |= (uint64_t)1 << (2 * k);
            }
            if (y & ((uint32_t)1 << k)) {
                result |= (uint64_t)1 << (2 * k + 1);
            }
        }
        dst[i] = result;
    }
}

void interleave_pdep(uint64_t* dst, uint32_2* src, int count) {
    for (int i = 0; i < count; ++i) {
        struct uint32_2 s = src[i];
        uint32_t x = s.x, y = s.y;
        uint64_t result = _pdep_u64(x, 0x5555555555555555) | _pdep_u64(y, 0xaaaaaaaaaaaaaaaa);
        dst[i] = result;
    }
}

void interleave_pdep_unrolled(uint64_t* dst, uint32_2* src, int count) {
    for (int i = 0; i < count; i += 2) {
        struct uint32_2 s1 = src[i];
        struct uint32_2 s2 = src[i + 1];
        uint32_t x1 = s1.x, y1 = s1.y;
        uint32_t x2 = s2.x, y2 = s2.y;
        uint64_t result1 = _pdep_u64(x1, 0x5555555555555555) | _pdep_u64(y1, 0xaaaaaaaaaaaaaaaa);
        uint64_t result2 = _pdep_u64(x2, 0x5555555555555555) | _pdep_u64(y2, 0xaaaaaaaaaaaaaaaa);
        dst[i] = result1;
        dst[i + 1] = result2;
    }
}

void interleave_clmul(uint64_t* dst, uint32_2* src, int count) {
    uint32_2* end = src + count;
    uint64_t* out = dst;
    // Processes two pairs per iteration; count must be even
    for (uint32_2* p = src; p < end; p += 2, out += 2) {
        __m128i xy = _mm_load_si128((const __m128i *) p);
        xy = _mm_shuffle_epi32(xy, 0b11011000);
        __m128i p2 = _mm_clmulepi64_si128(xy, xy, 0x11);
        __m128i p1 = _mm_clmulepi64_si128(xy, xy, 0x00);
        p2 = _mm_slli_epi16(p2, 1);
        __m128i p3 = _mm_or_si128(p1, p2);
        _mm_store_si128((__m128i *)out, p3);
    }
}

#define ITERATIONS 1000

void time_inv(uint32_2* pairs, int count, void (*interleave) (uint64_t*, uint32_2*, int)) {
    uint64_t* result = malloc(count * sizeof(uint64_t));
    uint64_t* reference_result = malloc(count * sizeof(uint64_t));
    interleave_naive(reference_result, pairs, count);

    // Induce page faults
    memset(result, 0, count * sizeof(uint64_t));

    unsigned _;
    int64_t start_rdtsc = __rdtscp(&_);
    for (int i = 0; i < ITERATIONS; ++i) {
        interleave(result, pairs, count);
    }
    int64_t end_rdtsc = __rdtscp(&_);

    // Verify against the naive implementation
    for (int i = 0; i < count; ++i) {
        if (reference_result[i] != result[i]) {
            fprintf(stderr, "Incorrect value at index %d; got %" PRIx64 ", should be %" PRIx64 "\n", i, result[i], reference_result[i]);
            abort();
        }
    }
    printf("Total rdtsc: %" PRId64 ", iterations: %d, count: %d\n", end_rdtsc - start_rdtsc, ITERATIONS, count);
    free(result);
    free(reference_result);
}

int main() {
    const int count = 10000;
    uint32_2* pairs = generate_pairs(count);

    printf("Naive\n");
    time_inv(pairs, count, interleave_naive);
    printf("clmul-based\n");
    time_inv(pairs, count, interleave_clmul);
    printf("pdep-based\n");
    time_inv(pairs, count, interleave_pdep);
    printf("pdep-based unrolled\n");
    time_inv(pairs, count, interleave_pdep_unrolled);

    free(pairs);
}
Compile with gcc bleh.c -o bleh -O2 -march=native.
Perf stats for each implementation are below. CLMUL manages about 1.5 cycles per pair, bottlenecked on port 5 contention from the two pclmulqdq and one vpshufd per iteration (two pairs), while the pdep implementations bottleneck on port 1, the only port pdep executes on, resulting in 2 cycles per pair:
clmul-based
Total rdtsc: 1774895925, total milliseconds: 985.575000, iterations: 100000, count: 10000
Performance counter stats for './interleave':
556,602,052 uops_dispatched_port.port_0 (49.87%)
1,556,592,314 cycles (49.86%)
469,021,017 uops_dispatched_port.port_1 (49.86%)
472,968,452 uops_dispatched_port.port_2 (50.08%)
519,804,531 uops_dispatched_port.port_3 (50.13%)
499,980,587 uops_dispatched_port.port_4 (50.14%)
1,509,928,584 uops_dispatched_port.port_5 (50.14%)
1,484,649,884 uops_dispatched_port.port_6 (49.92%)
pdep-based
Total rdtsc: 2588637876, total milliseconds: 1438.065000, iterations: 100000, count: 10000
Performance counter stats for './interleave':
745,844,862 uops_dispatched_port.port_0 (50.02%)
2,289,048,624 cycles (50.02%)
2,033,116,738 uops_dispatched_port.port_1 (50.02%)
1,508,870,090 uops_dispatched_port.port_2 (50.02%)
1,498,920,409 uops_dispatched_port.port_3 (49.98%)
1,056,089,339 uops_dispatched_port.port_4 (49.98%)
843,399,033 uops_dispatched_port.port_5 (49.98%)
1,414,062,891 uops_dispatched_port.port_6 (49.98%)
pdep-based unrolled
Total rdtsc: 2387027127, total milliseconds: 1325.857000, iterations: 100000, count: 10000
Performance counter stats for './interleave':
532,577,450 uops_dispatched_port.port_0 (49.64%)
2,099,782,071 cycles (49.94%)
2,004,347,972 uops_dispatched_port.port_1 (50.24%)
1,532,203,395 uops_dispatched_port.port_2 (50.54%)
1,467,988,364 uops_dispatched_port.port_3 (50.36%)
1,701,095,132 uops_dispatched_port.port_4 (50.06%)
543,597,866 uops_dispatched_port.port_5 (49.76%)
930,460,812 uops_dispatched_port.port_6 (49.46%)
NathanOliver's link offers the 16-bit -> 32-bit implementation:
static const unsigned int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF};
static const unsigned int S[] = {1, 2, 4, 8};

unsigned int x; // Interleave lower 16 bits of x and y, so the bits of x
unsigned int y; // are in the even positions and bits from y in the odd;
unsigned int z; // z gets the resulting 32-bit Morton Number.
                // x and y must initially be less than 65536.

x = (x | (x << S[3])) & B[3];
x = (x | (x << S[2])) & B[2];
x = (x | (x << S[1])) & B[1];
x = (x | (x << S[0])) & B[0];

y = (y | (y << S[3])) & B[3];
y = (y | (y << S[2])) & B[2];
y = (y | (y << S[1])) & B[1];
y = (y | (y << S[0])) & B[0];

z = x | (y << 1);
Which works by:
- leaving the low 8 bits of x where they are and moving the high 8 bits up by 8;
- dividing in half and doing the same thing, this time leaving the low groups of 4 bits where they are and moving the others up by 4;
- and again, and again.
I.e. it proceeds as:
0000 0000 0000 0000 abcd efgh ijkl mnop
-> 0000 0000 abcd efgh 0000 0000 ijkl mnop
-> 0000 abcd 0000 efgh 0000 ijkl 0000 mnop
-> 00ab 00cd 00ef 00gh 00ij 00kl 00mn 00op
-> 0a0b 0c0d 0e0f 0g0h 0i0j 0k0l 0m0n 0o0p
And then combines the two inputs together.
As per my earlier comment, to extend that to 64 bits, just add an initial shift by 16 and mask by 0x0000ffff0000ffff, either because you can intuitively follow the pattern or as a divide-and-conquer step, turning the 32-bit problem into two non-overlapping 16-bit problems and then using the 16-bit solution.
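Spelled out, that extension looks like the following sketch (the function name interleave_bits64 is mine; the masks follow the pattern above, widened to 64 bits):

uint64_t interleave_bits64(uint32_t x, uint32_t y)
{
    uint64_t a = x, b = y;

    // Same ladder as the 16-bit version, with one extra rung on top
    a = (a | (a << 16)) & 0x0000FFFF0000FFFF;
    a = (a | (a << 8))  & 0x00FF00FF00FF00FF;
    a = (a | (a << 4))  & 0x0F0F0F0F0F0F0F0F;
    a = (a | (a << 2))  & 0x3333333333333333;
    a = (a | (a << 1))  & 0x5555555555555555;

    b = (b | (b << 16)) & 0x0000FFFF0000FFFF;
    b = (b | (b << 8))  & 0x00FF00FF00FF00FF;
    b = (b | (b << 4))  & 0x0F0F0F0F0F0F0F0F;
    b = (b | (b << 2))  & 0x3333333333333333;
    b = (b | (b << 1))  & 0x5555555555555555;

    return a | (b << 1);  // x in the even positions, y in the odd
}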
For larger integers, it's worth mentioning the CLMUL x86 extension for finite field multiplication (carryless multiplication). Interleaving an integer with zeros is equivalent to a carryless multiplication of the integer with itself, which is a single ALU instruction: in a carryless square the cross terms appear twice and cancel in GF(2), so bit i of the input lands exactly at bit 2i of the product.
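A minimal scalar sketch of that trick for a single pair, assuming a CPU with PCLMULQDQ and compilation with -mpclmul (the helper names are mine):

#include <immintrin.h>
#include <stdint.h>

// Spread the 32 bits of v to the even bit positions of a uint64_t
// by carry-lessly squaring it
static inline uint64_t spread_clmul(uint32_t v)
{
    __m128i a  = _mm_cvtsi32_si128((int)v);
    __m128i sq = _mm_clmulepi64_si128(a, a, 0x00);  // low qword times itself
    return (uint64_t)_mm_cvtsi128_si64(sq);
}

static inline uint64_t interleave_clmul_scalar(uint32_t x, uint32_t y)
{
    return spread_clmul(x) | (spread_clmul(y) << 1);
}

The vectorized implementation shown earlier amortizes the shuffle and produces two results per call, which is where the throughput advantage comes from.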