Interleave bits efficiently
Would a short, precalculated array lookup count as a "mathematical trick"?
Precalculate an array of 256 uint16_t values, each holding one byte's bits spread out to the even positions:

static const uint16_t lookup[256] = {0x0000, 0x0001, 0x0004, 0x0005, ..., 0x5555};
We can then interleave two eight-bit values into a 16-bit value easily:
uint16_t interleave(uint8_t a, uint8_t b)
{
    // a's bits end up in the odd positions, b's in the even positions
    return (lookup[a] << 1) | lookup[b];
}
How to extend this to interleave two 32-bit values into a 64-bit value should be obvious: call this four times, once for each of the four bytes that make up a uint32_t, then shift (<<) and OR (|) the partial results together. Bribe the compiler into inlining the whole thing, and the end result should be fairly quick and cheap.
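For illustration, here is a minimal sketch of that extension, assuming the lookup[] table and interleave() helper above (the name interleave32 is mine, not from the original answer):

uint64_t interleave32(uint32_t a, uint32_t b)
{
    uint64_t result = 0;
    // Process byte pairs from most significant to least significant,
    // shifting earlier partial results up by 16 bits each round
    for (int i = 3; i >= 0; --i) {
        result = (result << 16)
               | interleave((uint8_t)(a >> (8 * i)), (uint8_t)(b >> (8 * i)));
    }
    return result;
}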
Since RAM is cheap these days, you might also want to consider a precalculated table of 65536 uint32_t values.
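Typing out 65536 entries is impractical, but the table can be generated at startup. A sketch, under the assumption that a one-time init call is acceptable (the names init_lookup32 and interleave32_wide are mine):

static uint32_t lookup32[65536];

static void init_lookup32(void)
{
    for (uint32_t i = 0; i < 65536; ++i) {
        uint32_t v = 0;
        for (int k = 0; k < 16; ++k)
            v |= ((i >> k) & 1u) << (2 * k);  // bit k of i -> bit 2k of v
        lookup32[i] = v;
    }
}

uint64_t interleave32_wide(uint32_t a, uint32_t b)
{
    // Two table lookups per input instead of four; a's bits go to the
    // odd positions, b's to the even, as in interleave() above
    return ((uint64_t)lookup32[a >> 16] << 33) | ((uint64_t)lookup32[b >> 16] << 32)
         | ((uint64_t)lookup32[a & 0xFFFF] << 1) | (uint64_t)lookup32[b & 0xFFFF];
}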
To make saolof's answer concrete, the following is an implementation using the CLMUL instruction set, interleaving two pairs of uint32_t values per call:
#include <immintrin.h>
#include <stdint.h>

typedef struct {
    uint32_t x;
    uint32_t y;
} uint32_2;

static inline void interleave_clmul(uint32_2 *input, uint64_t *out) {
    __m128i xy = _mm_load_si128((const __m128i *)input);
    // Gather the x's into the low qword and the y's into the high qword
    xy = _mm_shuffle_epi32(xy, 0b11011000);

    // Two carryless multiplies: carry-lessly squaring a value spreads its
    // bits out to the even positions, and the cross terms between the two
    // 32-bit halves of each qword cancel in GF(2)
    __m128i p2 = _mm_clmulepi64_si128(xy, xy, 0x11);
    __m128i p1 = _mm_clmulepi64_si128(xy, xy, 0x00);

    // Bitwise interleave the results; the 16-bit shift loses nothing
    // because all odd bit positions are zero
    p2 = _mm_slli_epi16(p2, 1);
    __m128i p3 = _mm_or_si128(p1, p2);
    _mm_storeu_si128((__m128i *)out, p3);
}
That compiles down to the following:
interleave_clmul(uint32_2*, unsigned long*):
vpshufd xmm0, xmmword ptr [rdi], 216 # xmm0 = mem[0,2,1,3]
vpclmulqdq xmm1, xmm0, xmm0, 17
vpclmulqdq xmm0, xmm0, xmm0, 0
vpaddw xmm1, xmm1, xmm1
vpor xmm0, xmm0, xmm1
vmovdqu xmmword ptr [rsi], xmm0
ret
Replace _mm_load_si128 with _mm_loadu_si128 if your data is not aligned; unaligned loads aren't much slower on x86 anyway. This approach beats the corresponding implementation with pdep instructions in throughput.
Naive
Total rdtsc: 1295559857, iterations: 1000, count: 10000
clmul-based
Total rdtsc: 17751716, iterations: 1000, count: 10000
pdep-based
Total rdtsc: 26123417, iterations: 1000, count: 10000
pdep-based unrolled
Total rdtsc: 24281811, iterations: 1000, count: 10000
Turbo boost was disabled; the CPU is a Kaby Lake with a 1.60 GHz base clock. Results seem consistent across runs. (Results on other architectures would be nice.) The (somewhat messy) testing code:
#include <stdio.h>
#include <inttypes.h>
#include <string.h>
#include <stdlib.h>
#include <immintrin.h>
// rdtscp
#include <x86intrin.h>

typedef struct uint32_2 {
    uint32_t x;
    uint32_t y;
} uint32_2;

uint32_2* generate_pairs(const int count) {
    uint32_2* p = malloc(count * sizeof(uint32_2));
    uint32_t r = 401923091;
#define RNG r *= 52308420; r += 2304;
    for (int i = 0; i < count; ++i) {
        p[i].x = r;
        RNG RNG
        p[i].y = r;
        RNG RNG
    }
    return p;
}

void interleave_naive(uint64_t* dst, uint32_2* src, int count) {
    for (int i = 0; i < count; ++i) {
        struct uint32_2 s = src[i];
        uint32_t x = s.x, y = s.y;
        uint64_t result = 0;
        for (int k = 0; k < 32; ++k) {
            if (x & ((uint32_t)1 << k)) {
                result |= (uint64_t)1 << (2 * k);
            }
            if (y & ((uint32_t)1 << k)) {
                result |= (uint64_t)1 << (2 * k + 1);
            }
        }
        dst[i] = result;
    }
}

void interleave_pdep(uint64_t* dst, uint32_2* src, int count) {
    for (int i = 0; i < count; ++i) {
        struct uint32_2 s = src[i];
        uint32_t x = s.x, y = s.y;
        uint64_t result = _pdep_u64(x, 0x5555555555555555) | _pdep_u64(y, 0xaaaaaaaaaaaaaaaa);
        dst[i] = result;
    }
}

void interleave_pdep_unrolled(uint64_t* dst, uint32_2* src, int count) {
    for (int i = 0; i < count; i += 2) {
        struct uint32_2 s1 = src[i];
        struct uint32_2 s2 = src[i + 1];
        uint32_t x1 = s1.x, y1 = s1.y;
        uint32_t x2 = s2.x, y2 = s2.y;
        uint64_t result1 = _pdep_u64(x1, 0x5555555555555555) | _pdep_u64(y1, 0xaaaaaaaaaaaaaaaa);
        uint64_t result2 = _pdep_u64(x2, 0x5555555555555555) | _pdep_u64(y2, 0xaaaaaaaaaaaaaaaa);
        dst[i] = result1;
        dst[i + 1] = result2;
    }
}

void interleave_clmul(uint64_t* dst, uint32_2* src, int count) {
    uint32_2* end = src + count;
    uint64_t* out = dst;
    // Processes two pairs per iteration; count must be even
    for (uint32_2* p = src; p < end; p += 2, out += 2) {
        __m128i xy = _mm_load_si128((const __m128i *) p);
        xy = _mm_shuffle_epi32(xy, 0b11011000);
        __m128i p2 = _mm_clmulepi64_si128(xy, xy, 0x11);
        __m128i p1 = _mm_clmulepi64_si128(xy, xy, 0x00);
        p2 = _mm_slli_epi16(p2, 1);
        __m128i p3 = _mm_or_si128(p1, p2);
        _mm_store_si128((__m128i *)out, p3);
    }
}

#define ITERATIONS 1000

void time_inv(uint32_2* pairs, int count, void (*interleave) (uint64_t*, uint32_2*, int)) {
    uint64_t* result = malloc(count * sizeof(uint64_t));
    uint64_t* reference_result = malloc(count * sizeof(uint64_t));
    interleave_naive(reference_result, pairs, count);

    // Induce page faults
    memset(result, 0, count * sizeof(uint64_t));

    unsigned _;
    int64_t start_rdtsc = __rdtscp(&_);
    for (int i = 0; i < ITERATIONS; ++i) {
        interleave(result, pairs, count);
    }
    int64_t end_rdtsc = __rdtscp(&_);

    // Verify against the naive implementation
    for (int i = 0; i < count; ++i) {
        if (reference_result[i] != result[i]) {
            fprintf(stderr, "Incorrect value at index %d; got %" PRIx64 ", should be %" PRIx64 "\n", i, result[i], reference_result[i]);
            abort();
        }
    }
    printf("Total rdtsc: %" PRId64 ", iterations: %d, count: %d\n", end_rdtsc - start_rdtsc, ITERATIONS, count);
    free(result);
    free(reference_result);
}

int main() {
    const int count = 10000;
    uint32_2* pairs = generate_pairs(count);

    printf("Naive\n");
    time_inv(pairs, count, interleave_naive);
    printf("clmul-based\n");
    time_inv(pairs, count, interleave_clmul);
    printf("pdep-based\n");
    time_inv(pairs, count, interleave_pdep);
    printf("pdep-based unrolled\n");
    time_inv(pairs, count, interleave_pdep_unrolled);

    free(pairs);
}
Compile with gcc bleh.c -o bleh -O2 -march=native.
Perf stats for each implementation are below. CLMUL manages about 1.5 cycles per pair, bottlenecked on port 5 contention from the two pclmulqdq and one vpshufd per iteration (two pairs), while the pdep implementations bottleneck on port 1, the only port pdep executes on, resulting in 2 cycles per pair:
clmul-based
Total rdtsc: 1774895925, total milliseconds: 985.575000, iterations: 100000, count: 10000
Performance counter stats for './interleave':
556,602,052 uops_dispatched_port.port_0 (49.87%)
1,556,592,314 cycles (49.86%)
469,021,017 uops_dispatched_port.port_1 (49.86%)
472,968,452 uops_dispatched_port.port_2 (50.08%)
519,804,531 uops_dispatched_port.port_3 (50.13%)
499,980,587 uops_dispatched_port.port_4 (50.14%)
1,509,928,584 uops_dispatched_port.port_5 (50.14%)
1,484,649,884 uops_dispatched_port.port_6 (49.92%)
pdep-based
Total rdtsc: 2588637876, total milliseconds: 1438.065000, iterations: 100000, count: 10000
Performance counter stats for './interleave':
745,844,862 uops_dispatched_port.port_0 (50.02%)
2,289,048,624 cycles (50.02%)
2,033,116,738 uops_dispatched_port.port_1 (50.02%)
1,508,870,090 uops_dispatched_port.port_2 (50.02%)
1,498,920,409 uops_dispatched_port.port_3 (49.98%)
1,056,089,339 uops_dispatched_port.port_4 (49.98%)
843,399,033 uops_dispatched_port.port_5 (49.98%)
1,414,062,891 uops_dispatched_port.port_6 (49.98%)
pdep-based unrolled
Total rdtsc: 2387027127, total milliseconds: 1325.857000, iterations: 100000, count: 10000
Performance counter stats for './interleave':
532,577,450 uops_dispatched_port.port_0 (49.64%)
2,099,782,071 cycles (49.94%)
2,004,347,972 uops_dispatched_port.port_1 (50.24%)
1,532,203,395 uops_dispatched_port.port_2 (50.54%)
1,467,988,364 uops_dispatched_port.port_3 (50.36%)
1,701,095,132 uops_dispatched_port.port_4 (50.06%)
543,597,866 uops_dispatched_port.port_5 (49.76%)
930,460,812 uops_dispatched_port.port_6 (49.46%)
NathanOliver's link offers the 16-bit -> 32-bit implementation:
static const unsigned int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF};
static const unsigned int S[] = {1, 2, 4, 8};

unsigned int x; // Interleave lower 16 bits of x and y, so the bits of x
unsigned int y; // are in the even positions and bits from y in the odd;
unsigned int z; // z gets the resulting 32-bit Morton Number.
                // x and y must initially be less than 65536.

x = (x | (x << S[3])) & B[3];
x = (x | (x << S[2])) & B[2];
x = (x | (x << S[1])) & B[1];
x = (x | (x << S[0])) & B[0];

y = (y | (y << S[3])) & B[3];
y = (y | (y << S[2])) & B[2];
y = (y | (y << S[1])) & B[1];
y = (y | (y << S[0])) & B[0];

z = x | (y << 1);
Which works by:
- leaving the low 8 bits of x where they are and moving the high 8 bits up by 8;
- dividing in half and doing the same thing, this time leaving the low groups of 4 bits where they are and moving the others up by 4;
- and again, and again.
I.e. it proceeds as:
0000 0000 0000 0000 abcd efgh ijkl mnop
-> 0000 0000 abcd efgh 0000 0000 ijkl mnop
-> 0000 abcd 0000 efgh 0000 ijkl 0000 mnop
-> 00ab 00cd 00ef 00gh 00ij 00kl 00mn 00op
-> 0a0b 0c0d 0e0f 0g0h 0i0j 0k0l 0m0n 0o0p
And then combines the two inputs together.
As per my earlier comment, to extend that to 64 bits, just add an initial shift by 16 and mask by 0x0000ffff0000ffff, either because you can intuitively follow the pattern or as a divide-and-conquer step, turning the 32-bit problem into two non-overlapping 16-bit problems and then using the 16-bit solution.
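Spelled out, that extension looks like the following sketch (the function name interleave_bits64 is mine; the masks follow the pattern above, widened to 64 bits):

uint64_t interleave_bits64(uint32_t x, uint32_t y)
{
    uint64_t a = x, b = y;

    // Same ladder as the 16-bit version, with one extra rung on top
    a = (a | (a << 16)) & 0x0000FFFF0000FFFF;
    a = (a | (a << 8))  & 0x00FF00FF00FF00FF;
    a = (a | (a << 4))  & 0x0F0F0F0F0F0F0F0F;
    a = (a | (a << 2))  & 0x3333333333333333;
    a = (a | (a << 1))  & 0x5555555555555555;

    b = (b | (b << 16)) & 0x0000FFFF0000FFFF;
    b = (b | (b << 8))  & 0x00FF00FF00FF00FF;
    b = (b | (b << 4))  & 0x0F0F0F0F0F0F0F0F;
    b = (b | (b << 2))  & 0x3333333333333333;
    b = (b | (b << 1))  & 0x5555555555555555;

    return a | (b << 1);  // x in the even positions, y in the odd
}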
For larger integers, it's worth mentioning the CLMUL x86 extension for finite field multiplication (carryless multiplication). Interleaving an integer with zeros is equivalent to a carryless multiplication of the integer with itself, which is a single ALU instruction: in a carryless square the cross terms appear twice and cancel in GF(2), so bit i of the input lands exactly at bit 2i of the product.
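A minimal scalar sketch of that trick for a single pair, assuming a CPU with PCLMULQDQ and compilation with -mpclmul (the helper names are mine):

#include <immintrin.h>
#include <stdint.h>

// Spread the 32 bits of v to the even bit positions of a uint64_t
// by carry-lessly squaring it
static inline uint64_t spread_clmul(uint32_t v)
{
    __m128i a  = _mm_cvtsi32_si128((int)v);
    __m128i sq = _mm_clmulepi64_si128(a, a, 0x00);  // low qword times itself
    return (uint64_t)_mm_cvtsi128_si64(sq);
}

static inline uint64_t interleave_clmul_scalar(uint32_t x, uint32_t y)
{
    return spread_clmul(x) | (spread_clmul(y) << 1);
}

The vectorized implementation shown earlier amortizes the shuffle and produces two results per call, which is where the throughput advantage comes from.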