Fastest way to compare bitsets (< operator on bitsets)?
The obvious optimization would be
/* Orders two equally sized bitsets as unsigned integers, treating bit N-1 as
   the most significant bit. Returns true only when x is strictly smaller. */
template<std::size_t N>
bool operator<(const std::bitset<N>& x, const std::bitset<N>& y)
{
    // Scan downwards from the top bit; the first position where the two
    // bitsets disagree decides the ordering.
    int bit = N - 1;
    while (bit >= 0) {
        const bool xBit = x[bit];
        const bool yBit = y[bit];
        if (xBit != yBit) {
            // x is smaller exactly when y is the one holding the set bit.
            return yBit;
        }
        --bit;
    }
    // Every bit matched: the operands are equal, so x is not less than y.
    return false;
}
Other than that, it is essentially impossible to test more than one bit per comparison, as there is no standard-conforming way to access the underlying storage words. You could benchmark x.to_string() < y.to_string()
and hope for both to_string()
and string comparison to be optimized better than bitwise access to a bitset
, but that's a long shot.
I just looked at the source code, but unfortunately (unless, hopefully, I am mistaken), they don't seem to give you in-place access to a const & unsigned long
for a particular block of bits. If they did, then you could perform template recursion, and effectively compare each unsigned long
rather than each bit in an unsigned long.
After all, if A < B
, then not only must a < b hold at the most significant bit where the two differ
, but A[i] < B[i] must also hold at the most significant block where the two differ
.
I hate to say it, but I would probably roll my own using recursion on C++11's std::array
. If you have access to the blocks, then you can make a template recursive function to do this pretty easily (and as I'm sure you know since you're asking for metaprogramming) give the compiler a great chance to optimize.
All in all, not a great answer, but that's what I would do.
Excellent question, by the way.
===========
EDIT
This should time three approaches: the one with the most current upvotes, the block strategy I described, and a template recursive variant. I fill a vector with bitsets and then sort repeatedly using the specified comparator functor.
Happy hacking!
Output on my computer:
Compiled with g++ -std=c++11 -Wall -g test.cpp
RUNTIMES:
std::bitset 4530000 (6000000 original in OP)
Block-by-block 900000
Template recursive 730000
Compiled with g++ -std=c++11 -Wall -g -O3 test.cpp
RUNTIMES:
std::bitset 700000 (740000 original in OP)
Block-by-block 470000
Template recursive 530000
C++11 code:
#include <time.h>

#include <algorithm>
#include <array>
#include <bitset>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>
/* Existing answer, packaged as a comparison functor. Note that the order of
   bit significance is flipped relative to that answer: index 0 is treated as
   the MOST significant bit, to match the ordering used by my own bit set. */
template<std::size_t N>
class BitByBitComparator
{
public:
    /* Returns true when x orders before y: at the lowest index where the two
       bitsets differ, y has the bit set and x does not. Equal bitsets yield
       false, as a strict ordering requires. */
    bool operator()(const std::bitset<N>& x, const std::bitset<N>& y) const
    {
        // The index is std::size_t because N is: comparing an int against N
        // mixes signedness and warns under -Wall.
        for (std::size_t i = 0; i < N; ++i) {
            if (x[i] != y[i]) return y[i];
        }
        return false;
    }
};
/* New simple bit set class (note: mostly untested). Items are packed into
   64-bit blocks; item 0 occupies the most significant bit of block 0, so
   comparing blocks in index order as unsigned integers orders two sets by the
   lowest item index at which they differ.
   Also note bad design: allBlocks is public and mutable; callers should only
   be allowed read access via an immutable facade. */
template<std::size_t N>
class SimpleBitSet
{
public:
    static const int BLOCK_SIZE = 64;
    static const int LOG_BLOCK_SIZE = 6;
    /* Number of blocks needed for N items, rounded UP so that an N which is
       not a multiple of BLOCK_SIZE still gets a block for its trailing items
       (rounding down made addItem/getItem index past the end of allBlocks). */
    static constexpr int NUM_BLOCKS = static_cast<int>((N + BLOCK_SIZE - 1) >> LOG_BLOCK_SIZE);

    /* The masks below need all 64 bits of a block. Where unsigned long is
       narrower they would silently truncate, so refuse to compile instead. */
    static_assert(static_cast<unsigned long int>(0x8000000000000000ULL) == 0x8000000000000000ULL,
                  "SimpleBitSet requires a 64-bit unsigned long int");

    std::array<unsigned long int, NUM_BLOCKS> allBlocks;

    /* Creates an empty set (every block zeroed). */
    SimpleBitSet() : allBlocks()
    {
    }

    /* Adds itemIndex to the set. Precondition (unchecked): 0 <= itemIndex < N. */
    void addItem(int itemIndex)
    {
        // TODO: can do faster
        const unsigned long int topBit = static_cast<unsigned long int>(0x8000000000000000ULL);
        const int blockIndex = itemIndex >> LOG_BLOCK_SIZE;
        const int indexWithinBlock = itemIndex % BLOCK_SIZE;
        // Items fill each block from its most significant bit downwards.
        allBlocks[blockIndex] |= (topBit >> indexWithinBlock);
    }

    /* Returns whether itemIndex is in the set. Precondition (unchecked):
       0 <= itemIndex < N. */
    bool getItem(int itemIndex) const
    {
        const unsigned long int topBit = static_cast<unsigned long int>(0x8000000000000000ULL);
        const int blockIndex = itemIndex >> LOG_BLOCK_SIZE;
        const int indexWithinBlock = itemIndex % BLOCK_SIZE;
        // Shift the requested bit up into the top position, then test it.
        return ((allBlocks[blockIndex] << indexWithinBlock) & topBit) != 0;
    }
};
/* New comparator type 1: block-by-block. Orders two SimpleBitSets by comparing
   their blocks as unsigned integers, starting at block 0, using a plain loop. */
template<std::size_t N>
class BlockByBlockComparator
{
public:
    /* Returns true when x orders strictly before y. */
    bool operator()(const SimpleBitSet<N>& x, const SimpleBitSet<N>& y) const
    {
        return ArrayCompare(x.allBlocks, y.allBlocks);
    }

    /* Lexicographic "less than" over two block arrays of equal length S:
       the first index at which the blocks differ decides the result, and
       identical arrays compare false. */
    template <std::size_t S>
    bool ArrayCompare(const std::array<unsigned long int, S> & lhs, const std::array<unsigned long int, S> & rhs) const
    {
        // std::size_t index: S is unsigned, and an int index compared against
        // it mixes signedness and warns under -Wall.
        for (std::size_t i = 0; i < S; ++i)
        {
            if (lhs[i] != rhs[i]) return lhs[i] < rhs[i];
        }
        return false;
    }
};
/* New comparator type 2: template recursive block-by-block.
   TemplateRecursiveArrayCompare<I, S> answers "is lhs < rhs?" looking only at
   blocks I .. S-1 of two arrays of S blocks; the recursion is unrolled at
   compile time, one instantiation per block. */
template <std::size_t I, std::size_t S>
class TemplateRecursiveArrayCompare
{
public:
    bool operator()(const std::array<unsigned long int, S> & lhs, const std::array<unsigned long int, S> & rhs) const
    {
        typedef TemplateRecursiveArrayCompare<I + 1, S> CompareRemainingBlocks;
        // Equal blocks defer to the next index; otherwise this block decides.
        return (lhs[I] == rhs[I]) ? CompareRemainingBlocks()(lhs, rhs)
                                  : (lhs[I] < rhs[I]);
    }
};

/* Recursion terminator (I == S): no blocks are left to inspect, so the arrays
   were equal over the compared range and lhs is not less than rhs. */
template <std::size_t S>
class TemplateRecursiveArrayCompare<S, S>
{
public:
    bool operator()(const std::array<unsigned long int, S> &, const std::array<unsigned long int, S> &) const
    {
        return false;
    }
};
template<std::size_t N>
class TemplateRecursiveBlockByBlockComparator
{
public:
bool operator()(const SimpleBitSet<N>& x, const SimpleBitSet<N>& y) const
{
return TemplateRecursiveArrayCompare<x.NUM_BLOCKS, x.NUM_BLOCKS>()(x.allBlocks, y.allBlocks);
}
};
/* Construction, timing, and verification code.
   Builds NUMBER_TO_PROCESS random bit sets in both representations, times
   NUMBER_REPS sorts of each with the three comparators (clock() ticks), then
   checks that all three comparators produce the same sorted sequence.
   Returns 0 on success and 1 if the sorted orders disagree. */
int main()
{
    srand(0); // fixed seed: every run sorts the same data
    const int BITSET_SIZE = 4096;
    std::cout << "Constructing..." << std::endl;
    // Fill a vector with random bitsets
    const int NUMBER_TO_PROCESS = 10000;
    const int SAMPLES_TO_FILL = BITSET_SIZE;
    std::vector<std::bitset<BITSET_SIZE> > allBitSets(NUMBER_TO_PROCESS);
    std::vector<SimpleBitSet<BITSET_SIZE> > allSimpleBitSets(NUMBER_TO_PROCESS);
    for (int k=0; k<NUMBER_TO_PROCESS; ++k)
    {
        std::bitset<BITSET_SIZE> bs;
        SimpleBitSet<BITSET_SIZE> homemadeBs;
        for (int j=0; j<SAMPLES_TO_FILL; ++j)
        {
            // The same random index goes into both representations so the
            // two vectors describe identical sets.
            int indexToAdd = rand()%BITSET_SIZE;
            bs[indexToAdd] = true;
            homemadeBs.addItem(indexToAdd);
        }
        allBitSets[k] = bs;
        allSimpleBitSets[k] = homemadeBs;
    }

    clock_t t1,t2,t3,t4;
    t1=clock();
    std::cout << "Sorting using bit-by-bit compare and std::bitset..." << std::endl;
    const int NUMBER_REPS = 100;
    for (int rep = 0; rep<NUMBER_REPS; ++rep)
    {
        // Sort a fresh copy each time so every repetition sees unsorted input.
        auto tempCopy = allBitSets;
        std::sort(tempCopy.begin(), tempCopy.end(), BitByBitComparator<BITSET_SIZE>());
    }
    t2=clock();
    std::cout << "Sorting block-by-block using SimpleBitSet..." << std::endl;
    for (int rep = 0; rep<NUMBER_REPS; ++rep)
    {
        auto tempCopy = allSimpleBitSets;
        std::sort(tempCopy.begin(), tempCopy.end(), BlockByBlockComparator<BITSET_SIZE>());
    }
    t3=clock();
    std::cout << "Sorting block-by-block w/ template recursion using SimpleBitSet..." << std::endl;
    for (int rep = 0; rep<NUMBER_REPS; ++rep)
    {
        auto tempCopy = allSimpleBitSets;
        std::sort(tempCopy.begin(), tempCopy.end(), TemplateRecursiveBlockByBlockComparator<BITSET_SIZE>());
    }
    t4=clock();

    std::cout << std::endl << "RUNTIMES:" << std::endl;
    std::cout << "\tstd::bitset \t" << t2-t1 << std::endl;
    std::cout << "\tBlock-by-block \t" << t3-t2 << std::endl;
    std::cout << "\tTemplate recursive \t" << t4-t3 << std::endl;
    std::cout << std::endl;

    std::cout << "Checking result... ";
    std::sort(allBitSets.begin(), allBitSets.end(), BitByBitComparator<BITSET_SIZE>());
    auto copy = allSimpleBitSets;
    std::sort(allSimpleBitSets.begin(), allSimpleBitSets.end(), BlockByBlockComparator<BITSET_SIZE>());
    std::sort(copy.begin(), copy.end(), TemplateRecursiveBlockByBlockComparator<BITSET_SIZE>());

    bool sortedOrdersMatch = true;
    for (int k=0; k<NUMBER_TO_PROCESS && sortedOrdersMatch; ++k)
    {
        // References avoid copying 4096-bit sets on every iteration.
        const auto & stdBitSet = allBitSets[k];
        const auto & blockBitSet = allSimpleBitSets[k];
        // Read the vector sorted by the template-recursive comparator;
        // reading allSimpleBitSets again would compare a set with itself.
        const auto & tempRecBlockBitSet = copy[k];
        for (int j=0; j<BITSET_SIZE; ++j)
        {
            if (stdBitSet[j] != blockBitSet.getItem(j) || blockBitSet.getItem(j) != tempRecBlockBitSet.getItem(j))
            {
                sortedOrdersMatch = false;
                break;
            }
        }
    }

    // Report once, and never claim success after a mismatch.
    if (!sortedOrdersMatch)
    {
        std::cerr << "error: sorted order does not match" << std::endl;
        return 1;
    }
    std::cout << "success" << std::endl;
    return 0;
}
Though you say bit set, aren't you really talking about arbitrary-precision unsigned integer comparison? If so, then you're probably not going to easily do better than wrapping GMP.
From their website:
GMP is carefully designed to be as fast as possible, both for small operands and for huge operands. The speed is achieved by using fullwords as the basic arithmetic type, by using fast algorithms, with highly optimised assembly code for the most common inner loops for a lot of CPUs, and by a general emphasis on speed.
Consider their integer functions
If you are willing to adapt the solution whenever the STL bitset implementation changes, you may use
/* Returns true when l is numerically smaller than r, treating bit n-1 as the
   most significant bit.

   Bitsets of up to 64 bits are compared through to_ullong(). Larger ones are
   compared one 64-bit word at a time by copying out the object representation.
   NOTE(review): the word path assumes the library stores the bits as an array
   of 64-bit words with word 0 holding bits 0..63. The standard does not
   guarantee this layout, so confirm it for your standard library and re-check
   whenever the bitset implementation changes. If the object size does not
   match that layout, a portable bit-by-bit scan is used instead.

   All three conditions are compile-time constants, so the compiler can discard
   the branches that do not apply to a given n. */
template<std::size_t n>
bool compare(const std::bitset<n>& l, const std::bitset<n>& r){
    typedef unsigned long long Word;
    const std::size_t kWordBits = 64;
    // Round up: a trailing partial word still carries significant bits.
    const std::size_t kWords = (n + kWordBits - 1) / kWordBits;

    if(n <= kWordBits){
        // Everything fits in one unsigned long long, so this cannot overflow.
        return l.to_ullong() < r.to_ullong();
    }

    const bool wordLayoutLooksUsable =
        sizeof(Word) * 8 == kWordBits &&
        sizeof(unsigned long) == sizeof(Word) &&
        sizeof(std::bitset<n>) == kWords * sizeof(Word);
    if(wordLayoutLooksUsable){
        // memcpy rather than a pointer cast: it reads the bytes without
        // aliasing the bitset as an unrelated type.
        std::array<Word, kWords> lWords, rWords;
        std::memcpy(lWords.data(), &l, kWords * sizeof(Word));
        std::memcpy(rWords.data(), &r, kWords * sizeof(Word));
        // Most significant word first, compared as UNSIGNED values.
        for(std::size_t i = kWords; i-- > 0; ){
            if(lWords[i] != rWords[i]) return lWords[i] < rWords[i];
        }
        return false;
    }

    // Portable fallback: scan from the most significant bit downwards.
    for(std::size_t i = n; i-- > 0; ){
        if(l[i] != r[i]) return r[i];
    }
    return false;
}
The compiler discards the irrelevant branch of the if, because n is a compile-time constant.