Calculate the permanent as quickly as possible
gcc C++ n ≈ 36 (57 seconds on my system)
Uses Glynn formula with a Gray code for updates if all column sums are even, otherwise uses Ryser's method. Threaded and vectorized. Optimized for AVX, so don't expect much on older processors. Don't bother with n>=35
for a matrix with only +1's even if your system is fast enough since the signed 128 bit accumulator will overflow. For random matrices you will probably not hit the overflow. For n>=37
the internal multipliers will start to overflow for an all 1/-1
matrix. So only use this program for n<=36
.
Just give the matrix elements on STDIN separated by any kind of whitespace
permanent
1 2
3 4
^D
permanent.cpp
:
/*
Compile using something like:
g++ -Wall -O3 -march=native -fstrict-aliasing -std=c++11 -pthread -s permanent.cpp -o permanent
*/
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cstdint>
#include <climits>
#include <array>
#include <vector>
#include <thread>
#include <future>
#include <ctgmath>
#include <immintrin.h>
using namespace std;
bool const DEBUG = false;
int const CACHE = 64;
using Index = int_fast32_t;
Index glynn;
// Number of elements in our vectors
Index const POW = 3;
Index const ELEMS = 1 << POW;
// Over how many floats we distribute each row
Index const WIDTH = 9;
// Number of bits in the fraction part of a floating point number
int const FLOAT_MANTISSA = 23;
// Type to use for the first add/multiply phase
using Sum = float;
using SumN = __restrict__ Sum __attribute__((vector_size(ELEMS*sizeof(Sum))));
// Type to convert to between the first and second phase
using ProdN = __restrict__ int32_t __attribute__((vector_size(ELEMS*sizeof(int32_t))));
// Type to use for the third and last multiply phase.
// Also used for the final accumulator
using Value = __int128;
using UValue = unsigned __int128;
// Wrap Value so C++ doesn't really see it and we can put it in vectors etc.
// Needed since C++ doesn't fully support __int128
struct Number {
Number& operator+=(Number const& right) {
value += right.value;
return *this;
}
// Output the value
void print(ostream& os, bool dbl = false) const;
friend ostream& operator<<(ostream& os, Number const& number) {
number.print(os);
return os;
}
Value value;
};
using ms = chrono::milliseconds;
auto nr_threads = thread::hardware_concurrency();
vector<Sum> input;
// Allocate cache aligned datastructures
template<typename T>
T* alloc(size_t n) {
T* mem = static_cast<T*>(aligned_alloc(CACHE, sizeof(T) * n));
if (mem == nullptr) throw(bad_alloc());
return mem;
}
// Work assigned to thread k of nr_threads threads
Number permanent_part(Index n, Index k, SumN** more) {
uint64_t loops = (UINT64_C(1) << n) / nr_threads;
if (glynn) loops /= 2;
Index l = loops < ELEMS ? loops : ELEMS;
loops /= l;
auto from = loops * k;
auto to = loops * (k+1);
if (DEBUG) cout << "From=" << from << "\n";
uint64_t old_gray = from ^ from/2;
uint64_t bit = 1;
bool bits = (to-from) & 1;
Index nn = (n+WIDTH-1)/WIDTH;
Index ww = nn * WIDTH;
auto column = alloc<SumN>(ww);
for (Index i=0; i<n; ++i)
for (Index j=0; j<ELEMS; ++j) column[i][j] = 0;
for (Index i=n; i<ww; ++i)
for (Index j=0; j<ELEMS; ++j) column[i][j] = 1;
Index b;
if (glynn) {
b = n > POW+1 ? n - POW - 1: 0;
auto c = n-1-b;
for (Index k=0; k<l; k++) {
Index gray = k ^ k/2;
for (Index j=0; j< c; ++j)
if (gray & 1 << j)
for (Index i=0; i<n; ++i)
column[i][k] -= input[(b+j)*n+i];
else
for (Index i=0; i<n; ++i)
column[i][k] += input[(b+j)*n+i];
}
for (Index i=0; i<n; ++i)
for (Index k=0; k<l; k++)
column[i][k] += input[n*(n-1)+i];
for (Index k=1; k<l; k+=2)
column[0][k] = -column[0][k];
for (Index i=0; i<b; ++i, bit <<= 1) {
if (old_gray & bit) {
bits = bits ^ 1;
for (Index j=0; j<ww; ++j)
column[j] -= more[i][j];
} else {
for (Index j=0; j<ww; ++j)
column[j] += more[i][j];
}
}
for (Index i=0; i<n; ++i)
for (Index k=0; k<l; k++)
column[i][k] /= 2;
} else {
b = n > POW ? n - POW : 0;
auto c = n-b;
for (Index k=0; k<l; k++) {
Index gray = k ^ k/2;
for (Index j=0; j<c; ++j)
if (gray & 1 << j)
for (Index i=0; i<n; ++i)
column[i][k] -= input[(b+j)*n+i];
}
for (Index k=1; k<l; k+=2)
column[0][k] = -column[0][k];
for (Index i=0; i<b; ++i, bit <<= 1) {
if (old_gray & bit) {
bits = bits ^ 1;
for (Index j=0; j<ww; ++j)
column[j] -= more[i][j];
}
}
}
if (DEBUG) {
for (Index i=0; i<ww; ++i) {
cout << "Column[" << i << "]=";
for (Index j=0; j<ELEMS; ++j) cout << " " << column[i][j];
cout << "\n";
}
}
--more;
old_gray = (from ^ from/2) | UINT64_C(1) << b;
Value total = 0;
SumN accu[WIDTH];
for (auto p=from; p<to; ++p) {
uint64_t new_gray = p ^ p/2;
uint64_t bit = old_gray ^ new_gray;
Index i = __builtin_ffsl(bit);
auto diff = more[i];
auto c = column;
if (new_gray > old_gray) {
// Phase 1 add/multiply.
// Uses floats until just before loss of precision
for (Index i=0; i<WIDTH; ++i) accu[i] = *c++ -= *diff++;
for (Index j=1; j < nn; ++j)
for (Index i=0; i<WIDTH; ++i) accu[i] *= *c++ -= *diff++;
} else {
// Phase 1 add/multiply.
// Uses floats until just before loss of precision
for (Index i=0; i<WIDTH; ++i) accu[i] = *c++ += *diff++;
for (Index j=1; j < nn; ++j)
for (Index i=0; i<WIDTH; ++i) accu[i] *= *c++ += *diff++;
}
if (DEBUG) {
cout << "p=" << p << "\n";
for (Index i=0; i<ww; ++i) {
cout << "Column[" << i << "]=";
for (Index j=0; j<ELEMS; ++j) cout << " " << column[i][j];
cout << "\n";
}
}
// Convert floats to int32_t
ProdN prod32[WIDTH] __attribute__((aligned (32)));
for (Index i=0; i<WIDTH; ++i)
// Unfortunately gcc doesn't recognize the static_cast<int32_t>
// as a vector pattern, so force it with an intrinsic
#ifdef __AVX__
//prod32[i] = static_cast<ProdN>(accu[i]);
reinterpret_cast<__m256i&>(prod32[i]) = _mm256_cvttps_epi32(accu[i]);
#else // __AVX__
for (Index j=0; j<ELEMS; ++j)
prod32[i][j] = static_cast<int32_t>(accu[i][j]);
#endif // __AVX__
// Phase 2 multiply. Uses int64_t until just before overflow
int64_t prod64[3][ELEMS];
for (Index i=0; i<3; ++i) {
for (Index j=0; j<ELEMS; ++j)
prod64[i][j] = static_cast<int64_t>(prod32[i][j]) * prod32[i+3][j] * prod32[i+6][j];
}
// Phase 3 multiply. Collect into __int128. For large matrices this will
// actually overflow but that's ok as long as all 128 low bits are
// correct. Terms will cancel and the final sum can fit into 128 bits
// (This will start to fail at n=35 for the all 1 matrix)
// Strictly speaking this needs the -fwrapv gcc option
for (Index j=0; j<ELEMS; ++j) {
auto value = static_cast<Value>(prod64[0][j]) * prod64[1][j] * prod64[2][j];
if (DEBUG) cout << "value[" << j << "]=" << static_cast<double>(value) << "\n";
total += value;
}
total = -total;
old_gray = new_gray;
}
return bits ? Number{-total} : Number{total};
}
// Prepare datastructures, Assign work to threads
Number permanent(Index n) {
Index nn = (n+WIDTH-1)/WIDTH;
Index ww = nn*WIDTH;
Index rows = n > (POW+glynn) ? n-POW-glynn : 0;
auto data = alloc<SumN>(ww*(rows+1));
auto pointers = alloc<SumN *>(rows+1);
auto more = &pointers[0];
for (Index i=0; i<rows; ++i)
more[i] = &data[ww*i];
more[rows] = &data[ww*rows];
for (Index j=0; j<ww; ++j)
for (Index i=0; i<ELEMS; ++i)
more[rows][j][i] = 0;
Index loops = n >= POW+glynn ? ELEMS : 1 << (n-glynn);
auto a = &input[0];
for (Index r=0; r<rows; ++r) {
for (Index j=0; j<n; ++j) {
for (Index i=0; i<loops; ++i)
more[r][j][i] = j == 0 && i %2 ? -*a : *a;
for (Index i=loops; i<ELEMS; ++i)
more[r][j][i] = 0;
++a;
}
for (Index j=n; j<ww; ++j)
for (Index i=0; i<ELEMS; ++i)
more[r][j][i] = 0;
}
if (DEBUG)
for (Index r=0; r<=rows; ++r)
for (Index j=0; j<ww; ++j) {
cout << "more[" << r << "][" << j << "]=";
for (Index i=0; i<ELEMS; ++i)
cout << " " << more[r][j][i];
cout << "\n";
}
// Send work to threads...
vector<future<Number>> results;
for (auto i=1U; i < nr_threads; ++i)
results.emplace_back(async(DEBUG ? launch::deferred: launch::async, permanent_part, n, i, more));
// And collect results
auto r = permanent_part(n, 0, more);
for (auto& result: results)
r += result.get();
free(data);
free(pointers);
// For glynn we should double the result, but we will only do this during
// the final print. This allows n=34 for an all 1 matrix to work
// if (glynn) r *= 2;
return r;
}
// Print 128 bit number
void Number::print(ostream& os, bool dbl) const {
const UValue BILLION = 1000000000;
UValue val;
if (value < 0) {
os << "-";
val = -value;
} else
val = value;
if (dbl) val *= 2;
uint32_t output[5];
for (int i=0; i<5; ++i) {
output[i] = val % BILLION;
val /= BILLION;
}
bool print = false;
for (int i=4; i>=0; --i) {
if (print) {
os << setfill('0') << setw(9) << output[i];
} else if (output[i] || i == 0) {
print = true;
os << output[i];
}
}
}
// Read matrix, check for sanity
void my_main() {
Sum a;
while (cin >> a)
input.push_back(a);
size_t n = sqrt(input.size());
if (input.size() != n*n)
throw(logic_error("Read " + to_string(input.size()) +
" elements which does not make a square matrix"));
vector<double> columns_pos(n, 0);
vector<double> columns_neg(n, 0);
Sum *p = &input[0];
for (size_t i=0; i<n; ++i)
for (size_t j=0; j<n; ++j, ++p) {
if (*p >= 0) columns_pos[j] += *p;
else columns_neg[j] -= *p;
}
std::array<double,WIDTH> prod;
prod.fill(1);
int32_t odd = 0;
for (size_t j=0; j<n; ++j) {
prod[j%WIDTH] *= max(columns_pos[j], columns_neg[j]);
auto sum = static_cast<int32_t>(columns_pos[j] - columns_neg[j]);
odd |= sum;
}
glynn = (odd & 1) ^ 1;
for (Index i=0; i<WIDTH; ++i)
// A float has an implicit 1. in front of the fraction so it can
// represent 1 bit more than the mantissa size. And 1 << (mantissa+1)
// itself is in fact representable
if (prod[i] && log2(prod[i]) > FLOAT_MANTISSA+1)
throw(range_error("Values in matrix are too large. A subproduct reaches " + to_string(prod[i]) + " which doesn't fit in a float without loss of precision"));
for (Index i=0; i<3; ++i) {
auto prod3 = prod[i] * prod[i+3] * prod[i+6];
if (log2(prod3) >= CHAR_BIT*sizeof(int64_t)-1)
throw(range_error("Values in matrix are too large. A subproduct reaches " + to_string(prod3) + " which doesn't fit in an int64"));
}
nr_threads = pow(2, ceil(log2(static_cast<float>(nr_threads))));
uint64_t loops = UINT64_C(1) << n;
if (glynn) loops /= 2;
if (nr_threads * ELEMS > loops)
nr_threads = max(loops / ELEMS, UINT64_C(1));
// if (DEBUG) nr_threads = 1;
cout << n << " x " << n << " matrix, method " << (glynn ? "Glynn" : "Ryser") << ", " << nr_threads << " threads" << endl;
// Go for the actual calculation
auto start = chrono::steady_clock::now();
auto perm = permanent(n);
auto end = chrono::steady_clock::now();
auto elapsed = chrono::duration_cast<ms>(end-start).count();
cout << "Permanent=";
perm.print(cout, glynn);
cout << " (" << elapsed / 1000. << " s)" << endl;
}
// Wrapper to print any exceptions
int main() {
try {
my_main();
} catch(exception& e) {
cerr << "Error: " << e.what() << endl;
exit(EXIT_FAILURE);
}
exit(EXIT_SUCCESS);
}
C99, n ≈ 33 (35 seconds)
#include <stdint.h>
#include <stdio.h>
#define CHUNK_SIZE 12
#define NUM_THREADS 8
#define popcnt __builtin_popcountll
#define BILLION (1000 * 1000 * 1000)
#define UPDATE_ROW_PPROD() \
update_row_pprod(row_pprod, row, rows, row_sums, mask, mask_popcnt)
typedef __int128 int128_t;
static inline int64_t update_row_pprod
(
int64_t* row_pprod, int64_t row, int64_t* rows,
int64_t* row_sums, int64_t mask, int64_t mask_popcnt
)
{
int64_t temp = 2 * popcnt(rows[row] & mask) - mask_popcnt;
row_pprod[0] *= temp;
temp -= 1;
row_pprod[1] *= temp;
temp -= row_sums[row];
row_pprod[2] *= temp;
temp += 1;
row_pprod[3] *= temp;
return row + 1;
}
int main(int argc, char* argv[])
{
int64_t size = argc - 1, rows[argc - 1];
int64_t row_sums[argc - 1];
int128_t permanent = 0, sign = size & 1 ? -1 : 1;
if (argc == 2)
{
printf("%d\n", argv[1][0] == '-' ? -1 : 1);
return 0;
}
for (int64_t row = 0; row < size; row++)
{
char positive = argv[row + 1][0] == '+' ? '-' : '+';
sign *= ',' - positive;
rows[row] = row_sums[row] = 0;
for (char* p = &argv[row + 1][1]; *p; p++)
{
rows[row] <<= 1;
rows[row] |= *p == positive;
row_sums[row] += *p == positive;
}
row_sums[row] = 2 * row_sums[row] - size;
}
#pragma omp parallel for reduction(+:permanent) num_threads(NUM_THREADS)
for (int64_t mask = 1; mask < 1LL << (size - 1); mask += 2)
{
int64_t mask_popcnt = popcnt(mask);
int64_t row = 0;
int128_t row_prod = 1 - 2 * (mask_popcnt & 1);
int128_t row_prod_high = -row_prod;
int128_t row_prod_inv = row_prod;
int128_t row_prod_inv_high = -row_prod;
for (int64_t chunk = 0; chunk < size / CHUNK_SIZE; chunk++)
{
int64_t row_pprod[4] = {1, 1, 1, 1};
for (int64_t i = 0; i < CHUNK_SIZE; i++)
row = UPDATE_ROW_PPROD();
row_prod *= row_pprod[0], row_prod_high *= row_pprod[1];
row_prod_inv *= row_pprod[3], row_prod_inv_high *= row_pprod[2];
}
int64_t row_pprod[4] = {1, 1, 1, 1};
while (row < size)
row = UPDATE_ROW_PPROD();
row_prod *= row_pprod[0], row_prod_high *= row_pprod[1];
row_prod_inv *= row_pprod[3], row_prod_inv_high *= row_pprod[2];
permanent += row_prod + row_prod_high + row_prod_inv + row_prod_inv_high;
}
permanent *= sign;
if (permanent < 0)
printf("-"), permanent *= -1;
int32_t output[5], print = 0;
output[0] = permanent % BILLION, permanent /= BILLION;
output[1] = permanent % BILLION, permanent /= BILLION;
output[2] = permanent % BILLION, permanent /= BILLION;
output[3] = permanent % BILLION, permanent /= BILLION;
output[4] = permanent % BILLION;
if (output[4])
printf("%u", output[4]), print = 1;
if (print)
printf("%09u", output[3]);
else if (output[3])
printf("%u", output[3]), print = 1;
if (print)
printf("%09u", output[2]);
else if (output[2])
printf("%u", output[2]), print = 1;
if (print)
printf("%09u", output[1]);
else if (output[1])
printf("%u", output[1]), print = 1;
if (print)
printf("%09u\n", output[0]);
else
printf("%u\n", output[0]);
}
Input is currently a bit cumbersome; it is taken with rows as command line arguments, where each entry is represented by its sign, i.e., + indicates a 1 and - indicates a -1.
Test run
$ gcc -Wall -std=c99 -march=native -Ofast -fopenmp -fwrapv -o permanent permanent.c
$ ./permanent +--+ ---+ -+-+ +--+
-4
$ ./permanent ---- -+-- +--- +-+-
0
$ ./permanent +-+----- --++-++- +----+++ ---+-+++ +--++++- -+-+-++- +-+-+-+- --+-++++
192
$ ./permanent +-+--+++----++++-++- +-+++++-+--+++--+++- --+++----+-+++---+-- ---++-++++++------+- -+++-+++---+-+-+++++ +-++--+-++++-++-+--- +--+---+-++++---+++- +--+-++-+++-+-+++-++ +-----+++-----++-++- --+-+-++-+-++++++-++ -------+----++++---- ++---++--+-++-++++++ -++-----++++-----+-+ ++---+-+----+-++-+-+ +++++---+++-+-+++-++ +--+----+--++-+----- -+++-++--+++--++--++ ++--++-++-+++-++-+-+ +++---+--++---+----+ -+++-------++-++-+--
1021509632
$ time ./permanent +++++++++++++++++++++++++++++++{,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,} # 31
8222838654177922817725562880000000
real 0m8.365s
user 1m6.504s
sys 0m0.000s
$ time ./permanent ++++++++++++++++++++++++++++++++{,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,} # 32
263130836933693530167218012160000000
real 0m17.013s
user 2m15.226s
sys 0m0.001s
$ time ./permanent +++++++++++++++++++++++++++++++++{,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,} # 33
8683317618811886495518194401280000000
real 0m34.592s
user 4m35.354s
sys 0m0.001s
Python 2, n ≈ 28
from operator import mul
def fast_glynn_perm(M):
row_comb = [sum(c) for c in zip(*M)]
n=len(M)
total = 0
old_grey = 0
sign = +1
binary_power_dict = {2**i:i for i in range(n)}
num_loops = 2**(n-1)
for bin_index in xrange(1, num_loops + 1):
total += sign * reduce(mul, row_comb)
new_grey = bin_index^(bin_index/2)
grey_diff = old_grey ^ new_grey
grey_diff_index = binary_power_dict[grey_diff]
new_vector = M[grey_diff_index]
direction = 2 * cmp(old_grey,new_grey)
for i in range(n):
row_comb[i] += new_vector[i] * direction
sign = -sign
old_grey = new_grey
return total/num_loops
Uses the Glynn formula with a Gray code for updates. Runs up to n=23
in a minute on my machine. One can surely do better implementing this in a faster language and with better data structures. This doesn't use that the matrix is ±1-valued.
A Ryser formula implementation is very similar, summing over all 0/1 vectors of coefficients rather than ±1-vectors. It takes about twice as long as Glynn's formula because adds over all 2^n such vectors, whereas Glynn's halves that using symmetry to only those starting with +1
.
from operator import mul
def fast_ryser_perm(M):
n=len(M)
row_comb = [0] * n
total = 0
old_grey = 0
sign = +1
binary_power_dict = {2**i:i for i in range(n)}
num_loops = 2**n
for bin_index in range(1, num_loops) + [0]:
total += sign * reduce(mul, row_comb)
new_grey = bin_index^(bin_index/2)
grey_diff = old_grey ^ new_grey
grey_diff_index = binary_power_dict[grey_diff]
new_vector = M[grey_diff_index]
direction = cmp(old_grey, new_grey)
for i in range(n):
row_comb[i] += new_vector[i] * direction
sign = -sign
old_grey = new_grey
return total * (-1)**n