Approximating a special case of the Riemann Theta function
C++
No more naive approach. Only evaluate inside the ellipsoid.
Uses the armadillo, ntl, gsl and pthread libraries. Install using
apt-get install libarmadillo-dev libntl-dev libgsl-dev
Compile the program using something like:
g++ -Wall -std=c++11 -O3 -fno-math-errno -funsafe-math-optimizations -ffast-math -fno-signed-zeros -fno-trapping-math -fomit-frame-pointer -march=native -s infinity.cpp -larmadillo -lntl -lgsl -lpthread -o infinity
On some systems you may need to add -lgslcblas after -lgsl.
Run with the size of the matrix followed by the elements on STDIN:
./infinity < matrix.txt
matrix.txt:
4
5 2 0 0
2 5 2 -2
0 2 5 0
0 -2 0 5
Or to try a precision of 1e-5:
./infinity -p 1e-5 < matrix.txt
infinity.cpp:
// Based on http://arxiv.org/abs/nlin/0206009
#include <iostream>
#include <vector>
#include <stdexcept>
#include <cstdlib>
#include <cmath>
#include <string>
#include <thread>
#include <future>
#include <chrono>
using namespace std;
#include <getopt.h>
#include <armadillo>
using namespace arma;
#include <NTL/mat_ZZ.h>
#include <NTL/LLL.h>
using namespace NTL;
#include <gsl/gsl_sf_gamma.h>
#include <gsl/gsl_errno.h>
#include <gsl/gsl_roots.h>
// Tunable constants for the computation.
constexpr double EPSILON = 1e-4;          // precision used unless -p overrides it
constexpr double GROW = 2;                // volume growth factor between successive ellipsoids
constexpr double UPSCALE = 1e9;           // real -> integer scale applied before lattice reduction
constexpr double THREAD_SEC = 0.1;        // walks faster than this (seconds) stay single threaded
constexpr double RADIUS_MAX = 1e6;        // upper bracket of the radius root search (multiplied by rho)
constexpr double RADIUS_INTERVAL = 1e-6;  // tolerance of the radius root search (multiplied by rho)
constexpr int ITER_MAX = 1000;            // iteration cap of the radius root search
unsigned long POINTS_MIN = 1000;          // points to collect before the error estimate kicks in
// Outcome of a lattice walk: the accumulated sum, the wall-clock time spent
// and the number of lattice points visited. Kept as a plain aggregate so it
// can be brace-initialised as Result{sum, elapsed, points}.
struct Result {
    double sum;            // accumulated exponential terms
    double elapsed;        // seconds spent
    unsigned long points;  // lattice points evaluated

    // Fold another partial result into this one, field by field.
    Result& operator+=(Result const& add) {
        sum     += add.sum;
        elapsed += add.elapsed;
        points  += add.points;
        return *this;
    }

    // Field-wise difference left - right.
    friend Result operator-(Result const& left, Result const& right) {
        Result diff = left;
        diff.sum     -= right.sum;
        diff.elapsed -= right.elapsed;
        diff.points  -= right.points;
        return diff;
    }
};
// Values handed to fill_factor_error() through the root finder's void* slot.
struct Params {
    double half_rho;  // rho / 2
    double half_N;    // N / 2
    double epsilon;   // scaled error target that the root search solves for
};
double fill_factor_error(double r, void *void_params) {
auto params = static_cast<Params*>(void_params);
r -= params->half_rho;
return gsl_sf_gamma_inc(params->half_N, r*r) - params->epsilon;
}
// Calculate radius needed for target precision
double radius(int N, double rho, double lat_det, double epsilon) {
Params params;
params.half_rho = rho / 2.;
params.half_N = N / 2.;
params.epsilon = epsilon*lat_det*gsl_sf_gamma(params.half_N)/pow(M_PI, params.half_N);
// Calculate minimum allowed radius
auto r = sqrt(params.half_N)+params.half_rho;
auto val = fill_factor_error(r, ¶ms);
cout << "Minimum R=" << r << " -> " << val << endl;
if (val > 0) {
// The minimum radius is not good enough. Work out a better one by
// finding the root of a tricky function
auto low = r;
auto high = RADIUS_MAX * 2 * params.half_rho;
auto val = fill_factor_error(high, ¶ms);
if (val >= 0)
throw(logic_error("huge RADIUS_MAX is still not big enough"));
gsl_function F;
F.function = fill_factor_error;
F.params = ¶ms;
auto T = gsl_root_fsolver_brent;
auto s = gsl_root_fsolver_alloc (T);
gsl_root_fsolver_set (s, &F, low, high);
int status = GSL_CONTINUE;
for (auto iter=1; status == GSL_CONTINUE && iter <= ITER_MAX; ++iter) {
gsl_root_fsolver_iterate (s);
low = gsl_root_fsolver_x_lower (s);
high = gsl_root_fsolver_x_upper (s);
status = gsl_root_test_interval(low, high, 0, RADIUS_INTERVAL * 2 * params.half_rho);
}
r = gsl_root_fsolver_root(s);
gsl_root_fsolver_free(s);
if (status == GSL_CONTINUE)
throw(logic_error("Search for R did not converge"));
}
return r;
}
// Recursive walk over every lattice point inside the ellipsoid, one
// dimension per recursion level, from d down to 0.
//
// d       dimension handled at this level
// A       lower triangular lattice basis
// InvD    reciprocals of the diagonal of A (InvD[d] = 1 / A(d, d))
// Accu    scratch matrix; row d holds the offsets accumulated from the
//         coordinates already fixed at the higher levels
// result  receives exp(remaining squared radius) per point and the point count
// r2      squared radius still available at this level
void ellipsoid(int d, mat const& A, double const* InvD, mat& Accu,
               Result& result, double r2) {
    double const reach    = sqrt(r2);
    double const shift    = Accu(d, d);
    double const diag     = A(d, d);
    double const inv_diag = InvD[d];

    // Integer coefficients k for which k * diag + shift lies within [-reach, reach]
    double const first = ceil((-reach-shift) * inv_diag);
    double const last  = floor((reach-shift) * inv_diag);

    for (double k = first; k <= last; ++k) {
        double const coord = k * diag+shift;
        double const left  = r2 - coord*coord;
        if (d == 0) {
            // Innermost dimension: this is an actual lattice point
            result.sum += exp(left);
            ++result.points;
            continue;
        }
        // Push this coordinate's contribution down to the next level's offsets
        for (int col = 0; col < d; ++col)
            Accu(d-1, col) = Accu(d, col) + k * A(d, col);
        ellipsoid(d-1, A, InvD, Accu, result, left);
    }
}
// Specialised version of ellipsoid() that will only process the points in
// one octant, so that several callers can share the work.
//
// octant is consumed one bit per recursion level, lowest bit first:
//   - bit set   -> walk the upper half of the coefficient range
//   - bit clear -> walk the lower half of the coefficient range
// When the range has an odd number of coefficients the middle one belongs to
// neither half; it is recursed into with the *unshifted* octant so that the
// lower dimensions decide who handles it.
// Once the shifted octant equals 1 the rest of the walk is unrestricted and
// is handed to the plain ellipsoid(). At d == 0 a point is only accumulated
// if the octant value in effect has at most one bit set.
//
// The other parameters have the same meaning as in the plain ellipsoid().
void ellipsoid(int d, mat const& A, double const* InvD, mat& Accu,
               Result& result, double r2, unsigned int octant) {
    // Rounding in the caller can leave a slightly negative r2. sqrt() would
    // then give NaN and the conversions to long below would be undefined.
    // No point can lie inside a negative squared radius, so stop here.
    if (r2 < 0) return;

    auto r = sqrt(r2);
    auto offset = Accu(d, d);
    // InvD[d] = 1/ A(d, d)
    long from = ceil((-r-offset) * InvD[d]);
    long to   = floor((r-offset) * InvD[d]);
    auto points = to-from+1;
    auto base = from + points/2;

    if (points & 1) {
        // Odd count: handle the middle coefficient separately
        auto value = base * A(d, d) + offset;
        auto residu = r2 - value * value;
        if (d == 0) {
            if ((octant & (octant - 1)) == 0) {
                result.sum += exp(residu);
                ++result.points;
            }
        } else {
            for (auto i=0; i<d; ++i) Accu(d-1, i) = Accu(d, i) + base * A(d, i);
            ellipsoid(d-1, A, InvD, Accu, result, residu, octant);
        }
        // The upper half starts just above the middle coefficient
        ++base;
    }

    if ((octant & 1) == 0) {
        // Lower half selected: [from, from + points/2 - 1]
        to = from + points / 2 - 1;
        base = from;
    }
    // Drop the bit that was just used
    octant /= 2;

    for (auto v = base; v <= to; ++v) {
        auto value = v * A(d,d)+offset;
        auto residu = r2 - value*value;
        if (d == 0) {
            if ((octant & (octant - 1)) == 0) {
                result.sum += exp(residu);
                ++result.points;
            }
        } else {
            for (auto i=0; i<d; ++i) Accu(d-1, i) = Accu(d, i) + v * A(d, i);
            if (octant == 1)
                // No selector bits left: walk everything below
                ellipsoid(d-1, A, InvD, Accu, result, residu);
            else
                ellipsoid(d-1, A, InvD, Accu, result, residu, octant);
        }
    }
}
// Set up the scratch state, run the octant-restricted ellipsoid() walk for
// radius r and time it.
//
// N       lattice dimension
// A       lower triangular lattice basis
// InvD    reciprocals of the diagonal of A
// r       ellipsoid radius
// octant  which share of the work to process (default 1)
//
// Returns the partial sum (already divided by exp(r*r)), the elapsed seconds
// and the number of points evaluated.
Result sym_ellipsoid(int N, mat const& A, const vector<double>& InvD, double r,
                     unsigned int octant = 1) {
    auto const t0 = chrono::steady_clock::now();
    double const radius_sq = r*r;

    // Only the top row needs a defined starting value; lower rows are
    // filled in during the walk
    mat Accu(N, N);
    Accu.row(N-1).zeros();

    Result walk{0, 0, 0};
    // 2*octant+1 forces the points into the upper half plane, skipping 0
    // This way we use the lattice symmetry and calculate only half the points
    unsigned int const selector = 2*octant+1;
    ellipsoid(N-1, A, InvD.data(), Accu, walk, radius_sq, selector);

    // ellipsoid() adds exp(r*r - |x|^2) per point; remove the exp(r*r) factor
    walk.sum /= exp(radius_sq);

    chrono::duration<double> const took = chrono::steady_clock::now() - t0;
    walk.elapsed = took.count();
    return walk;
}
// Prepare multithreaded use of sym_ellipsoid(). Each thread gets 1 octant.
//
// nr_threads is rounded up to a power of two; 0 is treated as 1 so the call
// degrades to a single-threaded walk instead of underflowing. The calling
// thread processes one octant itself and the others run through std::async.
// The returned Result is the field-wise total of all octants, so elapsed is
// the sum of the per-octant times.
Result sym_ellipsoid_t(int N, mat const& A, const vector<double>& InvD, double r, unsigned int nr_threads) {
    // Integer round-up to a power of two; stop before the shift would wrap
    unsigned int slices = 1;
    while (slices < nr_threads && (slices << 1) != 0) slices <<= 1;

    // Octant ids are slices .. 2*slices-1: a leading marker bit followed by
    // the bits that select the half to walk in each dimension
    vector<future<Result>> workers;
    workers.reserve(slices - 1);
    for (unsigned int octant = slices + 1; octant < 2 * slices; ++octant)
        workers.emplace_back(async(launch::async, sym_ellipsoid, N, ref(A), ref(InvD), r, octant));

    // Do the first octant on the calling thread while the others run
    Result total = sym_ellipsoid(N, A, InvD, r, slices);
    for (auto& worker : workers) total += worker.get();
    return total;
}
// Program entry point.
//
// Options:
//   -p epsilon  target absolute error (default EPSILON)
//   -n threads  maximum number of threads (rounded up to a power of two)
//   -M          do not use the modular transform
//   -R          do not use the lattice reduced basis for the walk
//   -c          conservative: walk the full provable radius
//   -e          show eigenvalues
//
// Reads the matrix size N followed by N*N elements from STDIN, then prints
// the intermediate quantities, the number of evaluated points and the sum.
// The matrix is handed to chol(), so it is presumably expected to be
// symmetric positive definite.
int main(int argc, char* const* argv) {
    cin.exceptions(ios::failbit | ios::badbit);
    cout.precision(12);

    double epsilon    = EPSILON; // Target absolute error
    bool inv_modular  = true;    // Use modular transform to get the best matrix
    bool lat_reduce   = true;    // Use lattice reduction to align the ellipsoid
    bool conservative = false;   // Use provable error bound instead of a guess
    bool eigen_values = false;   // Show eigenvalues
    int threads_max   = thread::hardware_concurrency();

    int option_char;
    // getopt() signals the end of the options with -1
    while ((option_char = getopt(argc, argv, "p:n:MRce")) != -1)
        switch (option_char) {
            case 'p': epsilon      = atof(optarg); break;
            case 'n': threads_max  = atoi(optarg); break;
            case 'M': inv_modular  = false;        break;
            case 'R': lat_reduce   = false;        break;
            case 'c': conservative = true;         break;
            case 'e': eigen_values = true;         break;
            default:
                cerr << "usage: " << argv[0] << " [-p epsilon] [-n threads] [-M] [-R] [-e] [-c]" << endl;
                exit(EXIT_FAILURE);
        }
    if (optind < argc) {
        cerr << "Unexpected argument" << endl;
        exit(EXIT_FAILURE);
    }
    if (threads_max < 1) threads_max = 1;
    threads_max = pow(2, ceil(log2(threads_max)));
    cout << "Using up to " << threads_max << " threads" << endl;

    int N;
    cin >> N;
    if (N < 1) throw(logic_error("Matrix size must be positive"));
    mat P(N, N);
    for (auto& v: P) cin >> v;

    if (eigen_values) {
        vec eigval = eig_sym(P);
        cout << "Eigenvalues:\n" << eigval << endl;
    }

    // Decompose P = A * A.t()
    mat A = chol(P, "lower");

    // Calculate lattice determinant
    double lat_det = 1;
    for (auto i=0; i<N; ++i) {
        if (A(i,i) <= 0) throw(logic_error("Diagonal not Positive"));
        lat_det *= A(i,i);
    }
    cout << "Lattice determinant=" << lat_det << endl;

    auto factor = lat_det / pow(M_PI, N/2.0);
    if (inv_modular && factor < 1) {
        epsilon *= factor;
        cout << "Lattice determinant is small. Using inverse instead. Factor=" << factor << endl;
        P = M_PI * M_PI * inv(P);
        A = chol(P, "lower");
        // We could simply calculate the new lat_det as pow(M_PI,N)/lat_det
        lat_det = 1;
        for (auto i=0; i<N; ++i) {
            if (A(i,i) <= 0) throw(logic_error("Diagonal not Positive"));
            lat_det *= A(i,i);
        }
        cout << "New lattice determinant=" << lat_det << endl;
    } else
        factor = 1;

    // Prepare for lattice reduction.
    // Since the library works on integer lattices we will scale up our matrix
    // so that the smallest non-zero magnitude maps to UPSCALE
    double min_abs = INFINITY;
    for (auto i=0; i<N; ++i) {
        for (auto j=0; j<N;++j) {
            double const magnitude = std::abs(A(i,j));
            if (magnitude != 0 && magnitude < min_abs) min_abs = magnitude;
        }
    }

    auto upscale = UPSCALE/min_abs;
    mat_ZZ a;
    a.SetDims(N,N);
    for (auto i=0; i<N; ++i)
        for (auto j=0; j<N;++j) a[i][j] = to_ZZ(A(i,j)*upscale);

    // Finally do the actual lattice reduction
    mat_ZZ u;
    auto rank = G_BKZ_FP(a, u);
    if (rank != N) throw(logic_error("Matrix is singular"));
    mat U(N,N);
    for (auto i=0; i<N;++i)
        for (auto j=0; j<N;++j) U(i,j) = to_double(u[i][j]);

    // There should now be a short lattice vector at row 0
    ZZ sum = to_ZZ(0);
    for (auto j=0; j<N;++j) sum += a[0][j]*a[0][j];
    auto rho = sqrt(to_double(sum))/upscale;
    cout << "Rho=" << rho << " (integer square " <<
        rho*rho << " ~ " <<
        static_cast<int>(rho*rho+0.5) << ")" << endl;

    // Lattice reduction doesn't gain us anything conceptually.
    // The same number of points is evaluated for the same exponential values
    // However working through the ellipsoid dimensions from large lattice
    // base vectors to small makes ellipsoid() a *lot* faster
    if (lat_reduce) {
        mat B = U * A;
        P = B * B.t();
        A = chol(P, "lower");
        if (eigen_values) {
            vec eigval = eig_sym(P);
            cout << "New eigenvalues:\n" << eigval << endl;
        }
    }

    vector<double> InvD(N);
    for (auto i=0; i<N; ++i) InvD[i] = 1 / A(i, i);

    // Calculate radius needed for target precision
    auto r = radius(N, rho, lat_det, epsilon);
    cout << "Safe R=" << r << endl;

    auto nr_threads = threads_max;
    Result result;
    if (conservative) {
        // Walk all points inside the ellipsoid with transformed radius r
        result = sym_ellipsoid_t(N, A, InvD, r, nr_threads);
    } else {
        // First grow the radius until we saw POINTS_MIN points or reach the
        // target radius
        double i = floor(N * log2(r/rho) / log2(GROW));
        if (i < 0) i = 0;
        auto R = r * pow(GROW, -i/N);
        cout << "Initial R=" << R << endl;
        result = sym_ellipsoid_t(N, A, InvD, R, nr_threads);
        nr_threads = result.elapsed < THREAD_SEC ? 1 : threads_max;
        auto max_new_points = result.points;
        while (--i >= 0 && result.points < POINTS_MIN) {
            R = r * pow(GROW, -i/N);
            auto change = result;
            result = sym_ellipsoid_t(N, A, InvD, R, nr_threads);
            nr_threads = result.elapsed < THREAD_SEC ? 1 : threads_max;
            change = result - change;
            if (change.points > max_new_points) max_new_points = change.points;
        }

        // Now we have enough points that it's worth bothering to use threads
        while (--i >= 0) {
            R = r * pow(GROW, -i/N);
            auto change = result;
            result = sym_ellipsoid_t(N, A, InvD, R, nr_threads);
            nr_threads = result.elapsed < THREAD_SEC ? 1 : threads_max;
            change = result - change;
            // This is probably too crude and might misestimate the error
            // I've never seen it fail though
            if (change.points > max_new_points) {
                max_new_points = change.points;
                if (change.sum < epsilon/2) break;
            }
        }
        cout << "Final R=" << R << endl;
    }

    // We calculated half the points and skipped 0.
    result.sum = 2*result.sum+1;

    // Modular transform factor
    result.sum /= factor;

    // Report result
    cout <<
        "Evaluated " << result.points << " points\n" <<
        "Sum = " << result.sum << endl;
}