Digit sum of central binomial coefficients
C++ (GMP) - (287,000,000 / 422,000) = 680.09
Shamelessly combine Kummer's Theorem by xnor and GMP by qwr.
Still not even close to the Go solution, not sure why.
Edit: Thanks Keith Randall for the reminder that multiplication is faster if the number is similar in size. I implemented multi-level multiplication, similar to memory coalescing concept on memory management. And the result is impressive. What used to take 51s, now takes only 0.5s (i.e., 100-fold improvement!!)
OLD CODE (n=14,000,000) Done sieving in 0.343s Done calculating binom in 51.929s Done summing in 0.901s 14000000: 18954729 real 0m53.194s user 0m53.116s sys 0m0.060s NEW CODE (n=14,000,000) Done sieving in 0.343s Done calculating binom in 0.552s Done summing in 0.902s 14000000: 18954729 real 0m1.804s user 0m1.776s sys 0m0.023s
The run for n=287,000,000
Done sieving in 4.211s Done calculating binom in 17.934s Done summing in 37.677s 287000000: 388788354 real 0m59.928s user 0m58.759s sys 0m1.116s
The code. Compile with -lgmp -lgmpxx -O3
#include <gmpxx.h>
#include <iostream>
#include <time.h>
#include <cstdio>
const int MAX=287000000;
const int PRIME_COUNT=15700000;
int primes[PRIME_COUNT], factors[PRIME_COUNT], count;
bool sieve[MAX];
int max_idx=0;
void run_sieve(){
sieve[2] = true;
primes[0] = 2;
count = 1;
for(int i=3; i<MAX; i+=2){
sieve[i] = true;
}
for(int i=3; i<17000; i+=2){
if(!sieve[i]) continue;
for(int j = i*i; j<MAX; j+=i){
sieve[j] = false;
}
}
for(int i=3; i<MAX; i+=2){
if(sieve[i]) primes[count++] = i;
}
}
mpz_class sum_digits(mpz_class n){
clock_t t = clock();
char* str = mpz_get_str(NULL, 10, n.get_mpz_t());
int result = 0;
for(int i=0;str[i]>0;i++){
result+=str[i]-48;
}
printf("Done summing in %.3fs\n", ((float)(clock()-t))/CLOCKS_PER_SEC);
return result;
}
mpz_class nc2_fast(const mpz_class &x){
clock_t t = clock();
int prime;
const unsigned int n = mpz_get_ui(x.get_mpz_t());
const unsigned int n2 = n/2;
unsigned int m;
unsigned int digit;
unsigned int carry=0;
unsigned int carries=0;
mpz_class result = 1;
mpz_class prime_prods = 1;
mpz_class tmp;
mpz_class tmp_prods[32], tmp_prime_prods[32];
for(int i=0; i<32; i++){
tmp_prods[i] = (mpz_class)NULL;
tmp_prime_prods[i] = (mpz_class)NULL;
}
for(int i=0; i< count; i++){
prime = primes[i];
carry=0;
carries=0;
if(prime > n) break;
if(prime > n2){
tmp = prime;
for(int j=0; j<32; j++){
if(tmp_prime_prods[j] == NULL){
tmp_prime_prods[j] = tmp;
break;
} else {
mpz_mul(tmp.get_mpz_t(), tmp.get_mpz_t(), tmp_prime_prods[j].get_mpz_t());
tmp_prime_prods[j] = (mpz_class)NULL;
}
}
continue;
}
m=n2;
while(m>0){
digit = m%prime;
carry = (2*digit + carry >= prime) ? 1 : 0;
carries += carry;
m/=prime;
}
if(carries>0){
tmp = 0;
mpz_ui_pow_ui(tmp.get_mpz_t(), prime, carries);
for(int j=0; j<32; j++){
if(tmp_prods[j] == NULL){
tmp_prods[j] = tmp;
break;
} else {
mpz_mul(tmp.get_mpz_t(), tmp.get_mpz_t(), tmp_prods[j].get_mpz_t());
tmp_prods[j] = (mpz_class)NULL;
}
}
}
}
result = 1;
prime_prods = 1;
for(int j=0; j<32; j++){
if(tmp_prods[j] != NULL){
mpz_mul(result.get_mpz_t(), result.get_mpz_t(), tmp_prods[j].get_mpz_t());
}
if(tmp_prime_prods[j] != NULL){
mpz_mul(prime_prods.get_mpz_t(), prime_prods.get_mpz_t(), tmp_prime_prods[j].get_mpz_t());
}
}
mpz_mul(result.get_mpz_t(), result.get_mpz_t(), prime_prods.get_mpz_t());
printf("Done calculating binom in %.3fs\n", ((float)(clock()-t))/CLOCKS_PER_SEC);
return result;
}
int main(int argc, char* argv[]){
const mpz_class n = atoi(argv[1]);
clock_t t = clock();
run_sieve();
printf("Done sieving in %.3fs\n", ((float)(clock()-t))/CLOCKS_PER_SEC);
std::cout << n << ": " << sum_digits(nc2_fast(n)) << std::endl;
return 0;
}
Go, 33.96 = (16300000 / 480000)
package main
import "math/big"
const n = 16300000
var (
sieve [n + 1]bool
remaining [n + 1]int
count [n + 1]int
)
func main() {
println("finding primes")
for p := 2; p <= n; p++ {
if sieve[p] {
continue
}
for i := p * p; i <= n; i += p {
sieve[i] = true
}
}
// count net number of times each prime appears in the result.
println("counting factors")
for i := 2; i <= n; i++ {
remaining[i] = i
}
for p := 2; p <= n; p++ {
if sieve[p] {
continue
}
for i := p; i <= n; i += p {
for remaining[i]%p == 0 { // may have multiple factors of p
remaining[i] /= p
// count positive for n!
count[p]++
// count negative twice for ((n/2)!)^2
if i <= n/2 {
count[p] -= 2
}
}
}
}
// ignore all the trailing zeros
count[2] -= count[5]
count[5] = 0
println("listing factors")
var m []uint64
for i := 0; i <= n; i++ {
for count[i] > 0 {
m = append(m, uint64(i))
count[i]--
}
}
println("grouping factors")
m = group(m)
println("multiplying")
x := mul(m)
println("converting to base 10")
d := 0
for _, c := range x.String() {
d += int(c - '0')
}
println("sum of digits:", d)
}
// Return product of elements in a.
func mul(a []uint64) *big.Int {
if len(a) == 1 {
x := big.NewInt(0)
x.SetUint64(a[0])
return x
}
m := len(a) / 2
x := mul(a[:m])
y := mul(a[m:])
x.Mul(x, y) // fast because x and y are about the same length
return x
}
// return a slice whose members have the same product
// as the input slice, but hopefully shorter.
func group(a []uint64) []uint64 {
var g []uint64
r := uint64(1)
b := 1
for _, x := range a {
c := bits(x)
if b+c <= 64 {
r *= x
b += c
} else {
g = append(g, r)
r = x
b = c
}
}
g = append(g, r)
return g
}
// bits returns the number of bits in the representation of x
func bits(x uint64) int {
n := 0
for x != 0 {
n++
x >>= 1
}
return n
}
Works by counting all the prime factors in the numerator and denominator and canceling matching factors. Multiplies the leftovers to get the result.
More than 80% of the time is spent in converting to base 10. There's got to be a better way to do that...
Python 3 (8.8 = 2.2 million / 0.25 million)
This is in Python, which isn't known for speed, so you can probably do better porting this to another language.
Primes generator taken from this StackOverflow contest.
import numpy
import time
def primesfrom2to(n):
""" Input n>=6, Returns a array of primes, 2 <= p < n """
sieve = numpy.ones(n//3 + (n%6==2), dtype=numpy.bool)
for i in range(1,int(n**0.5)//3+1):
if sieve[i]:
k=3*i+1|1
sieve[ k*k/3 ::2*k] = False
sieve[k*(k-2*(i&1)+4)/3::2*k] = False
return numpy.r_[2,3,((3*numpy.nonzero(sieve)[0][1:]+1)|1)]
t0 = time.clock()
N=220*10**4
n=N//2
print("N = %d" % N)
print()
print("Generating primes.")
primes = primesfrom2to(N)
t1 = time.clock()
print ("Time taken: %f" % (t1-t0))
print("Computing product.")
product = 1
for p in primes:
p=int(p)
carries = 0
carry = 0
if p>n:
product*=p
continue
m=n
#Count carries of n+n in base p as per Kummer's Theorem
while m:
digit = m%p
carry = (2*digit + carry >= p)
carries += carry
m//=p
if carries >0:
for _ in range(carries):
product *= p
#print(p,carries,product)
t2 = time.clock()
print ("Time taken: %f" % (t2-t1))
print("Converting number to string.")
# digit_sum = 0
# result=product
# while result:
# digit_sum+=result%10
# result//=10
digit_sum = 0
digit_string = str(product)
t3 = time.clock()
print ("Time taken: %f" % (t3-t2))
print("Summing digits.")
for d in str(digit_string):digit_sum+=int(d)
t4 = time.clock()
print ("Time taken: %f" % (t4-t3))
print ()
print ("Total time: %f" % (t4-t0))
print()
print("Sum of digits = %d" % digit_sum)
The main idea of the algorithm is to use Kummer's Theorem to get the prime-factorization of the binomial. For each prime, we learn the highest power of it that divides the answer, and multiply the running product by that power of the prime. In this way, we only need to multiply once for each prime in the prime-factorization of the answer.
Output showing time breakdown:
N = 2200000
Generating primes.
Time taken: 0.046408
Computing product.
Time taken: 17.931472
Converting number to string.
Time taken: 39.083390
Summing digits.
Time taken: 1.502393
Total time: 58.563664
Sum of digits = 2980107
Surprisingly, most of the time is spent converting the number to a string to sum its digits. Also surprisingly, converting to a string was much faster than getting digits from repeated %10
and //10
, even though the whole string must presumably be kept in memory.
Generating the primes takes negligible time (and hence I don't feel unfair copying existing code). Summing digits is fast. The actual multiplication takes one third of the time.
Given that digit summing seems to be the limiting factor, perhaps an algorithm to multiply numbers in decimal representation would save time in total by shortcutting the binary/decimal conversion.