Factoring a 64-bit integer
Total 36,757,269,913 cycles
830B assembled
Number Time (s) Cycles
8831269065180497 0.1 1148
2843901546547359024 55.0 9535194
6111061272747645669 351.4 60559378
11554045868611683619 0.8 135135
6764921230558061729 1.0 155407
16870180535862877896 43067.5 7126449414
3778974635503891117 148.7 22823483
204667546124958269 87.4 13635943
16927447722109721827 16119.0 2739172134
9929766466606501253 156158.3 26784802677
Factorization by trial division, with a few optimizations. Probably not the fastest, but as no one else has posted yet, I'll kick it off.
main:
call read
call factor
halt 0
# Prints factors. Pass number to be factored in register `a'.
factor:
mov n, a # Write routine expects factor in `a', so we'll just keep it there.
# Handle 2 specially
_factor_twos: # do {
geu b, n, 1
jz _factor_ret, b # if (number <= 1) return
and b, n, 1
jnz _factor_rest, b # if (number & 1 != 0) break
mov a, 2
call write # printf("%llu\n", 2)
shr n, n, 1 # number >>= 1
jmp _factor_twos # } while (0)
_factor_rest:
mov a, 3 # factor = 3
_factor_rest_loop: # do {
cmp b, n, 1
jnz _factor_ret, b # if (number == 1) return
divu b, c, n, a # (quotient, remainder) = number / factor
jnz _factor_not_divisible, c # if (remainder == 0) {
call write # printf("%llu\n", factor)
mov n, b # number = quotient
jmp _factor_rest_loop
_factor_not_divisible: # } else {
leu c, b, a
jnz _factor_print_last, c # if quotient < a, print n and return.
add a, a, 2 # else a += 2 }
jmp _factor_rest_loop # } while (0)
_factor_print_last:
mov a, n
call write # printf("%llu", n)
_factor_ret:
ret
# Read stdin. Must be one decimal string, ending with a newline.
# Behavior on invalid input is unspecified.
# Return value stored in a.
read:
mov a, 0
_read_loop_start: # do {
lw c, 0xffffffffffffffff # c = getchar()
sub b, c, ord('\n')
jz _read_ret, b # if (c == '\n'), return
sub c, c, ord('0') # c -= '0'
mulu a, y, a, 10
add a, a, c # a = 10*a + c
jmp _read_loop_start # } while (0)
_read_ret:
ret a
# Write number in register a to stdout, followed by a newline.
# Fortunately, a 64-bit number has at most 20 digits, and we have 25 registers.
# So we can keep them in registers instead of pushing them onto the stack or heap,
# avoiding the fairly expensive read back.
write:
divu a, b, a, 10
jz _write_b, a
divu a, c, a, 10
jz _write_c, a
divu a, d, a, 10
jz _write_d, a
divu a, e, a, 10
jz _write_e, a
divu a, f, a, 10
jz _write_f, a
divu a, g, a, 10
jz _write_g, a
divu a, h, a, 10
jz _write_h, a
divu a, i, a, 10
jz _write_i, a
divu a, j, a, 10
jz _write_j, a
divu a, k, a, 10
jz _write_k, a
divu a, l, a, 10
jz _write_l, a
divu a, m, a, 10
jz _write_m, a
divu a, n, a, 10
jz _write_n, a
divu a, o, a, 10
jz _write_o, a
divu a, p, a, 10
jz _write_p, a
divu a, q, a, 10
jz _write_q, a
divu a, r, a, 10
jz _write_r, a
divu a, s, a, 10
jz _write_s, a
divu a, t, a, 10
jz _write_t, a
add a, a, ord('0')
sw 0xffffffffffffffff, a
_write_t:
add t, t, ord('0')
sw 0xffffffffffffffff, t
_write_s:
add s, s, ord('0')
sw 0xffffffffffffffff, s
_write_r:
add r, r, ord('0')
sw 0xffffffffffffffff, r
_write_q:
add q, q, ord('0')
sw 0xffffffffffffffff, q
_write_p:
add p, p, ord('0')
sw 0xffffffffffffffff, p
_write_o:
add o, o, ord('0')
sw 0xffffffffffffffff, o
_write_n:
add n, n, ord('0')
sw 0xffffffffffffffff, n
_write_m:
add m, m, ord('0')
sw 0xffffffffffffffff, m
_write_l:
add l, l, ord('0')
sw 0xffffffffffffffff, l
_write_k:
add k, k, ord('0')
sw 0xffffffffffffffff, k
_write_j:
add j, j, ord('0')
sw 0xffffffffffffffff, j
_write_i:
add i, i, ord('0')
sw 0xffffffffffffffff, i
_write_h:
add h, h, ord('0')
sw 0xffffffffffffffff, h
_write_g:
add g, g, ord('0')
sw 0xffffffffffffffff, g
_write_f:
add f, f, ord('0')
sw 0xffffffffffffffff, f
_write_e:
add e, e, ord('0')
sw 0xffffffffffffffff, e
_write_d:
add d, d, ord('0')
sw 0xffffffffffffffff, d
_write_c:
add c, c, ord('0')
sw 0xffffffffffffffff, c
_write_b:
add b, b, ord('0')
sw 0xffffffffffffffff, b
sw 0xffffffffffffffff, ord('\n')
ret
# End of write.
Full output from my timing loop, in case anyone wants to check the results and/or my copy-paste and math.
13
23
31
37
41
47
47
59
61
79
Execution terminated after 1148 cycles with exit code 0.
/usr/local/bin/golf.py factor.bin <<< $num 0.09s user 0.02s system 94% cpu 0.115 total
2
2
2
2
3
3
107
1121707
164546579
Execution terminated after 9535194 cycles with exit code 0.
/usr/local/bin/golf.py factor.bin <<< $num 55.03s user 0.07s system 99% cpu 55.136 total
3
3
3
7
7
7
13
50759273983933
Execution terminated after 60559378 cycles with exit code 0.
/usr/local/bin/golf.py factor.bin <<< $num 351.38s user 0.32s system 99% cpu 5:51.89 total
7
7
7
13
2531
4091
250250521
Execution terminated after 135135 cycles with exit code 0.
/usr/local/bin/golf.py factor.bin <<< $num 0.84s user 0.01s system 99% cpu 0.857 total
31
89
97
3881
18211
357653
Execution terminated after 155407 cycles with exit code 0.
/usr/local/bin/golf.py factor.bin <<< $num 0.98s user 0.01s system 99% cpu 0.997 total
2
2
2
3
702924188994286579
Execution terminated after 7126449414 cycles with exit code 0.
/usr/local/bin/golf.py factor.bin <<< $num 43067.49s user 86.38s system 99% cpu 12:04:12.58 total
7
103
727
7209485975851
Execution terminated after 22823483 cycles with exit code 0.
/usr/local/bin/golf.py factor.bin <<< $num 134.60s user 0.13s system 99% cpu 2:14.83 total
66449
1604167
1920043
Execution terminated after 13635943 cycles with exit code 0.
/usr/local/bin/golf.py factor.bin <<< $num 81.02s user 0.06s system 99% cpu 1:21.12 total
322255481
52528036667
Execution terminated after 2739172134 cycles with exit code 0.
/usr/local/bin/golf.py factor.bin <<< $num 16119.02s user 12.31s system 99% cpu 4:29:03.61 total
9929766466606501253
Execution terminated after 26784802677 cycles with exit code 0.
/usr/local/bin/golf.py factor.bin <<< $num 156158.25s user 186.81s system 49% cpu 88:35:03.65 total
score = 378,867,816 cycles
[randomized - your results may vary]
Uses the Miller-Rabin primality test (a deterministic version that can handle up to 2^64), a bit of trial factoring, and ECM factoring. All the algebraic operations are in there, including modular addition, subtraction, multiplication, exponentiation, and inversion (mod n<2^64).
Modular multiplication is suboptimal - it does a binary search for the quotient. Computing the remainder of a 128 by 64 divide is tough without a matching builtin instruction. Speed that up and the whole thing will go faster.
2290 bytes compiled
step = i
test = j
tmp = k
# prints factors of number on stdin, one per line in arbitrary order, to stdout
main:
call read_int
mov a, x
two:
# remove factors of 2
and tmp, a, 1
jnz odd, tmp
mov x, 2
call write_int
sw -1, ord('\n')
shr a, a, 1
jmp two
odd:
cmp tmp, a, 1
jnz done2, tmp
call factorodd
done2:
halt 0
# a = input value >= 3 and odd. Prints factors of a and returns.
factorodd:
call isprime
jnz prime2, b
# n is composite. Try to find a (not necessarily prime) factor.
call trial
jnz split, b
call primepower
jnz split, b
call ecm
split:
# recurse on two parts
mov a, b
call factorodd
mov a, c
jmp factorodd
prime2:
mov x, a
call write_int
sw -1, ord('\n')
ret
# a = input value >= 3 and odd.
# b, c = output values, a = b*c, b,c > 1.
# returns b=0 if failure.
trial:
mov b, 3
loop5:
divu c, r, a, b
jz found, r
add b, b, 2
leu tmp, b, 1001 # TODO: tune max size
jnz loop5, tmp
mov b, 0
found:
ret b, c
# a = input value >= 3. Must be odd and have >=2 distinct prime factors.
# b, c = output values, a = b*c, b,c > 1.
ecm:
mov n, a
loop12:
# choose a random elliptic curve by picking m with 0 <= m < n
# y^2 = x^3 + mx + 1
rand a
mov b, 1
mov c, n
call mulmod
mov m, d
# check that the descriminant is nonzero. D = 4m^3+27
mov a, m
mov b, m
call mulmod
mov a, d
call mulmod
mov a, d
mov b, 4
call mulmod
mov a, d
mov b, 27
call addmod
jz loop12, d
# try to find factor of n with this elliptic curve
call ecmtry
jz loop12, b
divu c, tmp, n, b
ret b, c
# n = modulus
# m = curve param
# b = return value - 0 or a factor of n
ecmtry:
mov p, 0
mov q, 1
# multiply by 2
mov e, p
mov f, q
mov g, p
mov h, q
call ellipticadd
jnz foundfactor2, d
mov p, b
mov q, c
# continue multiplying by odd numbers
# TODO: just multiply by odd primes
mov i, 3
loop15:
mov e, p
mov f, q
mov g, i
call ellipticmul
jnz foundfactor3, d
mov p, b
mov q, c
add i, i, 2
jnz loop15, p
jnz loop15, q
mov b, 0
ret b
foundfactor3:
mov b, d
ret b
# multiplication by a constant in the elliptic curve "group"
# m = input curve parameter
# n = input modulus
# e,f = input x,y of point
# g = input multiplier
# b,c = output x,y of sum
# d = output 0 if successful, factor of n if not.
ellipticmul:
# compute g*(e,f) by repeated doubling
mov x, e
mov y, f
mov t, g
mov r, 0
mov s, 0
loop14:
jz done6, t
and tmp, t, 1
jz noadd, tmp
# r += x
mov e, r
mov f, s
mov g, x
mov h, y
call ellipticadd
jnz foundfactor2, d
mov r, b
mov s, c
noadd:
# x += x
mov e, x
mov f, y
mov g, x
mov h, y
call ellipticadd
jnz foundfactor2, d
mov x, b
mov y, c
shr t, t, 1
jmp loop14
done6:
mov b, r
mov c, s
mov d, 0
ret b, c, d
foundfactor2:
ret d
# addition in the elliptic curve "group"
# m = input curve parameter
# n = input modulus
# e,f = input x,y of first point (p)
# g,h = input x,y of second point (q)
# b,c = output x,y of sum
# d = output 0 if successful, factor of n if not.
ellipticadd:
px = e
py = f
qx = g
qy = h
rx = b
ry = c
gcd = d
# 0+q=q
or tmp, px, py
jz retq, tmp
# p+0=p
or tmp, qx, qy
jz retp, tmp
cmp tmp, px, qx
jz diffx, tmp
cmp tmp, py, qy
jz inf, tmp # px == qx, py != qy, return inf
jz inf, py # px == qx, py == qy, py==0, return inf
# px == qx, py == qy, py != 0. This is the double-root case
# u = numerator = 3 px^2 + a
mov a, px
mov b, px
mov c, n
call mulmod
mov a, d
mov b, 3
call mulmod
mov a, d
mov b, m
call addmod
mov u, d
# v = denominator = 2y
mov a, py
mov b, py
call addmod
mov v, d
jmp divide
diffx:
# px != qx
mov a, py
mov b, qy
call submod
mov u, d # u = numerator = py - qy
mov a, px
mov b, qx
call submod
mov v, d # v = denominator = px - qx
divide:
mov a, v
call inv
cmp tmp, c, 1 # gcd(v,n)==1?
jz foundfactor, tmp
mov v, d
# compute s = u/v mod n = u * v^-1 mod n
mov a, u
mov b, v
mov c, n
call mulmod
mov s, d
# compute x = s*s - px - qx
mov a, s
mov b, s
call mulmod
mov a, d
mov b, px
call submod
mov a, d
mov b, qx
call submod
mov x, d
# compute y = s * (px - x) - py
mov a, px
mov b, x
call submod
mov a, d
mov b, s
call mulmod
mov a, d
mov b, py
call submod
# return [s*s-px-qx, s*(px-x)-py]
mov rx, x
mov ry, d
mov gcd, 0
ret rx, ry, gcd
retp:
mov rx, px
mov ry, py
mov gcd, 0
ret rx, ry, gcd
retq:
mov rx, qx
mov ry, qy
mov gcd, 0
ret rx, ry, gcd
inf:
mov rx, 0
mov ry, 0
mov gcd, 0
ret rx, ry, gcd
foundfactor:
mov gcd, c
ret gcd
# a = input value >= 3. Must be odd.
# b = output factor of a.
# if a is a prime power, returns a factor of a.
# otherwise, returns 0.
primepower:
mov e, 2
loop6:
mov b, e
call nthroot
leu tmp, c, 3
jnz notprimepower, tmp
# compute c^e
mov r, 1
mov b, e
loop9:
mulu r, tmp, r, c
dec b
jnz loop9, b
cmp tmp, r, a
jnz isprimepower, tmp
inc e
cmp tmp, e, 41 # largest odd power in 2^64 is 3^40
jz loop6, tmp
ret
isprimepower:
mov b, c
ret
notprimepower:
mov b, 0
ret b
# a = input value
# b = input exponent
# c = output floor(a**(1/b))
nthroot:
# binary search for the root
mov c, 0
mov step, 0x80000000
loop7:
jz done3, step
add test, c, step
shr step, step, 1
# check if test^b > a
# test^b might overflow, so we can't just compute that and compare.
# so we start with e=a, divide by test b times, and check that
# the result is 0.
mov d, b
mov e, a
loop8:
divu e, tmp, e, test
dec d
jnz loop8, d
jz loop7, e # test is too big, don't change c
mov c, test # test is <= floor(a**(1/b)). Keep it.
jmp loop7
done3:
ret c
# a = input number >= 2
# b = ouput 1/0 if a is prime/composite
isprime:
# small primes (tests below require that we check these first)
cmp tmp, a, 2
jnz prime, tmp
cmp tmp, a, 3
jnz prime, tmp
cmp tmp, a, 5
jnz prime, tmp
cmp tmp, a, 7
jnz prime, tmp
cmp tmp, a, 11
jnz prime, tmp
cmp tmp, a, 13
jnz prime, tmp
cmp tmp, a, 17
jnz prime, tmp
cmp tmp, a, 19
jnz prime, tmp
cmp tmp, a, 23
jnz prime, tmp
cmp tmp, a, 29
jnz prime, tmp
cmp tmp, a, 31
jnz prime, tmp
cmp tmp, a, 37
jnz prime, tmp
# even numbers
and tmp, a, 1
jz composite, tmp
# uses Miller-Rabin with enough fixed bases to guarantee the
# correct result for numbers up to 2^64.
# see http://oeis.org/A014233
mov b, 2
call miller_rabin
jz composite, c
mov b, 3
call miller_rabin
jz composite, c
mov b, 5
call miller_rabin
jz composite, c
mov b, 7
call miller_rabin
jz composite, c
mov b, 11
call miller_rabin
jz composite, c
mov b, 13
call miller_rabin
jz composite, c
mov b, 17
call miller_rabin
jz composite, c
mov b, 19
call miller_rabin
jz composite, c
mov b, 23
call miller_rabin
jz composite, c
mov b, 29
call miller_rabin
jz composite, c
mov b, 31
call miller_rabin
jz composite, c
mov b, 37
call miller_rabin
jz composite, c
prime:
mov b, 1
ret b
composite:
mov b, 0
ret b
# a = input number >= 3
# b = input base value (1 < b < a-1)
# c = output 1 = may be prime, 0 = definitely composite
miller_rabin:
mov p, a
mov t, b
sub q, p, 1
mov s, 0
sub e, a, 1
loop2:
and tmp, e, 1
jnz endloop2, tmp
add s, s, 1
shr e, e, 1
jmp loop2
endloop2:
mov a, t
mov b, e
mov c, p
call pow
cmp tmp, d, 1
jnz unknown, tmp
cmp tmp, d, q
jnz unknown, tmp
mov x, d
loop4:
sub s, s, 1
jz composite2, s
mov a, x
mov b, x
mov c, p
call mulmod
cmp tmp, d, 1
jnz composite2, tmp
cmp tmp, d, q
jnz unknown, tmp
mov x, d
jmp loop4
composite2:
mov c, 0
ret c
unknown:
mov c, 1
ret c
# a = input value
# n = input modulus
# returns c, d such that
# c = gcd(a,n)
# d * a % n == gcd(a,n)
inv:
# extended euclidean algorithm, from
# http://stackoverflow.com/a/14093613/180090
new = e
old = f
pos = g
mov p, n
mov new, 1
mov old, 0
mov q, p
mov pos, 0
loop13:
jz done5, a
divu q, r, q, a
mulu h, tmp, q, new
add h, old, h
mov old, new
mov new, h
mov q, a
mov a, r
xor pos, pos, 1
jmp loop13
done5:
mov c, q
jnz positer, pos
sub old, p, old
positer:
mov d, old
ret c, d
# d = a ** b % c
pow:
mov r, 1
mov e, b
mov t, a
# t ** e % c
loop3:
jz endloop3, e
and tmp, e, 1
jz lowbitclear, tmp
mov a, t
mov b, r
call mulmod # r = t*r mod c
mov r, d
lowbitclear:
shr e, e, 1
mov a, t
mov b, t
call mulmod # t = t*t mod c
mov t, d
jmp loop3
endloop3:
mov d, r
ret d
# d = a * b % c
mulmod:
# do multiply
mulu e, f, a, b
# binary search for quotient
mov q, 0
mov step, 0x8000000000000000
loop:
jz done, step
# test = q + step
add test, q, step
shr step, step, 1
# check test*c
mulu g, h, test, c
geu tmp, h, f
jnz loop, tmp
leu tmp, h, f
jnz ok, tmp
geu tmp, g, e
jnz loop, tmp
ok:
# test*c <= a*b. Use test as the new q.
mov q, test
jmp loop
done:
# q == quotient of (a*b)/c
mulu g, h, q, c # q*c, g+h<<64 should be close to e+f<<64
sub d, e, g # remainder
ret d
# d = (a + b) % n
# requires 0 <= a, b < n
addmod:
add d, a, b
leu tmp, d, a # wrapped 2^64
jnz overflow, tmp
leu tmp, d, n # result less than modulus?
jnz inrange, tmp
overflow:
sub d, d, n
inrange:
ret d
# d = (a - b) % n
# requires 0 <= a, b < n
submod:
sub d, a, b
leu tmp, a, b # below 0?
jz nounderflow, tmp
add d, d, n
nounderflow:
ret d
read_int:
mov x, 0
read_int_loop:
lw c, -1
cmp q, c, ord("\n")
jnz read_int_done, q
sub c, c, ord("0")
mulu x, d, x, 10
add x, x, c
jmp read_int_loop
read_int_done:
ret x
write_int:
divu x, m, x, 10
jz write_int_done, x
call write_int
write_int_done:
add m, m, ord("0")
sw -1, m
ret
Output:
13
23
31
37
41
47
47
59
61
79
Execution terminated after 799601 cycles with exit code 0.
2
2
2
2
3
3
107
164546579
1121707
Execution terminated after 216769549 cycles with exit code 0.
3
3
3
7
7
7
13
50759273983933
Execution terminated after 939985 cycles with exit code 0.
7
7
7
13
2531
4091
250250521
Execution terminated after 34213849 cycles with exit code 0.
31
89
97
18211
3881
357653
Execution terminated after 9662357 cycles with exit code 0.
2
2
2
3
702924188994286579
Execution terminated after 748732 cycles with exit code 0.
7
103
727
7209485975851
Execution terminated after 912672 cycles with exit code 0.
66449
1920043
1604167
Execution terminated after 38514477 cycles with exit code 0.
322255481
52528036667
Execution terminated after 75568529 cycles with exit code 0.
9929766466606501253
Execution terminated after 738065 cycles with exit code 0.
2,279,635 cycles—7373 bytes (deterministic)
One-by-one:
robert@unity:~/golf-cpu/factor$ ./test.sh
13
23
31
37
41
47
47
59
61
79
Execution terminated after 1088 cycles with exit code 0.
2
2
2
2
3
3
107
164546579
1121707
Execution terminated after 41325 cycles with exit code 0.
3
3
3
7
7
7
13
50759273983933
Execution terminated after 9413 cycles with exit code 0.
7
7
7
13
250250521
2531
4091
Execution terminated after 22282 cycles with exit code 0.
31
89
97
357653
3881
18211
Execution terminated after 47656 cycles with exit code 0.
2
2
2
3
702924188994286579
Execution terminated after 17423 cycles with exit code 0.
7
103
7209485975851
727
Execution terminated after 15973 cycles with exit code 0.
1920043
66449
1604167
Execution terminated after 153683 cycles with exit code 0.
52528036667
322255481
Execution terminated after 1952229 cycles with exit code 0.
9929766466606501253
Execution terminated after 18563 cycles with exit code 0.
robert@unity:~/golf-cpu/factor$
Summary:
robert@unity:~/golf-cpu/factor$ ./test.py factor.golf factors_testset
binary length: 7373 bytes
10/10 passed in 0:00:08.53 s
2,279,635 cycles (227963.50 average)
(267.14 KCPS average)
0 ######################################################################
100000 ########
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000 ########
robert@unity:~/golf-cpu/factor$
I use a combination of trial division and the Brent-Pollard rho algorithm for factorization, and table lookup plus Miller-Rabin for primality testing. I'll add some more explanation tomorrow.
Note that because of the character limit on post length, the second data table is truncated. This gist includes the full table.
primes_53 = \
b'\x03\x05\x07\x0b\x0d\x11\x13\x17\x1d\x1f\x25\x29\x2b\x2f\x35\x3b' \
b'\x3d\x43\x47\x49\x4f\x53\x59\x61\x65\x67\x6b\x6d\x71\x7f\x83\x89' \
b'\x8b\x95\x97\x9d\xa3\xa7\xad\xb3\xb5\xbf\xc1\xc5\xc7\xd3\xdf\xe3' \
b'\xe5\xe9\xef\xf1\xfb'
# for 2**16+1:2:2**17-1, 1 if prime, 0 otherwise
prime_bytes = \
b'\x8b\x24\x60\x82\x10\x41\x81\x12\x40\x08\x26\x0d\x03\x00\x01\x41' \
...
b'\x41\x30\x20\x10\x00\x00\x80\x00\x40\x50\x24\x00\x83\x00\x01\x8a'
main:
call input
# check if input is 1
cmp c, n, 1
sz c, 3
mov o, n
call print
halt 0
# remove factors of two
mov o, 2
div_2_loop:
and c, n, 1
snz c, 3
call print
shr n, n, 1
jmp div_2_loop
# check if n is a power of two
cmp c, n, 1
sz c, 1
halt 0
# trial division
mov d, data(primes_53)
mov i, 53
trial_division_loop:
lbu p, d
trial_division_check:
divu q, r, n, p
snz r, 4
mov o, p
call print
mov n, q
jmp trial_division_check
mulu s, t, p, p
leu c, n, s
sz c, 5
cmp c, n, 1
snz c, 2
mov o, n
call print
halt 0
sub i, i, 1
add d, d, 1
jnz trial_division_loop, i
# mov g, 0 # initialize factor list pointers
# mov h, 0 # all registers are zero at program start
# *g is the next factor to check for primality
# *h is the last factor we've found so far
# the first factor is always *0
sw g, n
prime_test:
# by this point we know that n has no prime factors less than 256,
# so if n is less than 256**2, then it must be prime
lequ c, n, 2**16
jnz is_prime, c
geu c, n, 2**17+1
jnz miller_rabin, c
sub d, n, 2**16+1 # table starts at 2**16+1 (by above argument)
shr d, d, 1 # we know n is odd, so we ignore the last bit
and m, d, 2**3-1 # each byte is 3-bit lookup table
shr d, d, 3
add d, d, data(prime_bytes)
lbu p, d
shr p, p, m
and p, p, 1
jnz is_prime, p
jmp pollard_rho # I should probably do trial division here
# since we only have around 20 more primes to check
################
# Miller-Rabin primality test; bases from https://miller-rabin.appspot.com/
# Note that this subroutine, as well as most of the ones before the 'utility
# functions' section are not proper functions, as they sometimes jump directly
# to the next step instead of returning!
#
# input: n
# output: n/a
################
miller_rabin:
shr d, n, 1
mov b, 1
mr_rd_loop: # n - 1 == d * 2**b, d is odd
and c, d, 1
snz c, 3
shr d, d, 1
inc b
jmp mr_rd_loop
geu c, n, 2**32-1 # test if we can use single-precision arithmetic
jnz miller_rabin_montgomery, c
sub m, n, 1
mr_test_1:
gequ c, n, 0x000000000005361b
jnz mr_test_2, c
mov a, 0x81b33f22efdceaa9
call miller_rabin_test
jmp is_prime
mr_test_2:
gequ c, n, 0x000000003e9de64d
jnz mr_test_3, c
mov a, 0x0000004e69b6552d
call miller_rabin_test
mov a, 0x00223f5bb83fc553
call miller_rabin_test
jmp is_prime
mr_test_3:
mov a, 0x3ab4f88ff0cc7c80
call miller_rabin_test
mov a, 0xcbee4cdf120c10aa
call miller_rabin_test
mov a, 0xe6f1343b0edca8e7
call miller_rabin_test
jmp is_prime
miller_rabin_montgomery:
call montgomery_precompute
sub m, n, r # montgomery representation of n-1
mr_test_mt_3:
gequ c, n, 0x000000518dafbfd1
jnz mr_test_mt_4, c
mov a, 0x3ab4f88ff0cc7c80
call miller_rabin_test_montgomery
mov a, 0xcbee4cdf120c10aa
call miller_rabin_test_montgomery
mov a, 0xe6f1343b0edca8e7
call miller_rabin_test_montgomery
jmp is_prime
mr_test_mt_4:
gequ c, n, 0x0000323ee0e55e6b
jnz mr_test_mt_5, c
mov a, 0x0000000000000002
call miller_rabin_test_montgomery
mov a, 0x0000810c207b08bf
call miller_rabin_test_montgomery
mov a, 0x10a42595b01d3765
call miller_rabin_test_montgomery
mov a, 0x99fd2b545eab5322
call miller_rabin_test_montgomery
jmp is_prime
mr_test_mt_5:
gequ c, n, 0x001c6b470864f683
jnz mr_test_mt_6, c
mov a, 0x0000000000000002
call miller_rabin_test_montgomery
mov a, 0x000003c1c7396f6d
call miller_rabin_test_montgomery
mov a, 0x02142e2e3f22de5c
call miller_rabin_test_montgomery
mov a, 0x0297105b6b7b29dd
call miller_rabin_test_montgomery
mov a, 0x370eb221a5f176dd
call miller_rabin_test_montgomery
jmp is_prime
mr_test_mt_6:
gequ c, n, 0x081f23f390affe89
jnz mr_test_mt_7, c
mov a, 0x0000000000000002
call miller_rabin_test_montgomery
mov a, 0x000070722e8f5cd0
call miller_rabin_test_montgomery
mov a, 0x0020cd6bd5ace2d1
call miller_rabin_test_montgomery
mov a, 0x009bbc940c751630
call miller_rabin_test_montgomery
mov a, 0x0a90404784bfcb4d
call miller_rabin_test_montgomery
mov a, 0x1189b3f265c2b0c7
call miller_rabin_test_montgomery
jmp is_prime
mr_test_mt_7:
mov a, 0x0000000000000002
call miller_rabin_test_montgomery
mov a, 0x0000000000000145
call miller_rabin_test_montgomery
mov a, 0x000000000000249f
call miller_rabin_test_montgomery
mov a, 0x0000000000006e12
call miller_rabin_test_montgomery
mov a, 0x000000000006e0d7
call miller_rabin_test_montgomery
mov a, 0x0000000000953d18
call miller_rabin_test_montgomery
mov a, 0x000000006b0191fe
call miller_rabin_test_montgomery
jmp is_prime
is_prime:
mov o, n
call print
cmp c, g, h
sz c, 1
halt 0
add g, g, 8
lw n, g
jmp prime_test
found_factor:
divu n, b, n, a
add h, h, 8
sw h, a
jmp prime_test
################
# Miller-Rabin primality test subroutine; needs m == n - 1
#
# input: a, n, m
# output: n/a
################
miller_rabin_test:
divu q, a, a, n # a = a mod n
snz a, 1
ret # a certifies primality if a == 0 mod n
call power # a = a**d mod n
cmp c, a, 1 # r == 1
sz c, 1
ret # a certifies primality if a**d == 1 mod n
cmp c, a, m # m == n-1
sz c, 1
ret # a certifies primality if a**d == -1 mod n
mov d, 2
mr_square_loop:
dec b
jz pollard_rho, b
mulu x, y, a, a # a = a**2 mod n
divu q, a, x, n # q is unused
cmp c, a, 1 # r == 1
jnz pollard_rho, c # a witnesses compositeness if a**(d*2**i) == 1 mod n
cmp c, a, m # m == n-1
sz c, 1
ret # a certifies primality if a**(d*2**i) == -1 mod n
jmp mr_square_loop
################
# Miller-Rabin primality test subroutine using Montgomery arithmetic; needs
# the values from the precomputation step as well as m == n - 1
#
# input: a, n, m, (u, v, r, s)
# output: n/a
################
miller_rabin_test_montgomery:
divu q, a, a, n # a = a mod n
snz a, 1
ret # a certifies primality if a == 0 mod n
mulu x, y, a, s
call montgomery_reduce # convert a to montgomery form
mov a, x
call montgomery_power # a = a**d mod n
cmp c, a, r # r == 1 (mongtomery form)
sz c, 1
ret # a certifies primality if a**d == 1 mod n
cmp c, a, m # m == n-1 (montgomery form)
sz c, 1
ret # a certifies primality if a**d == -1 mod n
mr_mt_square_loop:
dec b
jz pollard_rho_montgomery, b
mulu x, y, a, a # a = a**2 mod n
call montgomery_reduce
mov a, x
cmp c, a, r # r == 1 (mongtomery form)
jnz pollard_rho_montgomery, c # a witnesses compositeness if a**(d*2**i) == 1 mod n
cmp c, a, m # m == n-1 (montgomery form)
sz c, 1
ret # a certifies primality if a**(d*2**i) == -1 mod n
jmp mr_mt_square_loop
################
# Pollard-Brent rho factorization algorithm
#
# input: n
# output: n/a
################
pollard_rho:
rho_skip_count = 64
mov o, 42
rho_retry:
mov b, o
mov j, 1
mov f, 1
mov d, 1
rho_outer_loop:
mov a, b
mov i, j
rho_fast_loop:
mulu b, y, b, b
add b, b, 1
divu y, b, b, n
#mov o, b
#call print
sub i, i, 1
jnz rho_fast_loop, i
mov k, 0
rho_slow_loop:
mov e, b
sub i, j, k
leu c, rho_skip_count, i
sz c, 1
mov i, rho_skip_count
rho_inner_loop:
mulu b, y, b, b
add b, b, 1
divu y, b, b, n
leu c, a, b
sz c, 1
sub x, b, a
snz c, 1
sub x, a, b
mulu f, y, x, f
divu y, f, f, n
#add o, f, 1000000000
#call print
sub i, i, 1
jnz rho_inner_loop, i
mov x, f
call gcd
mov d, x
#mov o, d
#call print
add k, k, rho_skip_count
gequ c, k, j
snz c, 2
lequ c, d, 1
jnz rho_slow_loop, c
shl j, j, 1
lequ c, d, 1
jnz rho_outer_loop, c
cmp c, d, n
sz c, 12
rho_final_loop:
mulu e, y, e, e
add e, e, 1
divu y, e, e, n
#add o, e, 2000000000
#call print
leu c, a, e
sz c, 1
sub x, e, a
snz c, 1
sub x, a, e
call gcd
mov d, x
#mov o, d
#call print
lequ c, d, 1
jnz rho_final_loop, c
cmp c, d, n
sz c, 2
add o, o, 1
jmp rho_retry
mov a, d
jmp found_factor
################
# Pollard-Brent rho factorization algorithm using Montgomery arithmetic.
# Requires m = n - r
#
# input: n, (u, v, r, s, m)
# output: n/a
################
pollard_rho_montgomery:
rho_mt_skip_count = 64
mov o, 42
rho_mt_retry:
mulu x, y, o, s
call montgomery_reduce
mov b, x
mov j, 1
mov f, r
mov d, 1
rho_mt_outer_loop:
mov a, b
mov i, j
rho_mt_fast_loop:
mulu x, y, b, b
call montgomery_reduce
mov b, x
geu c, b, m
sz c, 1
sub b, b, m
snz c, 1
add b, b, r
#mov o, b
#call print
sub i, i, 1
jnz rho_mt_fast_loop, i
mov k, 0
rho_mt_slow_loop:
mov e, b
sub i, j, k
leu c, rho_mt_skip_count, i
sz c, 1
mov i, rho_mt_skip_count
rho_mt_inner_loop:
mulu x, y, b, b
call montgomery_reduce
mov b, x
geu c, b, m
sz c, 1
sub b, b, m
snz c, 1
add b, b, r
leu c, a, b
sz c, 1
sub x, b, a
snz c, 1
sub x, a, b
mulu x, y, x, f
call montgomery_reduce
mov f, x
sub i, i, 1
jnz rho_mt_inner_loop, i
mov x, f
#mov o, x
#call print
call gcd
mov d, x
#mov o, d
#call print
add k, k, rho_mt_skip_count
gequ c, k, j
snz c, 2
lequ c, d, 1
jnz rho_mt_slow_loop, c
shl j, j, 1
lequ c, d, 1
jnz rho_mt_outer_loop, c
cmp c, d, n
sz c, 15
rho_mt_final_loop:
mulu x, y, e, e
call montgomery_reduce
mov e, x
geu c, e, m
sz c, 1
sub e, e, m
snz c, 1
add e, e, r
leu c, a, e
sz c, 1
sub x, e, a
snz c, 1
sub x, a, e
call gcd
mov d, x
lequ c, d, 1
jnz rho_mt_final_loop, c
cmp c, d, n
sz c, 2
add o, o, 1
jmp rho_mt_retry
mov a, d
jmp found_factor
################ UTILITY FUNCTIONS ################
################
# Loads a decimal number terminated by a newline from stdin to register n.
#
# inputs: none
# outputs: n
################
input:
lw c, -1
input_add_digit:
sub c, c, ord('0')
mov o, c
mulu n, h, n, 10
add n, n, c
lw c, -1
cmp q, c, ord('\n')
jz input_add_digit, q
ret n
################
# Prints the value of register o to stdout in decimal, followed by a newline.
#
# inputs: o
# outputs: none
################
print:
call print_char
sw -1, ord('\n')
ret
print_char:
divu o, x, o, 10
sz o, 1
call print_char
add x, x, ord('0')
sw -1, x
ret
################
# Precomputes values used for Montgomery multiplication:
# * u = 1/2**64 mod n
# * v = -1/n mod 2**64
# * r = 2**64 mod n (montgomery representation of 1)
# * s = 2**128 mod n (montgomery representation of 2**64)
#
# inputs: n
# outputs: u, v, s, t
################
montgomery_precompute:
# first compute the inverses; based on xbinGCD from
# http://www.hackersdelight.org/MontgomeryMultiplication.pdf
# This step takes something like an average of 7 cycles per bit in the
# 'fast' mode, and 8 cycles per bit in 'safe' mode, for a total of around
# 500 cycles: but we only have to do it once per modulus
mov u, 1
mov v, 0
mov i, 64
# HD defensively computes (u+beta)/2 to avoid overflow, taking two extra
# instructions; however, since u < beta overflow is only possible when
# beta >= 2**63, in most cases we can use a faster version and save
# 2 * 64 = 128 cycles, at the expense of a couple cycles comparison
gequ c, n, 2**63
jz mt_pre_gcd_loop, c
mt_pre_gcd_loop_safe:
dec i
and c, u, 1
snz c, 3
shr u, u, 1
shr v, v, 1
sz 0, 6
and c, u, n
xor u, u, n
shr u, u, 1
add u, u, c
shr v, v, 1
add v, v, 2**63
jnz mt_pre_gcd_loop_safe, i
jmp mt_pre_gcd_end
mt_pre_gcd_loop:
dec i
and c, u, 1
snz c, 3
shr u, u, 1
shr v, v, 1
sz 0, 4
add u, u, n
shr u, u, 1
shr v, v, 1
add v, v, 2**63
jnz mt_pre_gcd_loop, i
mt_pre_gcd_end:
# compute 2**64 mod n; since 2**64 is too large to fit in a register, we
# first calculate r = 2**63 mod n, if it's greater than n/2, r = 2r - n,
# otherwise just r = 2r
divu q, r, 2**63, n # q is unused
shr c, n, 1
geu c, r, c
shl r, r, 1
sz c, 1
sub r, r, n
# It would be entirely possible to compute 2**128 mod n by starting with
# r and using repeated doubling (64 times); this would take something
# like 4*64 = 256 cycles. Instead I'll use a doubleword division
# algorithm from Hacker's Delight (ch. 9-4, fig. 9-3), which takes
# fewer than 100 cycles.
mov l, 0
# normalize the divisor (n)
and c, n, 2**64 - 2**32
snz c, 2
shl n, n, 32
add l, l, 32
and c, n, 2**64 - 2**48
snz c, 2
shl n, n, 16
add l, l, 16
and c, n, 2**64 - 2**56
snz c, 2
shl n, n, 8
add l, l, 8
and c, n, 2**64 - 2**60
snz c, 2
shl n, n, 4
add l, l, 4
and c, n, 2**64 - 2**62
snz c, 2
shl n, n, 2
add l, l, 2
and c, n, 2**64 - 2**63
snz c, 2
shl n, n, 1
add l, l, 1
# normalize the numerator
shl a, r, l
# split the divisor into halfwords
shr b, n, 32
and m, n, 2**32 - 1
# compute first halfword of remainder
divu q, p, a, b
mt_pre_div_adjust1: # loop runs at most two times, 035x on average
gequ c, q, 2**32
snz c, 4
mulu d, f, q, m # f is unused
shl c, p, 32
geu c, d, c
sz c, 4
sub q, q, 1
add p, p, b
leu c, p, 2**32
jnz mt_pre_div_adjust1, c
shl a, a, 32
mulu d, f, q, n
sub a, a, d
# compute second halfword of remainder
divu q, p, a, b
mt_pre_div_adjust2:
snz c, 4
mulu d, f, q, m
shl c, p, 32
geu c, d, c
sz c, 4
sub q, q, 1
add p, p, b
leu c, p, 2**32
jnz mt_pre_div_adjust2, c
shl a, a, 32
mulu d, f, q, n
sub a, a, d
# de-normalize remainder
shr s, a, l
ret u, v, r, s
################
# Performs the Montgomery reduction step, x = [yx]/2**64 mod n
# Requres the u and v calculated in mongomery_precompute.
#
# inputs: x, y, n, (u, v)
# outputs: x
################
montgomery_reduce:
mulu m, u, x, v # m = ([yx] mod 2**64) / (-n) mod 2**64
mulu z, w, m, n # [wz] = mN
add x, x, z # [yx] += [wz]
leu c, x, z # this is where I really miss an ADC instruction
sz c, 1
inc w
add x, y, w # x = [yx] / 2**64 = y (combined with y+w from above)
leu c, x, y # x -= n if overflow
sz c, 1
sub x, x, n
gequ c, x, n # if x >= n
sz c, 1
sub x, x, n # x -= n
ret x
################
# Computes a = a**d mod n by repeated squaring.
#
# inputs: a, d, n
# outputs: a
################
power:
mov b, a
mov a, 1
mt_pow_loop:
and c, d, 1
sz c, 2
mulu x, y, a, b # a = a*b mod n
divu q, a, x, n # q is unused
mulu x, y, b, b # b = b*b mod n
divu q, b, x, n
shr d, d, 1
jnz mt_pow_loop, d
ret a
################
# Uses Montgomery multiplication to compute a = a**d mod n. Treats d as an
# ordinary unsigned integer; treats d as an integer in Montgomery form.
# Requres the u, v, and r calculated in mongomery_precompute.
#
# inputs: a, d, n, (u, v, r)
# outputs: a
################
montgomery_power:
mov b, a
mov a, r
pow_loop:
and c, d, 1
sz c, 3
mulu x, y, a, b
call montgomery_reduce
mov a, x
mulu x, y, b, b
call montgomery_reduce
mov b, x
shr d, d, 1
jnz pow_loop, d
ret a
################
# Computes x = GCD(x,n) using the binary GCD algorithm. Assumes that x < n
# and that n is odd. Note that we don't need a separate Montgomery GCD,
# since, surprisingly, GCD(x*(2**64) mod n, n) == GCD(x, n).
#
# inputs: x, n
# outputs: x
################
gcd:
snz x, 2 # GCD(0, n) == n
mov x, n
ret x
gcd_loop:
and c, x, 1 # n is odd, so the GCD has no factors of two
snz c, 3
gcd_div_2_loop:
shr x, x, 1
and c, x, 1
jz gcd_div_2_loop, c
geu c, n, x
sz c, 5
sub x, n, x
sub n, n, x
jnz gcd_loop, x
mov x, n
ret x
sub x, x, n
jnz gcd_loop, x
mov x, n
ret x