-
-
Save jzakiya/6c7e1868bd749a6b1add62e3e3b2341e to your computer and use it in GitHub Desktop.
#[ | |
This Nim source file is a multiple threaded implementation to perform an | |
extremely fast Segmented Sieve of Zakiya (SSoZ) to find Twin Primes <= N. | |
Inputs are single values N, or ranges N1 and N2, of 64-bits, 0 -- 2^64 - 1. | |
Output is the number of twin primes <= N, or in range N1 to N2; the last | |
twin prime value for the range; and the total time of execution. | |
Code originally developed on a System76 laptop with an Intel I7 6700HQ cpu, | |
2.6-3.5 GHz clock, with 8 threads, and 16GB of memory. Parameter tuning | |
would be needed to optimize for other hardware systems (ARM, PowerPC, etc). | |
For Nim >= 2.0.0 compile as: | |
$ nim c --cc:gcc --mm:arc --d:danger --d:release --threads:on --d:useMalloc twinprimes_ssozgist_new.nim | |
Then run executable: $ ./twinprimes_ssozgist1 <cr>, and enter range values. | |
Or alternatively: $ echo <range1> <range2> | ./twinprimes_ssoz | |
Range values can be provided in either order (larger or smaller first). | |
This source file, and updates, will be available here: | |
https://gist.github.com/jzakiya/6c7e1868bd749a6b1add62e3e3b2341e | |
Mathematical and technical basis for implementation are explained here: | |
https://www.academia.edu/37952623/The_Use_of_Prime_Generators_to_Implement_Fast_ | |
Twin_Primes_Sieve_of_Zakiya_SoZ_Applications_to_Number_Theory_and_Implications_ | |
for_the_Riemann_Hypotheses | |
https://www.academia.edu/7583194/The_Segmented_Sieve_of_Zakiya_SSoZ_ | |
https://www.academia.edu/19786419/PRIMES-UTILS_HANDBOOK | |
https://www.academia.edu/81206391/Twin_Primes_Segmented_Sieve_of_Zakiya_SSoZ_Explained | |
This code is provided free and subject to copyright and terms of the | |
GNU General Public License Version 3, GPLv3, or greater. | |
License copy/terms are here: http://www.gnu.org/licenses/ | |
Copyright (c) 2017-2024 Jabari Zakiya -- jzakiya at gmail dot com | |
Version Date: 2024/08/12 | |
]# | |
import math # for sqrt, gcd, mod functions | |
import strutils, typetraits # for number input | |
import times, os # for timing code execution | |
import osproc # for getting threads count | |
import threadpool # for parallel processing | |
import algorithm # for sort | |
import bitops # for countSetBits | |
{.experimental: "parallel".} # required to use 'parallel' (>= 1.4.x) | |
proc modinv(a0, b0: int): int = | |
## Compute modular inverse a^-1 of a to base b, e.g. a*(a^-1) mod b = 1. | |
var (a, b, x0) = (a0, b0, 0) | |
result = 1 | |
if b == 1: return | |
while a > 1: | |
result -= (a div b) * x0 | |
a = a mod b | |
swap a, b | |
swap x0, result | |
if result < 0: result += b0 | |
proc genPGparameters(prime: int): (int, int, int, seq[int], seq[int]) = | |
## Create prime generator parameters for given Pn | |
echo("using Prime Generator parameters for P", prime) | |
let primes = [2, 3, 5, 7, 11, 13, 17, 19, 23] | |
var (modpg, res_0) = (1, 0) # compute Pn's modulus and res_0 value | |
for prm in primes: (res_0 = prm; if prm > prime: break; modpg *= prm) | |
var restwins: seq[int] = @[] # save upper twin pair residues here | |
var inverses = newSeq[int](modpg+2) # save Pn's residues inverses here | |
var (rc, inc, res) = (5, 2, 0) # use P3's PGS to generate residue candidates | |
while rc < (modpg shr 1): # find PG's 1st half residues | |
if gcd(modpg, rc) == 1: # if rc is a residue | |
let mc = modpg - rc # create its modular complement | |
inverses[rc] = modinv(rc,modpg) # save rc amd mc inverses | |
inverses[mc] = modinv(mc,modpg) # if in twinpair save both hi residues | |
if res + 2 == rc: restwins.add(rc); restwins.add(mc + 2) | |
res = rc # save current found residue | |
rc += inc; inc = inc xor 0b110 # create next P3 sequence rc: 5 7 11 13 17 19 ... | |
restwins.sort(system.cmp[int]); restwins.add(modpg + 1) # last residue is last hi_tp | |
inverses[modpg + 1] = 1; inverses[modpg - 1] = modpg - 1 # last 2 resdiues are self inverses | |
(modpg, res_0, restwins.len, restwins, inverses) | |
# Global parameters | |
var | |
cnts: seq[uint] # holds twinprimes count for each twinpair | |
lastwins: seq[uint] # holds largest twinprime for each twinpair | |
proc setSieveParameters(start_num, end_num: uint): (uint, int, uint, uint, uint, uint, int, seq[int], seq[int]) = | |
## Select at runtime best PG and segment size factor to use for input value. | |
## These are good estimates derived from PG data profiling. Can be improved. | |
let range = end_num - start_num | |
var (Bn, pg) = (0, 3) | |
if end_num < 49'u: | |
Bn = 1; pg = 3 | |
elif range < 77_000_000: | |
Bn = 16; pg = 5 | |
elif range < 1_100_000_000'u: | |
Bn = 32; pg = 7 | |
elif range < 35_500_000_000'u: | |
Bn = 64; pg = 11 | |
elif range < 14_000_000_000_000'u: | |
pg = 13 | |
if range > 7_000_000_000_000'u: Bn = 384 | |
elif range > 2_500_000_000_000'u: Bn = 320 | |
elif range > 250_000_000_000'u: Bn = 196 | |
else: Bn = 128 | |
else: | |
Bn = 384; pg = 17 | |
let (modpg, res_0, pairscnt, restwins, resinvrs) = genPGparameters(pg) | |
let Kmin = (start_num - 2) div modpg.uint + 1 # number of resgroups to start_num | |
let Kmax = (end_num - 2) div modpg.uint + 1 # number of resgroups to end_num | |
let krange = Kmax - Kmin + 1 # number of range resgroups, at least 1 | |
let n = if krange < 37_500_000_000_000'u: 10 elif krange < 975_000_000_000_000'u: 16 else: 20 | |
let B = uint(Bn * 1024 * n) # set seg size to optimize for selected PG | |
let Ks = if krange < B: krange else: B # segments resgroups size | |
echo("segment size = ",Ks," resgroups; seg array is [1 x ",((Ks-1) shr 6) + 1,"] 64-bits") | |
let maxpairs = krange.int * pairscnt; # maximum number of twinprime pcs | |
echo("twinprime candidates = ", maxpairs, "; resgroups = ", krange); | |
(modpg.uint, res_0, Ks, Kmin, Kmax, krange, pairscnt, restwins, resinvrs) | |
proc sozp5(val: int | int64, res_0: int, start_num, end_num: uint): seq[int] = | |
## Return the primes r0..sqrt(end_num) within range (start_num...end_num) | |
let (md, rescnt) = (30, 8) # P5's modulus and residues count | |
let res = [7,11,13,17,19,23,29,31] # P5's residues list | |
let bitn = [0,0,0,0,0,1,0,0,0,2,0,4,0,0,0,8,0,16,0,0,0,32,0,0,0,0,0,64,0,128] | |
let range_size = end_num - start_num # integers size for inputs range | |
let kmax = (val - 2) div md + 1 # number of resgroups to input value | |
var prms = newSeq[uint8](kmax) # byte array of prime candidates init '0' | |
let sqrtn = int(sqrt float64(val)) # compute integer sqrt of val | |
var k = sqrtn div md # compute its resgroup value | |
var (resk, r) = (sqrtn - md*k, 0) # compute its residue value; set residue start posn | |
while resk >= res[r]: r += 1 # find largest residue <= sqrtn posn in its resgroup | |
let pcs_to_sqrtn = k*rescnt + r # number of pcs <= sqrtn | |
for i in 0..pcs_to_sqrtn: # for r0..sqrtN primes mark their multiples | |
let (k, r) = (i div rescnt, i mod rescnt) # update resgroup parameters | |
if (prms[k] and uint8(1 shl r)) != 0: continue # skip pc if not prime | |
let prm_r = res[r] # if prime save its residue value | |
let prime = md*k + prm_r # numerate its value | |
let rem = start_num mod uint(prime) # prime's modular distance to start_num | |
if not(uint(prime) - rem <= range_size or rem == 0): continue # skip prime if no multiple in range | |
for ri in res: # mark prime's multiples in prms | |
let prod = prm_r * ri - 2 # compute cross-product for prm_r|ri pair | |
let bit_r = uint8(bitn[prod mod md]) # bit mask value for prod's residue | |
var kpm = k * (prime + ri) + prod div md # resgroup for prime's 1st multiple | |
while kpm < kmax: prms[kpm] = prms[kpm] or bit_r; kpm += prime | |
# prms now contains the prime multiples positions marked for the pcs r0..N | |
# now along each restrack, identify|numerate the primes in each resgroup | |
# return only the primes with a multiple within range (start_num...end_num) | |
result = @[] # create empty dynamic array for primes | |
for r, res_r in res: # along each restrack|row til end | |
for k, resgroup in prms: # for each resgroup along restrack | |
if (resgroup and uint8(1 shl r)) == 0: # if bit location a prime | |
let prime = md * k + res_r # numerate its value, store if in range | |
# check if prime has multiple in range, if so keep it, if not don't | |
let rem = start_num mod prime.uint | |
if (prime in res_0..val) and (prime.uint - rem <= range_size or rem == 0): result.add(prime) | |
proc nextp_init(hi_r: int, kmin: uint, modpg: int, primes: seq[int], resinvrs: seq[int]): seq[uint64] = | |
## Initialize 'nextp' array for twinpair upper residue rhi in 'restwins'. | |
## Compute 1st prime multiple resgroups for each prime r0..sqrt(N) and | |
## store consecutively as lo_tp|hi_tp pairs for their restracks. | |
var nextp = newSeq[uint64](primes.len * 2) # 1st mults array for twinpair | |
let (r_hi, r_lo) = (hi_r, hi_r - 2) # upper|lower twinpair residues vals | |
for j, prime in primes: # for each prime r0..sqrt(N) | |
let k = (prime - 2) div modpg # find the resgroup it's in | |
let r = (prime - 2) mod modpg + 2 # and its residue value | |
let r_inv = resinvrs[r] # and its residue inverse | |
var ri = (r_lo * r_inv - 2) mod modpg + 2 # compute r's ri for r_lo | |
var ki = uint(k * (prime + ri) + (r * ri - 2) div modpg) # and 1st mult | |
if ki < kmin: # if 1st mult index < start_num's | |
ki = (kmin - ki) mod prime.uint # how many indices short is it | |
if ki > 0'u: ki = prime.uint - ki # adjust index value into range | |
else: ki = ki - kmin # else here, adjust index if it was > | |
nextp[j * 2] = ki # lo_tp index val >= start of range | |
ri = (r_hi * r_inv - 2) mod modpg + 2 # compute r's ri for r_hi | |
ki = uint(k * (prime + ri) + (r * ri - 2) div modpg) # and 1st mult | |
if ki < kmin: # if 1st mult index < start_num's | |
ki = (kmin - ki) mod prime.uint # how many indices short is it | |
if ki > 0'u: ki = prime.uint - ki # adjust index value into range | |
else: ki = ki - kmin # else here, adjust index if it was > | |
nextp[j * 2 or 1] = ki # hi_tp index val >= start of range | |
result = nextp | |
proc twins_sieve(r_hi, kmin, kmax, Ks, start_num, end_num, modpg: uint, primes: seq[int], resinvrs: seq[int], indx: int) = | |
## Perform in thread the ssoz for given twinpair residues for Kmax resgroups. | |
## First create|init 'nextp' array of 1st prime mults for given twinpair, | |
## stored consequtively in 'nextp', and init seg array for Ks resgroups. | |
## For sieve, mark resgroup bits to '1' if either twinpair restrack is nonprime | |
## for primes mults resgroups, and update 'nextp' restrack slices acccordingly. | |
## Find last twin prime|sum for range, store at array[indx] for this twinpair. | |
## Can optionally compile to print mid twinprime values generated by twinpair. | |
{.gcsafe.}: | |
const S = 6 # shift value for 64 bits | |
const BMASK = (1 shl S) - 1 # bitmask val for 64 bits | |
var (sum, Ki, Kn) = (0'u, kmin-1, Ks.int) # init these parameters | |
var (hi_tp, Kmax) = (0'u, kmax) # max twinprime|resgroup val | |
var seg = newSeq[uint](((Ks-1) shr S) + 1) # seg array for Ks resgroups | |
if r_hi - 2 < (start_num - 2) mod modpg + 2: Ki.inc # ensure lo tps in range | |
if r_hi > (end_num - 2) mod modpg + 2: Kmax.dec # ensure hi tps in range | |
var nextp = nextp_init(r_hi.int, Ki, modpg.int, primes, resinvrs) # init nextp array | |
while Ki < Kmax: # for Ks resgroup size slices upto Kmax | |
if Ks > (Kmax - Ki): Kn = int(Kmax - Ki) # adjust Kn size for last seg | |
for j, prime in primes: # for each prime r0..sqrt(N) | |
# for lower twinpair residue track | |
var k1 = nextp[j * 2].int # starting from this resgroup in seg | |
while k1 < Kn: # mark primenth resgroup bits prime mults | |
seg[k1 shr S] = seg[k1 shr S] or (1'u shl (k1 and BMASK)).uint | |
k1 += prime # set next prime multiple resgroup | |
nextp[j * 2] = uint(k1 - Kn) # save 1st resgroup in next eligible seg | |
# for upper twinpair residue track | |
var k2 = nextp[j * 2 or 1].int # starting from this resgroup in seg | |
while k2 < Kn: # mark primenth resgroup bits prime mults | |
seg[k2 shr S] = seg[k2 shr S] or (1'u shl (k2 and BMASK)).uint | |
k2 += prime # set next prime multiple resgroup | |
nextp[j * 2 or 1] = uint(k2 - Kn) # save 1st resgroup in next eligible seg | |
# set as nonprime unused bits in last seg[n] | |
# so fast, do for every seg[i] | |
seg[(Kn-1) shr S] = seg[(Kn-1) shr S] or (not 1'u shl ((Kn-1) and BMASK)).uint | |
var cnt = 0 # count the twinprimes in the segment | |
for m in seg[0..(Kn-1) shr S]: cnt += (1 shl S) - countSetBits(m) | |
if cnt > 0: # if segment has twin primes | |
sum += cnt.uint # add segment count to total range count | |
var upk = Kn - 1 # from end of seg count back to largest tp | |
while (seg[upk shr S] and (1'u shl (upk and BMASK)).uint) != 0: upk.dec | |
hi_tp = Ki + upk.uint # numerate its full resgroup value | |
Ki += Ks # set 1st resgroup val of next seg slice | |
if Ki < Kmax: (for b in 0..(Kn - 1) shr S: seg[b] = 0) # set seg to all primes | |
# when sieve done for full range | |
# numerate largest twinprime in segs | |
hi_tp = if r_hi > end_num: 0'u else: hi_tp * modpg.uint + r_hi | |
lastwins[indx] = if sum == 0: 1'u else: hi_tp # store final seg tp value | |
cnts[indx] = sum # sum for twin pair | |
proc twinprimes_ssoz() = | |
## Main routine to setup, time, and display results for twin primes sieve. | |
let x = stdin.readline.split # Inputs are 1 or 2 range values < 2**64 | |
var end_num = max(x[0].parseUInt, 3'u) | |
var start_num = if x.len > 1: max(x[1].parseUInt, 3'u) else: 3'u | |
if start_num > end_num: swap start_num, end_num | |
start_num = start_num or 1 # if start_num even add 1 | |
end_num = (end_num - 1) or 1 # if end_num even subtract 1 | |
if end_num - start_num < 2: (start_num, end_num) = (7'u, 7'u) | |
echo("threads = ", countProcessors()) | |
let ts = epochTime() # start timing sieve setup execution | |
# select Pn, set sieving params for inputs | |
let (modpg, res_0, Ks, Kmin, Kmax, Krange, | |
pairscnt, restwins, resinvrs) = setSieveParameters(start_num, end_num) | |
# create sieve primes <= sqrt(end_num), only use those whose multiples within inputs range | |
let primes: seq[int] = if end_num < 49: @[5] | |
else: sozp5(int(sqrt float64(end_num)), res_0, start_num, end_num) | |
echo("each of ", pairscnt, " threads has nextp[", 2, " x ", primes.len, "] array") | |
var twinscnt = 0'u64 # init twinprimes range count | |
let lo_range = uint(restwins[0] - 3) # lo_range = lo_tp - 1 | |
for tp in [3, 5, 11, 17]: # excluded low tp values for PGs used | |
if end_num == 3'u: break # if 3 end of range, no twin primes | |
if tp.uint in start_num..lo_range: twinscnt += 1 # cnt small tps if any | |
let te = epochTime() - ts # sieve setup time | |
echo("setup time = ", te.formatFloat(ffDecimal, 6), " secs") | |
echo("perform twinprimes ssoz sieve") | |
let t1 = epochTime() # start timing ssoz sieve execution | |
cnts = newSeq[uint](pairscnt) # array to hold count of tps for each thread | |
lastwins = newSeq[uint](pairscnt) # array to hold largest tp for each thread | |
#parallel: # perform in parallel | |
for indx, r_hi in restwins: # for each twin pair row index | |
spawn twins_sieve(r_hi.uint, Kmin, Kmax, Ks, start_num, end_num, modpg, primes, resinvrs, indx) | |
stdout.write("\r", (indx + 1), " of ", pairscnt, " twinpairs done") | |
sync() # when all the threads finish | |
var last_twin = 0'u # find largest twin prime in range | |
for i in 0 ..< pairscnt: # and determine total twin primes count | |
twinscnt += cnts[i] | |
if last_twin < lastwins[i]: last_twin = lastwins[i] | |
if end_num == 5'u and twinscnt == 1: last_twin = 5'u | |
var Kn = Krange mod Ks # set number of resgroups in last slice | |
if Kn == 0: Kn = Ks # if multiple of seg size set to seg size | |
let t2 = epochTime() - t1 # sieve execution time | |
echo("\nsieve time = ", t2.formatFloat(ffDecimal, 3), " secs") | |
echo("total time = ", (t2 + te).formatFloat(ffDecimal, 3), " secs") | |
echo("last segment = ", Kn, " resgroups; segment slices = ", (Krange - 1) div Ks + 1) | |
echo("total twins = ", twinscnt, "; last twin = ", last_twin - 1, "+/-1") | |
twinprimes_ssoz() |
Thank you. I resolved the little differences from nim to java.
Can you provide a link to your Java version?
I'd like to see what that looks like, especially compared to Nim.
Have you benchmarked both versions on your system to do an apples-to apples comparison?
For the moment it's very limited due in part of some nim particularity
Took 1s for 1E9 without multithread
OK. Take your time.
Thanks for showing interest in the code.
There's one typo line 233, it's KB, not Kb, and maybe another one line 227, when you allocate Kmax to Kmax
Yeh, thanks for pointing that out.
However for Nim, it's case insensitive, so KB
and Kb
are treated the same.
That got past me from an earlier version where I had used different names.
I'll fix that to eliminate confusion from people coming from other languages.
Line 227 is not an error when I do: var (hi_tp, Kmax) = (0'u, Kmax)
I'm converting the global variable Kmax
to a thread local variable of the same name.
On line 231 I do: if (Kmax - 1) * modpg.uint + r_hi > end_num: Kmax.dec
I don't want to change the global value for Kmax
but only the thread local value if necessary.
The Nim compiler sees these as two separate parameters. Nim let's you overload names/functions like this.
I could have used a different name, but if you know Nim you know the variable assignment creates separate parameters.
I wrote it to keep it conceptually simple to read/understand; it's Kmax
for that thread.
It's ok for me I've already understood this 'facility' of caseInsensitive, and local/global difference.
This can be just confusing, and want to tell you.
My last problem is the ability to debug each step, to determine when rewriting in another language produce a bad result.
ie mixing uint8/32/64 can be a good source of errors. (As I have for now between 1E9 and 5E9)
line 252, to follow your heavy optimizations, 'mod 8'
can be replaced by '& 7'
? (a mod 2^n -> a & (n-1)
)
Another one to check please : For 5E9
> Task :SSOZ.main()
threads = 8
generating parameters for P 11
each thread segment is [1 x 65536] bytes array
twinprime candidates = 292207905; resgroups = 2164503
each 135 threads has nextp[2 x 6999] array
restwins [19, 31, 43, 61, 73, 103, 109, 139, 151, 169, 181, 193, 199, 223, 229, 241, 271, 283, 313, 349, 361, 379, 391, 403, 421, 433, 439, 463, 481, 493, 523, 529, 559, 571, 589, 601, 613, 619, 631, 643, 661, 691, 703, 733, 769, 799, 811, 823, 829, 841, 853, 859, 883, 901, 943, 949, 991, 1009, 1021, 1033, 1039, 1051, 1063, 1081, 1093, 1123, 1153, 1159, 1189, 1219, 1231, 1249, 1261, 1273, 1279, 1291, 1303, 1321, 1363, 1369, 1411, 1429, 1453, 1459, 1471, 1483, 1489, 1501, 1513, 1543, 1579, 1609, 1621, 1651, 1669, 1681, 1693, 1699, 1711, 1723, 1741, 1753, 1783, 1789, 1819, 1831, 1849, 1873, 1879, 1891, 1909, 1921, 1933, 1951, 1963, 1999, 2029, 2041, 2071, 2083, 2089, 2113, 2119, 2131, 2143, 2161, 2173, 2203, 2209, 2239, 2251, 2269, 2281, 2293, 2311]
setup time = 70 msecs
perform twinprimes ssoz sieve
135 of 135 threads done
sieve time = 2814 msecs
last segment = 1815 resgroups; segment slices = 331
total twins = 14618169; last twin = 705034274+/-1
total time = 2884 msecs
➜ nim echo 5000000000 | ./twinprimes_ssozdual
threads = 8
each thread segment is [1 x 65536] bytes array
twinprime candidates = 292207905; resgroups = 2164503
each 135 threads has nextp[2 x 6999] array
setup time = 0.000368 secs
perform twinprimes ssoz sieve
135 of 135 threads done
sieve time = 0.227 secs
last segment = 1815 resgroups; segment slices = 34
total twins = 14618166; last twin = 4999999860+/-1
total time = 0.228 secs
There's something wrong with the number of slices you do, which messes up the twins count.
What are the specs on your hardware?
Also, if you print out the twin pair residues it eats up time and display space, unless that's just for temporary diagnostics.
The number of threads used is the number of twin pair residues.
If you post your code in a gist, et al, so I can look at it I may be able to help you debug it.
I also encourage you to install Nim on your system (1.0.4 just released) and compile my code and use as your reference check.
I checked each proc
independently to make sure that for given inputs they produced expected outputs.
So make sure sozpg
, nextp_init
, and then especially twins_sieve
produce correct results and that's 95% of it.
It sounds like you're not doing multi-threading yet, which should make design verification easier.
Just take your time until you figure it out.
Lines 259 runs when s = 3, for "large" numbers (8 bits per byte used), and line 260 for "small" numbers (1 bit per byte used).
At the end of sieving each segment, if the segment has twin primes, you start from its end and count backward until you find the largest twin prime resgroup (k value) for that segment, and store it (its the largest twin prime in the segment). This should be very fast.
I suspect either your logic is wrong, or you are not starting from the end of the segment memory, or you are somehow skipping over the last twin pair occurrence and going through the whole segment, which would explain the amount of time you see for those lines.
Be sure your code is doing the same math as mine (for 8 bit bytes). You may need to mask off the low 8 bits to be sure the count is correct.
I could help you better if you shared your code for doing this.
This doesnt work in Nim:
nim-1.0.4>twinprimes_ssoz 500000000000 600000000000
strutils.nim(1118) parseUInt
Error: unhandled exception: invalid unsigned integer: [ValueError]
This is working in Java:
Please enter an range of integer (comma or space separated):
5e11 6e11
Max threads = 8
generating parameters for P 13
each thread segment is [1 x 131072] bytes array
twinprime candidates = 4945055940; resgroups = 3330004
each 1485 threads has nextp[2 x 62068] array
setup time = 0.119 secs
perform twinprimes ssoz sieve with s=0
1485 of 1485 threads done
sieve time = 23.375 secs
last segment = 53204 resgroups; segment slices = 26
total twins = 328566078; last twin = 599999999772+/-1
total time = 23.494 secs
These are my results for your value.