Last active
August 23, 2024 19:54
-
-
Save jzakiya/ae93bfa03dbc8b25ccc7f97ff8ad0f61 to your computer and use it in GitHub Desktop.
Twinprimes generator, multi-threaded, using SSoZ (Segmented Sieve of Zakiya), written in D
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* This D source file is a multiple threaded implementation to perform an | |
* extremely fast Segmented Sieve of Zakiya (SSoZ) to find Twin Primes <= N. | |
* | |
* Inputs are single values N, or ranges N1 and N2, of 64-bits, 0 -- 2^64 - 1. | |
* Output is the number of twin primes <= N, or in range N1 to N2; the last | |
* twin prime value for the range; and the total time of execution. | |
* | |
* Code originally developed on a System76 laptop with an Intel I7 6700HQ cpu, | |
* 2.6-3.5 GHz clock, with 8 threads, and 16GB of memory. Parameter tuning | |
* would be needed to optimize for other hadware systems (ARM, PowerPC, etc). | |
* | |
* To compile with ldc2: $ ldc2 --release -O3 -mcpu native twinprimes_ssoz.d | |
* To reduce binary size do: $ strip twinprimes_ssoz | |
* Then run executable: $ ./twinprimes_ssoz <cr>, and enter range values. | |
* Or alternatively: $ echo <range1> <range2> | ./twinprimes_ssoz | |
* Range values can be provided in either order (larger or smaller first). | |
* | |
* This D source file, and updates, will be available here: | |
* https://gist.github.com/jzakiya/ae93bfa03dbc8b25ccc7f97ff8ad0f61 | |
* | |
* Mathematical and technical basis for implementation are explained here: | |
* https://www.academia.edu/37952623/The_Use_of_Prime_Generators_to_Implement_Fast_ | |
* Twin_Primes_Sieve_of_Zakiya_SoZ_Applications_to_Number_Theory_and_Implications_ | |
* for_the_Riemann_Hypotheses | |
* https://www.academia.edu/7583194/The_Segmented_Sieve_of_Zakiya_SSoZ_ | |
* https://www.academia.edu/19786419/PRIMES-UTILS_HANDBOOK | |
* https://www.academia.edu/81206391/Twin_Primes_Segmented_Sieve_of_Zakiya_SSoZ_Explained | |
* | |
* This code is provided free and subject to copyright and terms of the | |
* GNU General Public License Version 3, GPLv3, or greater. | |
* License copy/terms are here: http://www.gnu.org/licenses/ | |
* | |
* Copyright (c) 2017-2024 Jabari Zakiya -- jzakiya at gmail dot com | |
* Version Date: 2024/08/23 | |
*/ | |
module twinprimes_ssoz; | |
import std.algorithm : min, max, swap, sort; | |
import std.datetime.stopwatch : StopWatch; | |
import std.math : sqrt; | |
import std.parallelism : parallel, totalCPUs; | |
import std.range : iota; | |
import std.stdio : readf, write, writeln, readln; | |
import std.numeric : gcd; | |
import std.array: appender; | |
import core.bitop: popcnt; | |
import core.atomic: atomicOp; | |
/// Compute modular inverse a^-1 of a to base b, e.g. a*(a^-1) mod b = 1. | |
int modinv(int a0, int b0) pure @nogc | |
{ | |
int a = a0; | |
int b = b0; | |
int x0 = 0; | |
int result = 1; | |
if (b == 1) { return result; } | |
while (a > 1) { | |
immutable q = a / b; | |
a = a % b; | |
swap(a, b); | |
result -= q * x0; | |
swap(x0, result); | |
} | |
if (result < 0) { result += b0; } | |
return result; | |
} | |
// Global parameters | |
shared uint Ks = 0; // segment size for each seg restrack | |
shared ulong start_num; // lo number for range | |
shared ulong end_num; // hi number for range | |
shared ulong Kmax; // number of resgroups to end_num | |
shared ulong Kmin; // number of resgroups to start_num | |
shared ulong[] primes; // list of primes r1..sqrt(N) | |
shared uint[] cnts; // holds twin primes count for each tp|thread | |
shared ulong[] lastwins; // holds largest twin prime for each tp|thread | |
shared uint modpg; // PG's modulus value | |
shared uint res_0; // PG's first residue value | |
shared ulong[] restwins; // PG's list of twinpair residues | |
shared ulong[] resinvrs; // PG's list of residues inverses | |
shared uint bn; // segment size factor for PG and input number | |
/// Creates constant parameters for selected Prime Generator at runtime. | |
void genPGparameters(int prime) | |
{ | |
writeln("using parameters for P", prime); | |
static immutable primes = [2, 3, 5, 7, 11, 13, 17, 19, 23]; | |
uint modpn = 1; | |
uint res0 = 0; // compute Pn's modulus and res_0 value | |
foreach (prm; primes) { res0 = prm; if (prm > prime) break; modpn *= prm; } | |
restwins = []; // save upper twin pair residues here | |
resinvrs = new ulong[modpn + 2]; // save Pn's residues inverses here | |
uint rc = 5; // use P3's PGS to generate residue candidates | |
uint inc = 2; // init value in its seq: 2 4 2 4 2 4 ... | |
auto res = 0; // init residue value | |
while (rc < (modpn >> 1)) { // find PG's 1st half residues | |
if (gcd(rc, modpn) == 1) { // if rc is a residue | |
auto mc = modpn - rc; // create its modular complement | |
resinvrs[rc] = modinv(rc,modpn); // save rc and mc inverses | |
resinvrs[mc] = modinv(mc,modpn); // if in twinpair save both hi residues | |
if (res + 2 == rc) { restwins ~= rc; restwins ~= mc + 2; } | |
res = rc; // save current found residue | |
} | |
rc += inc; inc = inc ^ 0b110; // create next P3 sequence rc: 5 7 11 13 17 19 ... | |
} | |
restwins.sort; restwins ~= modpn + 1; // last residue is last hi_tp | |
resinvrs[modpn + 1] = 1; resinvrs[modpn - 1] = modpn - 1; // last 2 residues are self inverses | |
modpg = modpn; res_0 = res0; // save global variables | |
} | |
/// Select at runtime best PG and segment size factor to use for input range. | |
/// These are good estimates derived from PG data profiling. Can be improved. | |
void setSieveParameters(ulong start_range, ulong end_range) { | |
ulong range = end_range - start_range; | |
int pg = 3; | |
if (end_range < 49) { | |
bn = 1; pg = 3; | |
} else if (range < 77_000_000) { | |
bn = 16; pg = 5; | |
} else if (range < 1_100_000_000) { | |
bn = 32; pg = 7; | |
} else if (range < 35_500_000_000uL) { | |
bn = 64; pg = 11; | |
} else if (range < 14_000_000_000_000uL) { | |
pg = 13; | |
if (range > 7_000_000_000_000uL) { bn = 384; } | |
else if (range > 2_500_000_000_000uL) { bn = 320; } | |
else if (range > 250_000_000_000uL) { bn = 196; } | |
else { bn = 128; } | |
} | |
else { | |
bn = 384; pg = 17; | |
} | |
genPGparameters(pg); | |
auto pairscnt = restwins.length; // number of twin pairs|threads for selected PG | |
Kmin = (start_num-2) / modpg + 1; // number of resgroups to start_num | |
Kmax = (end_num - 2) / modpg + 1; // number of resgroups to end_num | |
auto krange = Kmax - Kmin + 1; // number of range resgroups, at least 1 | |
auto n = (krange < 37_500_000_000_000uL) ? 10 : (krange < 950_000_000_000_000uL) ? 16 : 20; | |
auto b = bn * 1024 * n; // set seg size to optimize for selected PG | |
Ks = cast(uint)((krange < b) ? krange : b); // segments resgroups size | |
writeln("segment size = ",Ks," resgroups; seg array is [1 x ",((Ks-1) >> 6) + 1,"] 64-bits"); | |
auto maxpairs = krange * pairscnt; // maximum number of twin prime pcs | |
writeln("twinprime candidates = ", maxpairs, "; resgroups = ", krange); | |
} | |
/// Return the primes r0..sqrt(end_num) within range (start_num...end_num) | |
void sozP5(ulong val) { | |
uint md = 30; // P5's modulus value | |
ulong rescnt = 8; // P5's residue count | |
static immutable res = [7, 11, 13, 17, 19, 23, 29, 31]; | |
static immutable bitn = [0,0,0,0,0,1,0,0,0,2,0,4,0,0,0,8,0,16,0,0,0,32,0,0,0,0,0,64,0,128]; | |
ulong range_size = end_num - start_num;// integers size of inputs range | |
ulong kmax = (val - 2) / md + 1; // number of resgroups upto input value | |
ubyte[] prms = new ubyte[](kmax); // byte array of prime candidates init '0' | |
ulong sqrtn = cast(ulong) sqrt(cast(double) val); // integer sqrt of val | |
auto k = sqrtn/rescnt; auto resk = sqrtn-md*k; auto rr=0; // sqrtn resgroup|residue values, 1st res posn | |
while (resk >= res[rr]) rr++; // find largest residue <= sqrtn posn in its resgroup | |
auto pcs_to_sqrtn = k*rescnt + rr; // number of pcs <= sqrtn | |
foreach (i; 0..pcs_to_sqrtn) { // for r0..sqrtn primes mark their multiples | |
k = i/rescnt; auto r = i%rescnt; // update resgroup parameters | |
if ((prms[k] & (1 << r)) != 0) continue; // skip this pc if not prime | |
auto prm_r = res[r]; // if prime save its residue value | |
ulong prime = md*k + prm_r; // numerate its value | |
auto rem = start_num % prime; // prime's modular distance to start_num | |
if (!(prime - rem <= range_size || rem == 0)) continue; // skip prime if no multiple in range | |
foreach (ri; res) { // mark prime's multiples in prms | |
uint prod = prm_r * ri - 2; // compute cross-product for prm_r|ri pair | |
ubyte bit_r = cast(ubyte) bitn[prod % md]; // bit mask value for prod's residue | |
ulong kpm = k * (prime + ri) + prod / md; // resgroup for prime's 1st multiple | |
while (kpm < kmax) { prms[kpm] |= bit_r; kpm += prime; } // mark prime's multiples | |
} | |
} | |
// prms now contains the prime multiples positions marked for the pcs r0..N | |
// now along each restrack, identify|numerate the primes in each resgroup | |
// return only the primes with a multiple within range (start_num...end_num) | |
primes = []; // create empty dynamic array for primes | |
foreach (i, res_i; res) { // along each restrack|row til end | |
foreach (kn, resgroup; prms) { // for each resgroup along restrack | |
if ((resgroup & (1 << i)) == 0) { // if bit location a prime | |
ulong prime = md * kn + res_i; // numerate its value, store if in range | |
// check if prime has multiple in range, if so keep it, if not don't | |
ulong rem = start_num % prime; // if rem is 0 then start_num multiple of prime | |
if (((res_0 <= prime) && (prime <= val)) && (prime - rem <= range_size || rem == 0)) { primes ~= prime; } | |
} } } } // primes stored in restrack|row sequence order | |
/// Initialize 'nextp' array for given twin pair residues for thread. | |
/// Compute 1st prime multiple resgroups in range for each prime r1..sqrt(N) | |
/// and store consecutively as lo_tp|hi_tp pairs for their restracks. | |
ulong[] nextp_init(ulong hi_r, ulong k_min) { | |
ulong[] nextp = new ulong[](primes.length * 2); // 1st mults array for tp | |
ulong r_hi = hi_r; // upper twin pair value | |
ulong r_lo = hi_r - 2; // lower twin pair value | |
foreach(size_t j, prime; primes) { // for each prime r1..sqrt(N) | |
auto k = (prime - 2) / modpg; // find the resgroup it's in | |
auto r = (prime - 2) % modpg + 2; // and its residue value | |
auto r_inv = resinvrs[r]; // and its residue inverse | |
auto rl = (r_lo * r_inv - 2) % modpg + 2; // compute the ri for r for lo_cp | |
auto rh = (r_hi * r_inv - 2) % modpg + 2; // compute the ri for r for hi_tp | |
auto kl = k * (prime + rl) + (r * rl - 2) / modpg; // kl 1st mult resgroup | |
auto kh = k * (prime + rh) + (r * rh - 2) / modpg; // kh 1st mult resgroup | |
if (kl < k_min) { // if 1st mult index < start_num's | |
kl = (k_min - kl) % prime; // how many indices short is it | |
if (kl > 0) kl = prime - kl; // adjust index value into range | |
} else { kl = kl - k_min; } // else here, addjust index if it was > | |
if (kh < k_min) { // if 1st mult index < start_num's | |
kh = (k_min - kh) % prime; // how many indices short is it | |
if (kh > 0) kh = prime - kh; // adjust index value into range | |
} else { kh = kh - k_min; } // else here, addjust index if it was > | |
nextp[j * 2] = kl; // lo_cp index val >= start of range | |
nextp[j * 2 | 1] = kh; // hi_cp index val >= start of range | |
} | |
return nextp; | |
} | |
/// Perform in a thread, the ssoz for a given twinpair, for Kmax resgroups. | |
/// First create|init 'nextp' array of 1st prime mults for given twin pair, | |
/// (stored consequtively in 'nextp') and init seg byte array for Ks resgroups. | |
/// For sieve, mark resgroup bits to '1' if either twinpair restrack is nonprime, | |
/// for primes mults resgroups, and update 'nextp' restrack slices acccordingly. | |
/// Find last twin prime|sum for range, store in their arrays for this twinpair. | |
/// Can optionally compile to print mid twinprime values generated by twinpair. | |
void twinsSieve(uint indx) { | |
uint s = 6; // shift value for 64 bits | |
uint bmask = (1 << s) - 1; // bitmask val for 64 bits | |
auto sum = 0; // twins count for twin pair for segment | |
ulong Ki = Kmin - 1; // first resgroup val for range for slice | |
ulong K_max = Kmax; // last resgroup val for range for slice | |
uint Kn = Ks; // resgroup seg size for each seg slice | |
ulong hi_tp = 0; // value of largest tp hi prime in seg | |
ulong r_hi = restwins[indx]; // hi tp residue for this thread | |
ulong[] seg = new ulong[](((Ks-1) >> s) + 1); // seg array for Ks regroups | |
if (r_hi - 2 < (start_num - 2) % modpg + 2) ++Ki; // ensure lo cp in range | |
if (r_hi > (end_num - 2) % modpg + 2) --K_max; // ensure hi cp in range | |
ulong[] nextp = nextp_init(r_hi, Ki); // 1st prime mults for twin pair restracks | |
while (Ki < K_max) { // for Kn resgroup size slices upto Kmax | |
Kn = min(Ks, K_max - Ki); // set segment slice resgroup length | |
foreach (j, prime; primes) { // for each prime index r1..sqrt(N) | |
// for lower pair residue track | |
ulong k1 = nextp[j * 2]; // starting from this lower resgroup in seg | |
while (k1 < Kn) { // mark primenth resgroup bits prime mults | |
seg[k1 >> s] |= 1uL << (k1 & bmask); | |
k1 += prime; } // set next prime multiple resgroup | |
nextp[j * 2] = k1 - Kn; // save 1st resgroup in next eligible seg | |
// for upper pair residue track | |
ulong k2 = nextp[j * 2 | 1]; // starting from this upper resgroup in seg | |
while (k2 < Kn) { // mark primenth resgroup prime mults | |
seg[k2 >> s] |= 1uL << (k2 & bmask); | |
k2 += prime; } // set next prime multiple resgroup | |
nextp[j * 2 | 1] = k2 - Kn; // save 1st resgroup in next eligible seg | |
} // set as nonprime unused bits in last seg[n] | |
// so fast, do for every seg[i] | |
seg[(Kn-1) >> s] = seg[(Kn-1) >> s] | (~1uL << ((Kn-1) & bmask)); | |
auto cnt = 0; // count the cousinprimes in the segment | |
foreach (i; 0..((Kn-1) >> s) + 1) { cnt += popcnt(~seg[i]); } | |
if (cnt > 0) { // if segment has twin primes | |
sum += cnt; // add the segment count to total count | |
uint upk = Kn - 1; // from end of seg count backwards to largest tp | |
while ((seg[upk >> s] & (1uL << (upk & bmask))) != 0) --upk; | |
hi_tp = Ki + upk; // numerate its full resgroup value | |
} | |
Ki += Ks; // set 1st resgroup val of next seg slice | |
if (Ki < K_max) {foreach (b; 0..((Kn-1) >> s) + 1) seg[b] = 0;} // set seg all primes | |
} | |
// numerate largest twin prime in seg | |
hi_tp = ((r_hi > end_num) || (sum == 0)) ? 1 : hi_tp * modpg + r_hi; | |
lastwins[indx] = hi_tp; // store final seg tp value | |
cnts[indx] = sum; // sum for twin pair | |
} | |
/// Main routine to run twinprimes sieve, input ranges within 0..2^64 - 1. | |
void twinPrimesSsoz() { | |
ulong[] x; | |
foreach (_; 0 .. 2) { ulong a; readf!" %d"(a); x ~= a; } | |
end_num = max(x[0], 3); | |
start_num = max(x[1], 3); | |
if (start_num > end_num) swap(start_num, end_num); | |
start_num = start_num | 1; // if start_num even add 1 | |
end_num = (end_num - 1) | 1; // if end_num even subtract 1 | |
if (end_num - start_num < 2) { start_num = 7; end_num = 7; } | |
writeln("threads = ", totalCPUs); | |
auto stopWatchSetup = StopWatch(); | |
stopWatchSetup.start(); // start timing sieve setup execution | |
setSieveParameters(start_num, end_num); | |
auto pairscnt = restwins.length; | |
auto krange = Kmax - Kmin + 1; | |
// create sieve primes <= sqrt(end_num), only use those whose multiples within inputs range | |
if (end_num < 49) primes ~= 5; // generate sieving primes | |
else sozP5(cast(ulong) sqrt(cast(double) end_num)); // <= sqrt(end_num) | |
writeln("each of ", pairscnt, " threads has nextp[", 2, " x ", primes.length, "] array"); | |
ulong twinscnt = 0; // number of twin primes for range | |
auto lo_range = restwins[0] - 3; // lo_range = lo_tp - 1 | |
foreach (tp; [3, 5, 11, 17]) { // excluded low tp values for PGs used | |
if (end_num == 3) break; // if 3 end of range, no twin primes | |
if ((start_num <= tp) && (tp < lo_range)) ++twinscnt; // count small tp if any | |
} | |
stopWatchSetup.stop(); // sieve setup time | |
writeln("setup time = ", stopWatchSetup.peek()); | |
writeln("perform twinprimes ssoz sieve"); | |
auto stopWatchExecution = StopWatch(); // start timing ssoz sieve execution | |
stopWatchExecution.start(); | |
cnts = new uint[](pairscnt); // count of tps for each thread | |
lastwins = new ulong[](pairscnt); // largest hi_tp for each thread | |
shared uint threadscnt = 0; // count of finished threads | |
foreach (indx; parallel(iota(0, pairscnt))) { // do in parallel for each tp | |
twinsSieve(cast(uint) indx); // sieve selected twinpair restracks | |
write("\r", threadscnt.atomicOp!"+="(1), " of ", pairscnt, " twinpairs done"); | |
} | |
write("\r", pairscnt, " of ", pairscnt, " twinpairs done"); | |
ulong last_twin = 0uL; | |
foreach(i; 0..pairscnt) { | |
twinscnt += cnts[i]; | |
if (last_twin < lastwins[i]) last_twin = lastwins[i]; | |
} | |
if ((end_num == 5) && (twinscnt == 1)) last_twin = 5; | |
auto Kn = krange % Ks; // set num of resgroups in last slice | |
if (Kn == 0) Kn = Ks; // if mult of segsize set to seg size | |
stopWatchExecution.stop(); // sieve execution time | |
writeln("\nsieve time = ", stopWatchExecution.peek()); | |
writeln("total time = ", stopWatchSetup.peek() + stopWatchExecution.peek()); | |
writeln("last segment = ", Kn, " resgroups; segment slices = ", (krange-1) / Ks + 1); | |
writeln("total twins = ", twinscnt, "; last twin = ", last_twin - 1, "+/-1"); | |
} | |
void main() | |
{ twinPrimesSsoz(); } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, just in case you didn't notice, I made a small contribution to the compile time part of this code: https://forum.dlang.org/post/[email protected]
I didn't cross-check the results with the Nim code, you might want to do that before publishing it.
Cheers,
Bastiaan.