Last active
August 12, 2024 16:55
-
-
Save jzakiya/3799bd8604bdcba34df5c79aae6e55ac to your computer and use it in GitHub Desktop.
Cousin Primes generator, multi-threaded, using SSoZ (Segmented Sieve of Zakiya), written in C++
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
This C++ source file is a multiple threaded implementation to perform an | |
extremely fast Segmented Sieve of Zakiya (SSoZ) to find Cousin Primes <= N. | |
Inputs are single values N, or ranges N1 and N2, of 64-bits, 0 -- 2^64 - 1. | |
Output is the number of cousin primes <= N, or in range N1 to N2; the last | |
cousin prime value for the range; and the total time of execution. | |
Code originally developed on a System76 laptop with an Intel I7 6700HQ cpu, | |
2.6-3.5 GHz clock, with 8 threads, and 16GB of memory. Parameter tuning | |
probably needed to optimize for other hadware systems (ARM, PowerPC, etc). | |
On Linux use -O compiler flag to compile for optimum speed: | |
$ g++ -O3 -fopenmp cousinprimes_ssoz.cpp -o cousinprimes_ssoz | |
To reduce binary size do: $ strip cousinprimes_ssoz | |
Single val: $ echo val1 | ./cousinprimes_ssoz | |
Range vals: $ echo val1 val2 | ./cousinprimes_ssoz | |
Range values can be provided in either order (larger or smaller first). | |
Mathematical and technical basis for implementation are explained here: | |
https://www.academia.edu/37952623/The_Use_of_Prime_Generators_to_Implement_Fast_ | |
Twin_Primes_Sieve_of_Zakiya_SoZ_Applications_to_Number_Theory_and_Implications_ | |
for_the_Riemann_Hypotheses | |
https://www.academia.edu/7583194/The_Segmented_Sieve_of_Zakiya_SSoZ_ | |
https://www.academia.edu/19786419/PRIMES-UTILS_HANDBOOK | |
https://www.academia.edu/81206391/Twin_Primes_Segmented_Sieve_of_Zakiya_SSoZ_Explained | |
This C++ source code, and updates, can be found here: | |
https://gist.github.com/jzakiya/3799bd8604bdcba34df5c79aae6e55ac | |
This code is provided under the terms of the | |
GNU General Public License Version 3, GPLv3, or greater. | |
License copy/terms are here: http://www.gnu.org/licenses/ | |
Use of this code is free subject to acknowledgment of copyright. | |
Copyright (c) 2017-2024 Jabari Zakiya -- jzakiya at gmail dot com | |
Last update: 2024/08/12 | |
*/ | |
#include <cmath> | |
#include <algorithm> | |
#include <vector> | |
#include <array> | |
#include <tuple> | |
#include <cstdlib> | |
#include <iostream> | |
#include <stdint.h> | |
#include <chrono> | |
#include <omp.h> | |
#include <atomic> | |
using namespace std; | |
typedef uint64_t uint64; | |
// Compute modular inverse a^(-1) of a to base b, e.g. a*(a^-1) mod b = 1. | |
int modinv(int a0, int b0) { | |
int b = b0, a = a0, x = 0; | |
int inv = 1; | |
if (b == 1) return 1; | |
while (a > 1) { | |
int q = a / b; | |
int t = b; b = a % b; a = t; | |
t = x; x = inv - q * x; inv = t; | |
} | |
if (inv < 0) inv += b0; | |
return inv; | |
} | |
// Create prime generator parameters for given Pn | |
tuple<int, int, int, vector<int>, vector<int> > genPgParameters(int prime) { | |
cout << "using Prime Generator parameters for P" << prime << endl; | |
int primes[9] = {2, 3, 5, 7, 11, 13, 17, 19, 23}; | |
int modpg = 1; int res_0 = 0; | |
for (int prm : primes) { res_0 = prm; if (prm > prime) break; modpg *= prm; } | |
vector<int> rescousins; // save upper cousin pair residues here | |
vector<int> inverses(modpg + 2, 0); // save Pn's residues inverses here | |
int rc = 5; int inc = 2; int res = 0; // use P3's PGS to generate residue candidates | |
int midmodpg = modpg >> 1; // mid value of modpg | |
while (rc < midmodpg) { // find 1st half of PG's residues | |
if (__gcd(rc, modpg) == 1) { // if rc is a residue | |
int mc = modpg - rc; // create its modular complement | |
inverses[rc] = modinv(rc, modpg); // save rc and mc inverses | |
inverses[mc] = modinv(mc, modpg); // if in cousinpair save both hi residue | |
if (res + 4 == rc) { rescousins.push_back(rc); rescousins.push_back(mc + 4); } | |
res = rc; // save current found residue | |
} | |
rc += inc; inc ^= 0b110; // create next P3 sequence rc: 5 7 11 13 17 19 ... | |
} | |
rescousins.push_back(midmodpg + 2); std::sort(rescousins.begin(), rescousins.end()); | |
inverses[modpg + 1] = 1; inverses[modpg - 1] = modpg - 1; | |
return make_tuple(modpg, res_0, rescousins.size(), rescousins, inverses); | |
} | |
// Select at runtime best PG and segment size factor to use for input vals. | |
// These are good estimates derived from PG data profiling. Can be improved. | |
tuple<int, int, int, uint64, uint64, uint64, int, vector<int>, | |
vector<int>> setSieveParameters(uint64 start_num, uint64 end_num) { | |
uint64 nrange = end_num - start_num; | |
int Bn; int pg = 3; | |
if (end_num < 49) { | |
Bn = 1; pg = 3; | |
} | |
else if (nrange < 77000000) { | |
Bn = 16; pg = 5; | |
} | |
else if (nrange < 1100000000) { | |
Bn = 32; pg = 7; | |
} | |
else if (nrange < 35500000000) { | |
Bn = 64; pg = 11; | |
} | |
else if (nrange < 14000000000000) { | |
pg = 13; | |
if (nrange > 7000000000000) Bn = 384; | |
else if (nrange > 2500000000000) Bn = 320; | |
else if (nrange > 250000000000) Bn = 196; | |
else Bn = 128; | |
} else { | |
Bn = 384; pg = 17; | |
} | |
int modpg, res_0, pairscnt; vector<int> rescousins, resinvrs; | |
tie(modpg, res_0, pairscnt, rescousins, resinvrs) = genPgParameters(pg); | |
uint64 kmin = (start_num-2) / modpg+1; // number of resgroups to start_num | |
uint64 kmax = (end_num - 2) / modpg+1; // number of resgroups to end_num | |
uint64 krange = kmax - kmin + 1; // number of resgroups in range, at least 1 | |
uint n = (krange < 37500000000000) ? 10 : (krange < 975000000000000) ? 16 : 20; | |
uint b = Bn * 1024 * n; // set seg size to optimize for selected PG | |
uint ks = (krange < b) ? krange : b; // segments resgroups size | |
cout << "segment size = " << ks << " resgroups; seg array is [1 x " << ((ks-1) >> 6) + 1 << "] 64-bits\n"; | |
uint64 maxpairs = krange * pairscnt; // maximum number of cousinprime pcs | |
cout << "cousinprime candidates = " << maxpairs << "; resgroups = " << krange << endl; | |
return make_tuple(modpg, res_0, ks, kmin, kmax, krange, pairscnt, rescousins, resinvrs); | |
} | |
// Return the primes r0..sqrt(end_num) within range (start_num...end_num) | |
vector<uint64> sozp5(uint64 val, uint res_0, uint64 start_num, uint64 end_num) { | |
uint md = 30; // P5's modulus value | |
int rescnt = 8; // P5's residue count | |
int res[8] = {7,11,13,17,19,23,29,31}; // P5's residues list | |
int bitn[30] = {0,0,0,0,0,1,0,0,0,2,0,4,0,0,0,8,0,16,0,0,0,32,0,0,0,0,0,64,0,128}; | |
uint range_size = end_num - start_num; // integers size of inputs range | |
uint64 kmax = (val - 2) / md + 1; // number of resgroups upto input value | |
vector<uint8_t> prms(kmax, 0); // byte array of prime candidates init '0' | |
uint sqrtn = (uint) sqrt((double) val);// compute integer sqrt of val | |
uint k = sqrtn/md; // compute its resgroup value | |
uint resk = sqrtn - md*k; int r = 0; // sqrtn resgroup|residue values, residue start posn | |
while (resk >= res[r]) ++r; // find largest residue <= sqrtn posn in its resgroup | |
uint pcs_to_sqrtn = k*rescnt + r; // number of pcs <= sqrtn | |
for (uint i = 0; i < pcs_to_sqrtn; ++i) { // for r0..sqrtN primes mark their multiples | |
uint k = i/rescnt; int r = i%rescnt; // update resgroup parameters | |
if ((prms[k] & (1 << r)) != 0) continue; // skip pc if not prime | |
uint prm_r = res[r]; // if prime save its residue value | |
uint64 prime = m d*k + prm_r; // numerate its value | |
uint rem = start_num % prime; // prime's modular distance to start_num | |
if (!(prime - rem <= range_size || rem == 0)) continue; // skip prime if no multiple in range | |
for (int ri : res) { // mark prime's multiples in prms | |
uint prod = prm_r * ri - 2; // compute cross-product for prm_r|ri pair | |
uint8_t bit_r = bitn[prod % md]; // bit mask value for prod's residue | |
uint64 kpm = k * (prime + ri) + prod / md; // resgroup for prime's 1st multiple | |
for (; kpm < kmax; kpm += prime) prms[kpm] |= bit_r; // mark primes's multiples | |
} } | |
// prms now contains the prime multiples positions marked for the pcs r0..N | |
// now along each restrack, identify|numerate the primes in each resgroup | |
// return only the primes with a multiple within range (start_num...end_num) | |
vector<uint64> primes; // dynamic array to store primes | |
for (int r = 0; r < rescnt; ++r) { // along each restrack|row til end | |
for (uint64 k = 0; k < kmax; ++k) { // for each resgroup along restrack | |
if ((prms[k] & (1 << r)) == 0) { // if bit location is prime | |
uint64 prime = md * k + res[r]; // numerate its value, store if in range | |
// check if prime has multiple in range, if so keep it, if not don't | |
uint64 rem = start_num % prime; // if rem is 0 then start_num multiple of prime | |
if ((res_0 <= prime && prime <= val) && (prime - rem <= range_size || rem == 0)) primes.push_back(prime); | |
} } } | |
return primes; // primes stored in restrack|row sequence order | |
} | |
// Initialize 'nextp' array for cousinpair upper residue rhi in 'rescousins'. | |
// Compute 1st prime multiple resgroups for each prime r0..sqrt(N) and | |
// store consecutively as lo_tp|hi_tp pairs for their restracks. | |
vector<uint64> nextp_init(uint rhi, uint64 kmin, uint modpg, vector<uint64> primes, vector<int> resinvrs) { | |
vector<uint64> nextp(primes.size() * 2); // 1st mults array for cousinpair | |
uint r_hi = rhi; // upper cousinpair residue | |
uint r_lo = rhi - 4; // lower cousinpair residue | |
for (int j = 0; j < primes.size(); ++j) { // for each prime r1..sqrt(N) | |
uint prime = primes[j]; // get the prime | |
uint k = (prime - 2) / modpg; // find the resgroup it's in | |
uint r = (prime - 2) % modpg + 2; // and its residue value | |
uint64 r_inv = resinvrs[r]; // and its residue inverse | |
uint64 rl = (r_inv * r_lo - 2) % modpg + 2; // compute the ri for r for lo_tp | |
uint64 rh = (r_inv * r_hi - 2) % modpg + 2; // compute the ri for r for hi_tp | |
uint64 kl = k * (prime + rl) + (r * rl - 2) / modpg; // kl 1st mult restroup | |
uint64 kh = k * (prime + rh) + (r * rh - 2) / modpg; // kh 1st mult restroup | |
if (kl < kmin) { kl = (kmin - kl) % prime; if (kl > 0) kl = prime - kl; } else { kl -= kmin; } | |
if (kh < kmin) { kh = (kmin - kh) % prime; if (kh > 0) kh = prime - kh; } else { kh -= kmin; } | |
nextp[j << 1] = kl; // prime's 1st mult lo_cp resgroups in range | |
nextp[j << 1 | 1] = kh; // prime's 1st mult lo_cp resgroups in range | |
} | |
return nextp; | |
} | |
// Perform in thread the ssoz for given cousinpair residues for kmax resgroups. | |
// First create|init 'nextp' array of 1st prime mults for given cousinpair, | |
// stored consequtively in 'nextp', and init seg array for ks resgroups. | |
// For sieve, mark resgroup bits to '1' if either cousinpair restrack is nonprime | |
// for primes mults resgroups, and update 'nextp' restrack slices acccordingly. | |
// Return the last cousinprime|sum for the range for this cousinpair residues | |
tuple<uint64, uint64> cousins_sieve(uint r_hi, uint64 kmin, uint64 kmax, uint ks, uint64 start_num, | |
uint64 end_num, uint modpg, vector<uint64> primes, vector<int>resinvrs) { | |
uint s = 6; // shift value for 64 bits | |
uint bmask = (1 << s) - 1; // bitmask val for 64 bits | |
uint64 sum = 0; // init cp cnt for this cousin pair | |
uint64 ki = kmin - 1; // 1st resgroup seg val for each slice | |
uint kn = ks; // segment resgroup size | |
uint64 hi_cp = 0; // init value of largest cp in segment | |
uint64 k_max = kmax; // max number of resgroups for segment | |
vector<uint64> seg(((ks-1) >> s) + 1); // seg array for ks resgroups | |
if (r_hi - 4 < (start_num - 2) % modpg + 2) ++ki; // ensure lo cp in range | |
if (r_hi > (end_num -2) % modpg + 2) --k_max; // ensure hi cp in range | |
vector<uint64> nextp = nextp_init(r_hi, ki, modpg, primes, resinvrs); // 1st prime mults | |
while (ki < k_max) { // for ks resgroup size slices upto kmax | |
if (ks > (k_max - ki)) kn = (k_max - ki); // adjust kn for last seg size | |
for (int j = 0; j < primes.size(); ++j) { // for each prime index r1..sqrt(N) | |
uint prime = primes[j]; // for this prime | |
// for lower pair residue track | |
uint64 k1 = nextp[j << 1]; // starting from this lower resgroup in seg | |
for (; k1 < kn; k1 += prime) // mark primenth resgrouup bits prime mults | |
seg[k1 >> s] |= uint64(1) << (k1 & bmask); | |
nextp[j << 1] = k1 - kn; // save 1st lower resgroup in next eligible seg | |
// for upper pair residue track | |
uint64 k2 = nextp[j << 1 | 1]; // starting from this upper resgroup in seg | |
for (; k2 < kn; k2 += prime) // mark primenth resgrouup bits prime mults | |
seg[k2 >> s] |= uint64(1) << (k2 & bmask); | |
nextp[j << 1 | 1] = k2 - kn; // save 1st upper resgroup in next eligible seg | |
} // set as nonprime unused bits in last seg[n] | |
// so fast, do for every seg[i] | |
seg[(kn-1) >> s] |= ~uint64(1) << ((kn-1) & bmask); | |
uint cnt = 0; // count the cousinprimes in the segment | |
for (int k = 0; k <= (kn-1) >> s; ++k) cnt += __builtin_popcountl(~seg[k]); | |
if (cnt > 0) { // if segment has cousin primes | |
sum += cnt; // add the segment count to total count | |
uint upk = kn - 1; // from end of seg count backwards to largest tp | |
while ((seg[upk >> s] & (uint64(1) << (upk & bmask))) != 0) upk--; | |
hi_cp = ki + upk; // numerate its full resgroup value | |
} | |
ki += ks; // set 1st resgroup val of next seg slice | |
if (ki < k_max) {for (int b = 0; b <= ((kn-1) >> s); ++b) seg[b] = 0;} // set all seg bits prime | |
} | |
hi_cp = ((r_hi > end_num) || (sum == 0)) ? 0 : hi_cp * modpg + r_hi; | |
return make_tuple(hi_cp, sum); | |
} | |
int main() { | |
vector<uint64> x; | |
uint64 num; | |
while ( cin >> num ) x.push_back(num); | |
uint64 end_num = (x[0] > 3) ? x[0] : 3; | |
uint64 start_num = (x[1] > 3) ? x[1] : 3; | |
if (start_num > end_num) swap(start_num, end_num); | |
start_num |= 1; // if start_num even increase by 1 | |
end_num = (end_num - 1) | 1; // if end_num even decrease by 1 | |
if (end_num - start_num < 4) { start_num = 7; end_num = 7; } | |
cout << "threads = " << omp_get_max_threads() << endl; | |
auto ts = chrono::system_clock::now(); // start timing sieve setup execution | |
int modpg, res_0, pairscnt; vector<int> rescousins, resinvrs; | |
uint64 ks, kmin, kmax, krange; | |
tie(modpg, res_0, ks, kmin, kmax, krange, pairscnt, rescousins, resinvrs) = setSieveParameters(start_num, end_num); | |
// create sieve primes <= sqrt(end_num), only use those whose multiples within inputs range | |
vector<uint64> primes; | |
(end_num < 49) ? primes = {5} : primes = sozp5((uint64) sqrt((double) end_num), res_0, start_num, end_num); | |
cout << "each of "<< pairscnt << " threads has nextp["<< 2 << " x " <<primes.size() << "] array\n"; | |
auto te = chrono::system_clock::now() - ts; // sieve setup time | |
chrono::duration<double> seconds = te; | |
cout << "setup time = " << seconds.count() << " secs\n"; | |
cout << "perform cousinprimes ssoz sieve\n"; | |
auto t1 = chrono::system_clock::now(); // start timing ssoz sieve execution | |
vector<uint64> cnts(pairscnt); // cousins cnts for each cousinpair | |
vector<uint64> lastcousins(pairscnt); // last|largest cousin per cousinpair | |
std::atomic<uint> threadscnt(0); // count of finished threads | |
#pragma omp parallel for // process cousinpairs in parallel | |
for (int i = 0; i < pairscnt; ++i) { // process each cousinpair in own thread | |
uint64 l, c; | |
tie(l, c) = cousins_sieve(rescousins[i], kmin, kmax, ks, start_num, end_num, modpg, primes, resinvrs); | |
lastcousins[i] = l; cnts[i] = c; | |
cout << "\r" << threadscnt++ << " of " << pairscnt << " cousinpairs done"; | |
} | |
cout << "\r" << pairscnt << " of " << pairscnt << " cousinpairs done"; | |
uint64 cousinscnt = 0; // number of cousin primes in range | |
uint64 last_cousin = 0; // largest hi_cp cousin in range | |
if (end_num < 49) { // check for cousins in low ranges | |
for (uint c_hi : {11, 17, 23, 41, 47}) { | |
if (start_num <= c_hi - 4 && c_hi <= end_num) { last_cousin = c_hi; cousinscnt += 1;} | |
} } | |
else { // check for cousins from sieve | |
for (int i = 0; i < pairscnt; ++i) { // find largest cousinprime|count in range | |
cousinscnt += cnts[i]; | |
if (last_cousin < lastcousins[i]) last_cousin = lastcousins[i]; | |
} } | |
// account for 1st cousin prime, defined as (3, 7) | |
if (start_num == 3 && end_num > 6) { cousinscnt += 1; if (end_num < 11) last_cousin = 7;} | |
uint kn = krange % ks; | |
if (kn == 0) kn = ks; | |
auto t2 = chrono::system_clock::now() - t1; | |
chrono::duration<double> secs1 = t2; | |
chrono::duration<double> secs2 = t2 + te; | |
cout << "\nsieve time = " << secs1.count() << " secs\n"; | |
cout << "total time = " << secs2.count() << " secs\n"; | |
cout << "last segment = "<<kn<<" resgroups; segment slices = "<< (krange-1)/ks + 1<<"\n"; | |
cout << "total cousins = " << cousinscnt << "; last cousin = " << last_cousin << "|-4" << endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment