Skip to content

Instantly share code, notes, and snippets.

@jzakiya
Last active August 12, 2024 16:58
Show Gist options
  • Save jzakiya/fa76c664c9072ddb51599983be175a3f to your computer and use it in GitHub Desktop.
Save jzakiya/fa76c664c9072ddb51599983be175a3f to your computer and use it in GitHub Desktop.
Twinprimes generator, multi-threaded, using SSoZ (Segmented Sieve of Zakiya), written in C++.
/*
This C++ source file is a multiple threaded implementation to perform an
extremely fast Segmented Sieve of Zakiya (SSoZ) to find Twin Primes <= N.
Inputs are single values N, or ranges N1 and N2, of 64-bits, 0 -- 2^64 - 1.
Output is the number of twin primes <= N, or in range N1 to N2; the last
twin prime value for the range; and the total time of execution.
Code originally developed on a System76 laptop with an Intel I7 6700HQ cpu,
2.6-3.5 GHz clock, with 8 threads, and 16GB of memory. Parameter tuning
probably needed to optimize for other hadware systems (ARM, PowerPC, etc).
On Linux use -O compiler flag to compile for optimum speed:
$ g++ -O3 -fopenmp twinprimes_ssoz.cpp -o twinprimes_ssoz
To reduce binary size do: $ strip twinprimes_ssoz
Single val: $ echo val1 | ./twinprimes_ssoz
Range vals: $ echo val1 val2 | ./twinprimes_ssoz
Range values can be provided in either order (larger or smaller first).
Mathematical and technical basis for implementation are explained here:
https://www.academia.edu/37952623/The_Use_of_Prime_Generators_to_Implement_Fast_
Twin_Primes_Sieve_of_Zakiya_SoZ_Applications_to_Number_Theory_and_Implications_
for_the_Riemann_Hypotheses
https://www.academia.edu/7583194/The_Segmented_Sieve_of_Zakiya_SSoZ_
https://www.academia.edu/19786419/PRIMES-UTILS_HANDBOOK
https://www.academia.edu/81206391/Twin_Primes_Segmented_Sieve_of_Zakiya_SSoZ_Explained
This C++ source code, and updates, can be found here:
https://gist.github.com/jzakiya/fa76c664c9072ddb51599983be175a3f
This code is provided under the terms of the
GNU General Public License Version 3, GPLv3, or greater.
License copy/terms are here: http://www.gnu.org/licenses/
Use of this code is free subject to acknowledgment of copyright.
Copyright (c) 2017-2024 Jabari Zakiya -- jzakiya at gmail dot com
Last update: 2024/08/12
*/
#include <cmath>
#include <algorithm>
#include <vector>
#include <array>
#include <tuple>
#include <cstdlib>
#include <iostream>
#include <stdint.h>
#include <chrono>
#include <omp.h>
#include <atomic>
using namespace std;
typedef uint64_t uint64;
// Compute modular inverse a^(-1) of a to base b, e.g. a*(a^-1) mod b = 1.
int modinv(int a0, int b0) {
int b = b0, a = a0, x = 0;
int inv = 1;
if (b == 1) return 1;
while (a > 1) {
int q = a / b;
int t = b; b = a % b; a = t;
t = x; x = inv - q * x; inv = t;
}
if (inv < 0) inv += b0;
return inv;
}
// Create prime generator parameters for given Pn
tuple<int, int, int, vector<int>, vector<int> > genPgParameters(int prime) {
cout << "using Prime Generator parameters for P" << prime << endl;
int primes[9] = {2, 3, 5, 7, 11, 13, 17, 19, 23};
int modpg = 1; int res_0 = 0;
for (int prm : primes) { res_0 = prm; if (prm > prime) break; modpg *= prm; }
vector<int> restwins; // save upper twin pair residues here
vector<int> inverses(modpg + 2, 0); // save Pn's residues inverses here
int rc = 5; int inc = 2; int res = 0; // use P3's PGS to generate resdidue candidates
while (rc < (modpg >> 1)) { // find 1st half of PG's residues
if (__gcd(rc, modpg) == 1) { // if rc is a residue
int mc = modpg - rc; // create its modular complement
inverses[rc] = modinv(rc, modpg); // save rc and mc inverses
inverses[mc] = modinv(mc, modpg); // if in twinpair save both hi residues
if (res + 2 == rc) { restwins.push_back(rc); restwins.push_back(mc + 2); }
res = rc; // save current found residue
}
rc += inc; inc ^= 0b110; // create next P3 sequence rc: 5 7 11 13 17 19 ...
}
std::sort(restwins.begin(), restwins.end()); restwins.push_back(modpg + 1);
inverses[modpg + 1] = 1; inverses[modpg - 1] = modpg - 1;
return make_tuple(modpg, res_0, restwins.size(), restwins, inverses);
}
// Select at runtime best PG and segment size factor to use for input vals.
// These are good estimates derived from PG data profiling. Can be improved.
tuple<int, int, int, uint64, uint64, uint64, int, vector<int>,
vector<int>> setSieveParameters(uint64 start_num, uint64 end_num) {
uint64 nrange = end_num - start_num;
int Bn; int pg = 3;
if (end_num < 49) {
Bn = 1; pg = 3;
}
else if (nrange < 77000000) {
Bn = 16; pg = 5;
}
else if (nrange < 1100000000) {
Bn = 32; pg = 7;
}
else if (nrange < 35500000000) {
Bn = 64; pg = 11;
}
else if (nrange < 14000000000000) {
pg = 13;
if (nrange > 7000000000000) Bn = 384;
else if (nrange > 2500000000000) Bn = 320;
else if (nrange > 250000000000) Bn = 196;
else Bn = 128;
} else {
Bn = 384; pg = 17;
}
int modpg, res_0, pairscnt; vector<int> restwins, resinvrs;
tie(modpg, res_0, pairscnt, restwins, resinvrs) = genPgParameters(pg);
uint64 kmin = (start_num-2) / modpg+1; // number of resgroups to start_num
uint64 kmax = (end_num - 2) / modpg+1; // number of resgroups to end_num
uint64 krange = kmax - kmin + 1; // number of resgroups in range, at least 1
uint n = (krange < 37500000000000) ? 10 : (krange < 975000000000000) ? 16 : 20;
uint b = Bn * 1024 * n; // set seg size to optimize for selected PG
uint ks = (krange < b) ? krange : b; // segments resgroups size
cout << "segment size = " << ks << " resgroups; seg array is [1 x " << ((ks-1) >> 6) + 1 << "] 64-bits\n";
uint64 maxpairs = krange * pairscnt; // maximum number of twinprime pcs
cout << "twinprime candidates = " << maxpairs << "; resgroups = " << krange << endl;
return make_tuple(modpg, res_0, ks, kmin, kmax, krange, pairscnt, restwins, resinvrs);
}
// Return the primes r0..sqrt(end_num) within range (start_num...end_num)
vector<uint64> sozp5(uint64 val, uint res_0, uint64 start_num, uint64 end_num) {
uint md = 30; // P5's modulus value
int rescnt = 8; // P5's residue count
int res[8] = {7,11,13,17,19,23,29,31}; // P5's residues list
int bitn[30] = {0,0,0,0,0,1,0,0,0,2,0,4,0,0,0,8,0,16,0,0,0,32,0,0,0,0,0,64,0,128};
uint range_size = end_num - start_num; // integers size of inputs range
uint64 kmax = (val - 2) / md + 1; // number of resgroups upto input value
vector<uint8_t> prms(kmax, 0); // byte array of prime candidates init '0'
uint sqrtn = (uint) sqrt((double) val);// compute integer sqrt of val
uint k = sqrtn/md; // compute its resgroup value
uint resk = sqrtn - md*k; int r =0; // compute its residue value; set residue start posn
while (resk >= res[r]) ++r; // find largest residue <= sqrtn posn in its resgroup
uint pcs_to_sqrtn = k*rescnt + r; // number of pcs <= sqrtn
for (uint i = 0; i < pcs_to_sqrtn; ++i) { // for r0..sqrtN primes mark their multiples
uint k = i/rescnt; int r = i%rescnt; // update resgroup parameters
if ((prms[k] & (1 << r)) != 0) continue; // skip pc if not prime
uint prm_r = res[r]; // if prime save its residue value
uint64 prime = md*k + prm_r; // numerate its value
uint rem = start_num % prime; // prime's modular distance to start_num
if (!(prime - rem <= range_size || rem == 0)) continue; // skip prime if no multiple in range
for (int ri : res) { // mark prime's multiples in prms
uint prod = prm_r * ri - 2; // compute cross-product for prm_r|ri pair
uint8_t bit_r = bitn[prod % md]; // bit mask value for prod's residue
uint64 kpm = k * (prime + ri) + prod / md; // resgroup for prime's 1st multiple
for (; kpm < kmax; kpm += prime) prms[kpm] |= bit_r; // mark prime's multiples
} }
// prms now contains the prime multiples positions marked for the pcs r0..N
// now along each restrack, identify|numerate the primes in each resgroup
// return only the primes with a multiple within range (start_num...end_num)
vector<uint64> primes; // dynamic array to store primes
for (int r = 0; r < rescnt; ++r) { // along each restrack|row til end
for (uint64 k = 0; k < kmax; ++k) { // for each resgroup along restrack
if ((prms[k] & (1 << r)) == 0) { // if bit location is prime
uint64 prime = md * k + res[r]; // numerate its value, store if in range
// check if prime has multiple in range, if so keep it, if not don't
uint64 rem = start_num % prime; // if rem is 0 then start_num multiple of prime
if ((res_0 <= prime && prime <= val) && (prime - rem <= range_size || rem == 0)) primes.push_back(prime);
} } }
return primes; // primes stored in restrack|row sequence order
}
// Initialize 'nextp' array for twinpair upper residue rhi in 'restwins'.
// Compute 1st prime multiple resgroups for each prime r0..sqrt(N) and
// store consecutively as lo_tp|hi_tp pairs for their restracks.
vector<uint64> nextp_init(uint rhi, uint64 kmin, uint modpg, vector<uint64> primes, vector<int> resinvrs) {
vector<uint64> nextp(primes.size() * 2); // 1st mults array for twinpair
uint r_hi = rhi; // upper twinpair residue
uint r_lo = rhi - 2; // lower twinpair residue
for (int j = 0; j < primes.size(); ++j) { // for each prime r1..sqrt(N)
uint prime = primes[j]; // get the prime
uint k = (prime - 2) / modpg; // find the resgroup it's in
uint r = (prime - 2) % modpg + 2; // and its residue value
uint64 r_inv = resinvrs[r]; // and its residue inverse
uint64 rl = (r_inv * r_lo - 2) % modpg + 2; // compute the ri for r for lo_tp
uint64 rh = (r_inv * r_hi - 2) % modpg + 2; // compute the ri for r for hi_tp
uint64 kl = k * (prime + rl) + (r * rl - 2) / modpg; // kl 1st mult restroup
uint64 kh = k * (prime + rh) + (r * rh - 2) / modpg; // kh 1st mult restroup
if (kl < kmin) { kl = (kmin - kl) % prime; if (kl > 0) kl = prime - kl; } else { kl -= kmin; }
if (kh < kmin) { kh = (kmin - kh) % prime; if (kh > 0) kh = prime - kh; } else { kh -= kmin; }
nextp[j << 1] = kl; // prime's 1st mult lo_tp resgroups in range
nextp[j << 1 | 1] = kh; // prime's 1st mult hi_tp resgroups in range
}
return nextp;
}
// Perform in thread the ssoz for given twinpair residues for kmax resgroups.
// First create|init 'nextp' array of 1st prime mults for given twinpair,
// stored consequtively in 'nextp', and init seg array for ks resgroups.
// For sieve, mark resgroup bits to '1' if either twinpair restrack is nonprime
// for primes mults resgroups, and update 'nextp' restrack slices acccordingly.
// Return the last twinprime|sum for the range for this twinpair residues
tuple<uint64, uint64> twins_sieve(uint r_hi, uint64 kmin, uint64 kmax, uint ks, uint64 start_num,
uint64 end_num, uint modpg, vector<uint64> primes, vector<int>resinvrs) {
uint s = 6; // shift value for 64 bits
uint bmask = (1 << s) - 1; // bitmask val for 64 bits
uint64 sum = 0; // init tp cnt for this twin pair
uint64 ki = kmin - 1; // 1st resgroup seg val for each slice
uint kn = ks; // segment resgroup size
uint64 hi_tp = 0; // init value of largest tp in segment
uint64 k_max = kmax; // max number of resgroups for segment
vector<uint64> seg(((ks-1) >> s) + 1); // seg array for ks resgroups
if (r_hi - 2 < (start_num - 2) % modpg + 2) ++ki; // ensure lo tp in range
if (r_hi > (end_num -2) % modpg + 2) --k_max; // ensure hi tp in range
vector<uint64> nextp = nextp_init(r_hi, ki, modpg, primes, resinvrs); // 1st prime mults
while (ki < k_max) { // for ks resgroup size slices upto kmax
if (ks > (k_max - ki)) kn = (k_max - ki); // adjust kn for last seg size
for (int j = 0; j < primes.size(); ++j) { // for each prime index r1..sqrt(N)
uint prime = primes[j]; // for this prime
// for lower pair residue track
uint64 k1 = nextp[j << 1]; // starting from this lower resgroup in seg
for (; k1 < kn; k1 += prime) // mark primenth resgrouup bits prime mults
seg[k1 >> s] |= uint64(1) << (k1 & bmask);
nextp[j << 1] = k1 - kn; // save 1st lower resgroup in next eligible seg
// for upper pair residue track
uint64 k2 = nextp[j << 1 | 1]; // starting from this upper resgroup in seg
for (; k2 < kn; k2 += prime) // mark primenth resgrouup bits prime mults
seg[k2 >> s] |= uint64(1) << (k2 & bmask);
nextp[j << 1 | 1] = k2 - kn; // save 1st upper resgroup in next eligible seg
} // set as nonprime unused bits in last seg[n]
// so fast, do for every seg[i]
seg[(kn-1) >> s] |= ~uint64(1) << ((kn-1) & bmask);
uint cnt = 0; // count the twinprimes in the segment
for (int k = 0; k <= (kn-1) >> s; ++k) cnt += __builtin_popcountl(~seg[k]);
if (cnt > 0) { // if segment has twin primes
sum += cnt; // add the segment count to total count
uint upk = kn - 1; // from end of seg count backwards to largest tp
while ((seg[upk >> s] & (uint64(1) << (upk & bmask))) != 0) upk--;
hi_tp = ki + upk; // numerate its full resgroup value
}
ki += ks; // set 1st resgroup val of next seg slice
if (ki < k_max) {for (int b = 0; b <= ((kn-1) >> s); ++b) seg[b] = 0;} // set all seg bits prime
}
hi_tp = ((r_hi > end_num) || (sum == 0)) ? 1 : hi_tp * modpg + r_hi;
return make_tuple(hi_tp, sum);
}
int main() {
vector<uint64> x;
uint64 num;
while ( cin >> num ) x.push_back(num);
uint64 end_num = (x[0] > 3) ? x[0] : 3;
uint64 start_num = (x[1] > 3) ? x[1] : 3;
if (start_num > end_num) swap(start_num, end_num);
start_num |= 1; // if start_num even increase by 1
end_num = (end_num - 1) | 1; // if end_num even decrease by 1
if (end_num - start_num < 2) { start_num = 7; end_num = 7; }
cout << "threads = " << omp_get_max_threads() << endl;
auto ts = chrono::system_clock::now(); // start timing sieve setup execution
int modpg, res_0, pairscnt; vector<int> restwins, resinvrs;
uint64 ks, kmin, kmax, krange;
tie(modpg, res_0, ks, kmin, kmax, krange, pairscnt, restwins, resinvrs) = setSieveParameters(start_num, end_num);
// create sieve primes <= sqrt(end_num), only use those whose multiples within inputs range
vector<uint64> primes;
(end_num < 49) ? primes = {5} : primes = sozp5((uint64) sqrt((double) end_num), res_0, start_num, end_num);
cout << "each of "<< pairscnt << " threads has nextp["<< 2 << " x " <<primes.size() << "] array\n";
uint64 twinscnt = 0; // number of twin primes in range
int lo_range = restwins[0] - 3; // lo_range = lo_tp - 1
for (int tp : {3, 5, 11, 17}) { // lo tps for available Pn's
if (end_num == 3) break; // if 3 end of range, no twin primes
if (start_num <= tp && tp < lo_range) twinscnt++; // cnt small tps if any
}
auto te = chrono::system_clock::now() - ts; // sieve setup time
chrono::duration<double> seconds = te;
cout << "setup time = " << seconds.count() << " secs\n";
cout << "perform twinprimes ssoz sieve\n";
auto t1 = chrono::system_clock::now(); // start timing ssoz sieve execution
vector<uint64> cnts(pairscnt); // twins cnts for each twinpair
vector<uint64> lastwins(pairscnt); // last|largest twin per twinpair
std::atomic<uint> threadscnt(0); // count of finished threads
#pragma omp parallel for // process twinpairs in parrallel
for (int i = 0; i < pairscnt; ++i) { // process each twinpair in own thread
uint64 l, c;
tie(l, c) = twins_sieve(restwins[i], kmin, kmax, ks, start_num, end_num, modpg, primes, resinvrs);
lastwins[i] = l; cnts[i] = c;
cout << "\r" << threadscnt++ << " of " << pairscnt << " twinpairs done";
}
cout << "\r" << pairscnt << " of " << pairscnt << " twinpairs done";
uint64 last_twin = 0; // init val for largest twinprime
for (int i = 0; i < pairscnt; ++i) { // find largest twinprime|sum in range
twinscnt += cnts[i];
if (last_twin < lastwins[i]) last_twin = lastwins[i];
}
if ((end_num == 5) && (twinscnt == 1)) last_twin = 5;
uint kn = krange % ks;
if (kn == 0) kn = ks;
auto t2 = chrono::system_clock::now() - t1;
chrono::duration<double> secs1 = t2;
chrono::duration<double> secs2 = t2 + te;
cout << "\nsieve time = " << secs1.count() << " secs\n";
cout << "total time = " << secs2.count() << " secs\n";
cout << "last segment = "<<kn<<" resgroups; segment slices = "<< (krange-1)/ks + 1<<"\n";
cout << "total twins = " << twinscnt << "; last twin = " << (last_twin - 1) << "+/-1"<< endl;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment