Last active
August 12, 2024 16:50
-
-
Save jzakiya/8879c0f4dfda543eaf92a3186de554d7 to your computer and use it in GitHub Desktop.
Cousin Primes generator, multi-threaded, using SSoZ (Segmented Sieve of Zakiya), written in Rust
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This Rust source file is a multiple threaded implementation to perform an | |
// extremely fast Segmented Sieve of Zakiya (SSoZ) to find Cousin Primes <= N. | |
// Inputs are single values N, or ranges N1 and N2, of 64-bits, 0 -- 2^64 - 1. | |
// Output is the number of cousin primes <= N, or in range N1 to N2; the last | |
// cousin prime value for the range; and the total time of execution. | |
// Code originally developed on a System76 laptop with an Intel I7 6700HQ cpu, | |
// 2.6-3.5 GHz clock, with 8 threads, and 16GB of memory. Parameter tuning | |
// probably needed to optimize for other hardware systems (ARM, PowerPC, etc). | |
// Can compile as: $ cargo build --release | |
// or: $ RUSTFLAGS="-C opt-level=3 -C debuginfo=0 -C target-cpu=native" cargo build --release | |
// The later compilation creates faster runtime on my system. | |
// To reduce binary size in target/release/ do: $ strip cousinprimes_ssoz | |
// Single val: $ echo val1 | ./cousinprimes_ssoz | |
// Range vals: $ echo val1 val2 | ./cousinprimes_ssoz | |
// val1 and val2 can now be entered as either: 123456789 or 123_456_789 | |
// Mathematical and technical basis for implementation are explained here: | |
// https://www.academia.edu/37952623/The_Use_of_Prime_Generators_to_Implement_Fast_ | |
// Twin_Primes_Sieve_of_Zakiya_SoZ_Applications_to_Number_Theory_and_Implications_ | |
// for_the_Riemann_Hypotheses | |
// https://www.academia.edu/7583194/The_Segmented_Sieve_of_Zakiya_SSoZ_ | |
// https://www.academia.edu/19786419/PRIMES-UTILS_HANDBOOK | |
// https://www.academia.edu/81206391/Twin_Primes_Segmented_Sieve_of_Zakiya_SSoZ_Explained | |
// This source code, and updates, can be found here: | |
// https://gist.github.com/jzakiya/8879c0f4dfda543eaf92a3186de554d7 | |
// Significant contributions provided from https://users.rust-lang.org/ | |
// This code is provided free and subject to copyright and terms of the | |
// GNU General Public License Version 3, GPLv3, or greater. | |
// License copy/terms are here: http://www.gnu.org/licenses/ | |
// Copyright (c) 2017-2024; Jabari Zakiya -- jzakiya at gmail dot com | |
// Last update: 2024/08/12 | |
extern crate integer_sqrt; | |
extern crate num_cpus; | |
extern crate rayon; | |
use integer_sqrt::IntegerSquareRoot; | |
use rayon::prelude::*; | |
use std::sync::atomic::{self, AtomicUsize, AtomicU8, Ordering}; | |
use std::time::SystemTime; | |
// A counter implemented using relaxed (unsynchronized) atomic operations. | |
struct RelaxedCounter(AtomicUsize); | |
impl RelaxedCounter { | |
fn new() -> Self { RelaxedCounter(AtomicUsize::new(0)) } | |
/// Increment and get the new value. | |
fn increment(&self) -> usize { | |
self.0.fetch_add(1, atomic::Ordering::Relaxed) + 1 } | |
} | |
fn print_time(title: &str, time: SystemTime) { | |
print!("{} = ", title); | |
println!("{} secs", { | |
match time.elapsed() { | |
Ok(e) => e.as_secs() as f64 + e.subsec_nanos() as f64 / 1_000_000_000f64, | |
Err(e) => { panic!("Timer error {:?}", e) }, | |
} | |
}); | |
} | |
// Customized gcd to determine coprimality; n > m; m odd | |
fn coprime(mut m: usize, mut n: usize) -> bool { | |
while m|1 != 1 { let t = m; m = n % m; n = t } | |
m > 0 | |
} | |
// Compute modular inverse a^-1 to base m, e.g. a*(a^-1) mod m = 1 | |
fn modinv(a0: usize, m0: usize) -> usize { | |
if m0 == 1 { return 1 } | |
let (mut a, mut m) = (a0 as isize, m0 as isize); | |
let (mut x0, mut inv) = (0, 1); | |
while a > 1 { | |
inv -= (a / m) * x0; | |
a = a % m; | |
std::mem::swap(&mut a, &mut m); | |
std::mem::swap(&mut x0, &mut inv); | |
} | |
if inv < 0 { inv += m0 as isize } | |
inv as usize | |
} | |
fn gen_pg_parameters(prime: usize) -> (usize, usize, usize, Vec<usize>, Vec<usize>) { | |
// Create prime generator parameters for given Pn | |
println!("using Prime Generator parameters for P{}", prime); | |
let primes: Vec<usize> = vec![2, 3, 5, 7, 11, 13, 17, 19, 23]; | |
let (mut modpg, mut res_0) = (1, 0); // compute Pn's modulus and res_0 value | |
for prm in primes { res_0 = prm; if prm > prime { break }; modpg *= prm } | |
let mut rescousins: Vec<usize> = vec![]; // save upper cousinpair residues here | |
let mut inverses = vec![0usize; modpg+2]; // save Pn's residues inverses here | |
let (mut rc, mut inc, mut res) = (5,2,0); // use P3's PGS to generate residue candidates | |
let midmodpg = modpg >> 1; // mid value for modulus | |
while rc < midmodpg { // find PG's 1st half residues | |
if coprime(rc, modpg) { // if rc is a residue | |
let mc = modpg - rc; // create its modular complement | |
inverses[rc] = modinv(rc, modpg); // save rc and mc inverses | |
inverses[mc] = modinv(mc, modpg); // if in cousinprime save both hi residues | |
if res + 4 == rc { rescousins.push(rc); rescousins.push(mc + 4) } | |
res = rc; // save current found residue | |
} | |
rc += inc; inc ^= 0b110; // create next P3 sequence rc: 5 7 11 13 17 19 ... | |
} | |
rescousins.push(midmodpg + 2); rescousins.sort(); // create pivot hi_cp | |
inverses[modpg + 1] = 1; inverses[modpg - 1] = modpg - 1; // last 2 residues are self inverses | |
(modpg, res_0, rescousins.len(), rescousins, inverses) | |
} | |
fn set_sieve_parameters(start_num: usize, end_num: usize) -> | |
(usize, usize, usize, usize, usize, usize, usize, Vec<usize>, Vec<usize>) { | |
// Select at runtime best PG and segment size parameters for input values. | |
// These are good estimates derived from PG data profiling. Can be improved. | |
let nrange = end_num - start_num; | |
let bn: usize; let pg: usize; | |
if end_num < 49 { | |
bn = 1; pg = 3; | |
} else if nrange < 77_000_000 { | |
bn = 16; pg = 5; | |
} else if nrange < 1_100_000_000 { | |
bn = 32; pg = 7; | |
} else if nrange < 35_500_000_000 { | |
bn = 64; pg = 11; | |
} else if nrange < 14_000_000_000_000 { | |
pg = 13; | |
if nrange > 7_000_000_000_000 { bn = 384; } | |
else if nrange > 2_500_000_000_000 { bn = 320; } | |
else if nrange > 250_000_000_000 { bn = 196; } | |
else { bn = 128; } | |
} else { | |
bn = 384; pg = 17; | |
} | |
let (modpg, res_0, pairscnt, rescousins, resinvrs) = gen_pg_parameters(pg); | |
let kmin = (start_num-2) / modpg + 1; // number of resgroups to start_num | |
let kmax = (end_num - 2) / modpg + 1; // number of resgroups to end_num | |
let krange = kmax - kmin + 1; // number of resgroups in range, at least 1 | |
let n = if krange < 37_500_000_000_000 { 10 } else if krange < 975_000_000_000_000 { 16 } else { 20 }; | |
let b = bn * 1024 * n; // set seg size to optimize for selected PG | |
let ks = if krange < b { krange } else { b }; // segments resgroups size | |
println!("segment size = {} resgroups; seg array is [1 x {}] 64-bits", ks, ((ks-1) >> 6) + 1); | |
let maxpairs = krange * pairscnt; // maximum number of cousinprime pcs | |
println!("cousinprime candidates = {}; resgroups = {}", maxpairs, krange); | |
(modpg, res_0, ks, kmin, kmax, krange, pairscnt, rescousins, resinvrs) | |
} | |
fn atomic_slice(slice: &mut [u8]) -> &[AtomicU8] { | |
unsafe { &*(slice as *mut [u8] as *const [AtomicU8]) } | |
} | |
fn sozp5(val: usize, res_0: usize, start_num : usize, end_num : usize) -> Vec<usize> { | |
// Return the primes r0..sqrt(end_num) within range (start_num...end_num) | |
let (md, rescnt) = (30, 8); // P5's modulus and residues count | |
static RES: [usize; 8] = [7,11,13,17,19,23,29,31]; | |
static BITN: [u8; 30] = [0,0,0,0,0,1,0,0,0,2,0,4,0,0,0,8,0,16,0,0,0,32,0,0,0,0,0,64,0,128]; | |
let range_size = end_num - start_num; // integers size of inputs range | |
let kmax = (val - 2) / md + 1; // number of resgroups upto input value | |
let mut prms = vec![0u8; kmax]; // byte array of prime candidates, init '0' | |
let sqrtn = val.integer_sqrt(); // compute integer sqrt of val | |
let k = sqrtn / md; // compute its resgroup value | |
let (resk, mut r) = (sqrtn - md*k, 0); // compute its residue value; set residue start posn | |
while resk >= RES[r] { r += 1 } // find largest residue <= sqrtn posn in its resgroup | |
let pcs_to_sqrtn = k*rescnt + r; // number of pcs <= sqrtn | |
for i in 0..pcs_to_sqrtn { // for r0..sqrtn primes mark their multiples) | |
let (k, r) = (i/rescnt, i%rescnt); // update resgroup parameters | |
if (prms[k] & (1 << r)) != 0 { continue } // skip pc if not prime | |
let prm_r = RES[r]; // if prime save its residue value | |
let prime = md*k + prm_r; // numerate its value | |
let rem = start_num % prime; // prime's modular distance to start_num | |
if !(prime - rem <= range_size || rem == 0) { continue } // skip prime if no multiple in range | |
let prms_atomic = atomic_slice(&mut prms); // to parallel sieve along bit rows | |
RES.par_iter().for_each (|ri| { // mark prime's multiples in prms in parallel | |
let prod = prm_r * ri - 2; // compute cross-product for prm_r|ri pair | |
let bit_r = BITN[prod % md]; // bit mask value for prod's residue | |
let mut kpm = k * (prime + ri) + prod / md; // resgroup for prime's 1st multiple | |
while kpm < kmax { prms_atomic[kpm].fetch_or(bit_r, Ordering::Relaxed); kpm += prime; }; | |
}); | |
} | |
// prms now contains the prime multiples positions marked for the pcs r0..N | |
// in parallel along each restrack, identify|numerate the primes in each resgroup | |
// return only the primes with a multiple within range (start_num...end_num) | |
let primes = RES.par_iter().enumerate().flat_map_iter( |(i, ri)| { | |
prms.iter().enumerate().filter_map(move |(k, resgroup)| { | |
if resgroup & (1 << i) == 0 { | |
let prime = md * k + ri; | |
let rem = start_num % prime; | |
if (prime >= res_0 && prime <= val) && (prime - rem <= range_size || rem == 0) { | |
return Some(prime); | |
} } None | |
}) }).collect(); | |
primes | |
} | |
fn nextp_init(rhi: usize, kmin: usize, modpg: usize, primes: &[usize], resinvrs: &[usize]) -> Vec<usize> { | |
// Initialize 'nextp' array for cousinpair upper residue rhi in 'rescousins'. | |
// Compute 1st prime multiple resgroups for each prime r0..sqrt(N) and | |
// store consecutively as lo_tp|hi_tp pairs for their restracks. | |
let mut nextp = vec![0usize; primes.len() * 2]; // 1st mults array for cousinpair | |
let (r_hi, r_lo) = (rhi, rhi - 4); // upper|lower cousinpair residue values | |
for (j, prime) in primes.iter().enumerate() { // for each prime r0..sqrt(N) | |
let k = (prime - 2) / modpg; // find the resgroup it's in | |
let r = (prime - 2) % modpg + 2; // and its residue value | |
let r_inv = resinvrs[r]; // and residue inverse | |
let rl = (r_lo * r_inv - 2) % modpg + 2; // compute r's rl for r_lo | |
let rh = (r_hi * r_inv - 2) % modpg + 2; // compute r's rh for r_hi | |
let mut kl = k * (prime + rl) + (r * rl - 2) / modpg; // kl 1st mult resgroup | |
let mut kh = k * (prime + rh) + (r * rh - 2) / modpg; // kh 1st mult restroup | |
if kl < kmin { kl = (kmin - kl) % prime; if kl > 0 { kl = prime - kl } } | |
else { kl = kl - kmin }; | |
if kh < kmin { kh = (kmin - kh) % prime; if kh > 0 { kh = prime - kh } } | |
else { kh = kh - kmin }; | |
nextp[j * 2] = kl; // prime's 1st mult lo_cp resgroups in range | |
nextp[j * 2 | 1] = kh; // prime's 1st mult hi_cp resgroups in range | |
} | |
nextp | |
} | |
fn cousins_sieve(r_hi: usize, kmin: usize, kmax: usize, ks: usize, start_num: usize, | |
end_num: usize, modpg: usize, primes: &[usize], resinvrs: &[usize]) -> (usize, usize) { | |
// Perform in thread the ssoz for given cousinpair residues for kmax resgroups. | |
// First create|init 'nextp' array of 1st prime mults for given cousinpair, | |
// stored consequtively in 'nextp', and init seg array for ks resgroups. | |
// For sieve, mark resgroup bits to '1' if either cousinpair restrack is nonprime | |
// for primes mults resgroups, and update 'nextp' restrack slices acccordingly. | |
// Return the last cousinprime|sum for the range for this cousinpair residues. | |
// For speed, disable runtime seg array bounds checking; using 64-bit elem seg array | |
unsafe { // allow fast array indexing | |
type MWord = u64; // mem size for 64-bit cpus | |
const S: usize = 6; // shift value for 64 bits | |
const BMASK: usize = (1 << S) - 1; // bitmask val for 64 bits | |
let (mut sum, mut ki, mut kn) = (0usize, kmin-1, ks); // init these parameters | |
let (mut hi_cp, mut k_max) = (0usize, kmax); // max cousinprime|resgroup val | |
let mut seg = vec![0 as MWord; ((ks - 1) >> S) + 1]; // seg array for ks resgroups | |
if r_hi - 4 < (start_num - 2) % modpg +2 { ki += 1; } // ensure lo cp in range | |
if r_hi > (end_num - 2) % modpg + 2 { k_max -= 1; } // ensure hi cp in range | |
let mut nextp = nextp_init(r_hi, ki, modpg, primes, resinvrs); // init nextp array | |
while ki < k_max { // for ks size slices upto kmax | |
if ks > (k_max - ki) { kn = k_max - ki } // adjust kn size for last slice | |
for (j, prime) in primes.iter().enumerate() { // for each prime r0..sqrt(N) | |
// for lower cousinpair residue track | |
let mut k = *nextp.get_unchecked(j * 2); // starting from this resgroup in seg | |
while k < kn { // mark primenth resgrouup bits prime mults | |
*seg.get_unchecked_mut(k >> S) |= 1 << (k & BMASK); | |
k += prime; } // set resgroup for prime's next multiple | |
*nextp.get_unchecked_mut(j * 2) = k - kn; // save 1st resgroup in next eligible seg | |
// for upper cousinpair residue track | |
k = *nextp.get_unchecked(j * 2 | 1); // starting from this resgroup in seg | |
while k < kn { // mark primenth resgroup bits prime mults | |
*seg.get_unchecked_mut(k >> S) |= 1 << (k & BMASK); | |
k += prime; } // set resgroup for prime's next multiple | |
*nextp.get_unchecked_mut(j * 2 | 1) = k - kn; // save 1st resgroup in next eligible seg | |
} // set as nonprime unused bits in last seg[n] | |
// so fast, do for every seg[i] | |
*seg.get_unchecked_mut((kn - 1) >> S) |= !1u64 << ((kn - 1) & BMASK); | |
let mut cnt = 0usize; // count the twinprimes in the segment | |
for &m in &seg[0..=(kn - 1) >> S] { cnt += m.count_zeros() as usize; } | |
if cnt > 0 { // if segment has cousinprimes | |
sum += cnt; // add the segment count to total count | |
let mut upk = kn - 1; // from end of seg count back to largest cp | |
while *seg.get_unchecked(upk >> S) & (1 << (upk & BMASK)) != 0 { upk -= 1 } | |
hi_cp = ki + upk; // set resgroup value for largest cp in seg | |
} | |
ki += ks; // set 1st resgroup val of next seg slice | |
if ki < k_max { seg.fill(0); } // set seg to all primes | |
} // when sieve done, numerate largest cp in range | |
// for small ranges w/o cousins, set largest to 0 | |
hi_cp = if r_hi > end_num || sum == 0 { 0 } else { hi_cp * modpg + r_hi }; | |
(hi_cp, sum) // return largest cousinprime|cousins count | |
} | |
} | |
fn main() { | |
let mut val = String::new(); // Inputs are 1|2 values < 2**64; w/wo underscores: 12345|12_345 | |
std::io::stdin().read_line(&mut val).expect("Failed to read line"); | |
let mut substr_iter = val.split_whitespace(); | |
let mut next_or_default = |def| -> usize { | |
substr_iter.next().unwrap_or(def).replace('_', "").parse().expect("Input is not a number") | |
}; | |
let mut end_num = std::cmp::max(next_or_default("3"), 3); // min vals 3 | |
let mut start_num = std::cmp::max(next_or_default("3"), 3); | |
if start_num > end_num { std::mem::swap(&mut end_num, &mut start_num) } | |
start_num |= 1; // if start_num even increase by 1 | |
end_num = (end_num - 1) | 1; // if end_num even decrease by 1 | |
if end_num - start_num < 4 { end_num = 7; start_num = 7 } | |
println!("threads = {}", num_cpus::get()); | |
let ts = SystemTime::now(); // start timing sieve setup execution | |
// select Pn, set sieving params for inputs | |
let (modpg, res_0, ks, kmin, kmax, krange, pairscnt, rescousins, resinvrs) = set_sieve_parameters(start_num, end_num); | |
// create sieve primes <= sqrt(end_num), only use those whose multiples within inputs range | |
let primes: Vec<usize> = if end_num < 49 { vec![5] } | |
else { sozp5(end_num.integer_sqrt(), res_0, start_num, end_num) }; | |
println!("each of {} threads has nextp[2 x {}] array", pairscnt, primes.len()); | |
print_time("setup time", ts); // sieve setup time | |
println!("perform cousinprimes ssoz sieve"); | |
let t1 = SystemTime::now(); // start timing ssoz sieve execution | |
// sieve each cousinpair restracks in parallel | |
let (lastcousins, cnts): (Vec<_>, Vec<_>) = { // store outputs in these arrays | |
let counter = RelaxedCounter::new(); | |
rescousins.par_iter().map( |r_hi| { | |
let out = cousins_sieve(*r_hi, kmin, kmax, ks, start_num, end_num, modpg, &primes, &resinvrs); | |
print!("\r{} of {} cousinpairs done", counter.increment(), pairscnt); | |
out | |
}).unzip() | |
}; | |
let mut cousinscnt = 0usize; // number of cousinprimes in range | |
let mut last_cousin = 0usize; // largest hi_cp cousin in range | |
if end_num < 49 { // check for cousins in low ranges | |
for c_hi in [11, 17, 23, 41, 47 ].into_iter() { | |
if start_num <= c_hi - 4 && c_hi <= end_num { last_cousin = c_hi; cousinscnt += 1} | |
} } | |
else { // check for cousins from sieve | |
for (i, cnt_i) in cnts.iter().enumerate() { | |
cousinscnt += cnt_i; | |
if last_cousin < lastcousins[i] { last_cousin = lastcousins[i]; } | |
} } | |
// account for 1st cousin prime, defined as (3, 7) | |
if start_num == 3 && end_num > 6 { cousinscnt += 1; if end_num < 11 { last_cousin = 7 }} | |
let mut kn = krange % ks; // set number of resgroups in last slice | |
if kn == 0 { kn = ks }; // if multiple of seg size set to seg size | |
print_time("\nsieve time", t1); // ssoz sieve time | |
print_time("total time", ts); // setup + sieve time | |
println!("last segment = {} resgroups; segment slices = {}", kn, (krange - 1)/ks + 1); | |
println!("total cousins = {}; last cousin = {}|-4", cousinscnt, last_cousin); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment