Last active
August 12, 2024 16:47
-
-
Save jzakiya/4b04f8ab9f911c6fc91ed616ae6d4281 to your computer and use it in GitHub Desktop.
Primes generator, multi-threaded using rayon, using SSoZ (Segmented Sieve of Zakiya), written in Rust
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This Rust source file is a multiple threaded implementation to perform an | |
// extremely fast Segmented Sieve of Zakiya (SSoZ) to find Primes <= N. | |
// Inputs are single values N, or ranges N1 and N2, of 64-bits, 0 -- 2^64 - 1. | |
// Output is the number of twin primes <= N, or in range N1 to N2; the last | |
// twin prime value for the range; and the total time of execution. | |
// Code originally developed on a System76 laptop with an Intel I7 6700HQ cpu, | |
// 2.6-3.5 GHz clock, with 8 threads, and 16GB of memory. Parameter tuning | |
// probably needed to optimize for other hardware systems (ARM, PowerPC, etc). | |
// Can compile as: $ cargo build --release | |
// or: $ RUSTFLAGS="-C opt-level=3 -C debuginfo=0 -C target-cpu=native" cargo build --release | |
// The later compilation creates faster runtime on my system. | |
// To reduce binary size in target/release/ do: $ strip primes_ssoz | |
// Single val: $ echo val1 | ./primes_ssoz | |
// Range vals: $ echo val1 val2 | ./primes_ssoz | |
// val1 and val2 can now be entered as either: 123456788 or 123_456_789 | |
// Mathematical and technical basis for implementation are explained here: | |
// https://www.academia.edu/37952623/The_Use_of_Prime_Generators_to_Implement_Fast_ | |
// Twin_Primes_Sieve_of_Zakiya_SoZ_Applications_to_Number_Theory_and_Implications_ | |
// for_the_Riemann_Hypotheses | |
// https://www.academia.edu/7583194/The_Segmented_Sieve_of_Zakiya_SSoZ_ | |
// https://www.academia.edu/19786419/PRIMES-UTILS_HANDBOOK | |
// https://www.academia.edu/81206391/Twin_Primes_Segmented_Sieve_of_Zakiya_SSoZ_Explained | |
// This source code, and updates, can be found here: | |
// https://gist.github.com/jzakiya/4b04f8ab9f911c6fc91ed616ae6d4281 | |
// Significant contributions provided from https://users.rust-lang.org/ | |
// This code is provided free and subject to copyright and terms of the | |
// GNU General Public License Version 3, GPLv3, or greater. | |
// License copy/terms are here: http://www.gnu.org/licenses/ | |
// Copyright (c) 2017-2024; Jabari Zakiya -- jzakiya at gmail dot com | |
// Last update: 2024/08/12 | |
extern crate integer_sqrt; | |
extern crate num_cpus; | |
extern crate rayon; | |
use integer_sqrt::IntegerSquareRoot; | |
use rayon::prelude::*; | |
use std::sync::atomic::{self, AtomicUsize, AtomicU8, Ordering}; | |
use std::time::SystemTime; | |
// A counter implemented using relaxed (unsynchronized) atomic operations. | |
struct RelaxedCounter(AtomicUsize); | |
impl RelaxedCounter { | |
fn new() -> Self { RelaxedCounter(AtomicUsize::new(0)) } | |
/// Increment and get the new value. | |
fn increment(&self) -> usize { | |
self.0.fetch_add(1, atomic::Ordering::Relaxed) + 1 } | |
} | |
fn print_time(title: &str, time: SystemTime) { | |
print!("{} = ", title); | |
println!("{} secs", { | |
match time.elapsed() { | |
Ok(e) => e.as_secs() as f64 + e.subsec_nanos() as f64 / 1_000_000_000f64, | |
Err(e) => { panic!("Timer error {:?}", e) }, | |
} | |
}); | |
} | |
// Customized gcd to determine coprimality; n > m; m odd | |
fn coprime(mut m: usize, mut n: usize) -> bool { | |
while m|1 != 1 { let t = m; m = n % m; n = t } | |
m > 0 | |
} | |
// Compute modular inverse a^-1 to base m, e.g. a*(a^-1) mod m = 1 | |
fn modinv(a0: usize, m0: usize) -> usize { | |
if m0 == 1 { return 1 } | |
let (mut a, mut m) = (a0 as isize, m0 as isize); | |
let (mut x0, mut inv) = (0, 1); | |
while a > 1 { | |
inv -= (a / m) * x0; | |
a = a % m; | |
std::mem::swap(&mut a, &mut m); | |
std::mem::swap(&mut x0, &mut inv); | |
} | |
if inv < 0 { inv += m0 as isize } | |
inv as usize | |
} | |
fn gen_pg_parameters(prime: usize) -> (usize, usize, usize, Vec<usize>, Vec<usize>) { | |
// Create prime generator parameters for given Pn | |
println!("using Prime Generator parameters for P{}", prime); | |
let primes: Vec<usize> = vec![2, 3, 5, 7, 11, 13, 17, 19, 23]; | |
let (mut modpg, mut res_0) = (1, 0); // compute Pn's modulus and res_0 value | |
for prm in primes { res_0 = prm; if prm > prime { break }; modpg *= prm } | |
let mut residues: Vec<usize> = vec![]; // save Pn's residues here | |
let mut inverses = vec![0usize; modpg+2]; // save Pn's residues inverses here | |
let (mut rc, mut inc) = (5,2); // use P3's PGS to generate residue candidates | |
while rc < (modpg >> 1) { // find Pn's 1st half residues | |
if coprime(rc, modpg) { // if rc is a residue | |
let mc = modpg - rc; // create its modular complement | |
inverses[rc] = modinv(rc, modpg); // save rc inverse | |
residues.push(mc); // save mc residue | |
inverses[mc] = modinv(mc, modpg); // save mc inverse | |
residues.push(rc); // save rc residue | |
} | |
rc += inc; inc ^= 0b110; // create next P3 sequence rc: 5 7 11 13 17 19 ... | |
} | |
residues.sort(); residues.push(modpg - 1); residues.push(modpg + 1); // last residue is last hi_tp | |
inverses[modpg + 1] = 1; inverses[modpg - 1] = modpg - 1; // last 2 residues are self inverses | |
(modpg, res_0, residues.len(), residues, inverses) | |
} | |
fn set_sieve_parameters(start_num: usize, end_num: usize) -> | |
(usize, usize, usize, usize, usize, usize, usize, Vec<usize>, Vec<usize>) { | |
// Select at runtime best PG and segment size parameters for input values. | |
// These are good estimates derived from PG data profiling. Can be improved. | |
let nrange = end_num - start_num; | |
let bn: usize; let pg: usize; | |
if end_num < 49 { | |
bn = 1; pg = 3; | |
} else if nrange < 77_000_000 { | |
bn = 16; pg = 5; | |
} else if nrange < 1_100_000_000 { | |
bn = 32; pg = 7; | |
} else if nrange < 35_500_000_000 { | |
bn = 64; pg = 11; | |
} else if nrange < 140_000_000_000_000 { | |
pg = 13; | |
if nrange > 7_000_000_000_000 { bn = 384; } | |
else if nrange > 2_500_000_000_000 { bn = 320; } | |
else if nrange > 250_000_000_000 { bn = 196; } | |
else { bn = 128; } | |
} else { | |
bn = 384; pg = 17; | |
} | |
let (modpg, res_0, rescnt, residues, resinvrs) = gen_pg_parameters(pg); | |
let kmin = (start_num-2) / modpg + 1; // number of resgroups to start_num | |
let kmax = (end_num - 2) / modpg + 1; // number of resgroups to end_num | |
let krange = kmax - kmin + 1; // number of resgroups in range, at least 1 | |
let n = if krange < 37_500_000_000_000 { 10 } else if krange < 975_000_000_000_000 { 16 } else { 20 }; | |
let b = bn * 1024 * n; // set seg size to optimize for selected PG | |
let ks = if krange < b { krange } else { b }; // segments resgroups size | |
println!("segment size = {} resgroups; seg array is [1 x {}] 64-bits", ks, ((ks-1) >> 6) + 1); | |
let maxprimes = krange * rescnt; // maximum number of prime pcs | |
println!("prime candidates = {}; resgroups = {}", maxprimes, krange); | |
(modpg, res_0, ks, kmin, kmax, krange, rescnt, residues, resinvrs) | |
} | |
fn atomic_slice(slice: &mut [u8]) -> &[AtomicU8] { | |
unsafe { &*(slice as *mut [u8] as *const [AtomicU8]) } | |
} | |
fn sozp5(val: usize, res_0: usize, start_num : usize, end_num : usize) -> Vec<usize> { | |
// Return the primes r0..sqrt(end_num) within range (start_num...end_num) | |
let (md, rescnt) = (30, 8); // P5's modulus and residues count | |
static RES: [usize; 8] = [7,11,13,17,19,23,29,31]; | |
static BITN: [u8; 30] = [0,0,0,0,0,1,0,0,0,2,0,4,0,0,0,8,0,16,0,0,0,32,0,0,0,0,0,64,0,128]; | |
let range_size = end_num - start_num; // integers size of inputs range | |
let kmax = (val - 2) / md + 1; // number of resgroups upto input value | |
let mut prms = vec![0u8; kmax]; // byte array of prime candidates, init '0' | |
let sqrtn = val.integer_sqrt(); // compute integer sqrt of val | |
let k = sqrtn / md; // compute its resgroup value | |
let (refk, mut r) = (sqrtn - md*k, 0); // compute its residue value; set residue start posn | |
while refk >= RES[r] { r += 1 } // find largest residue <= sqrtn posn in its resgroup | |
let pcs_to_sqrtn = k*rescnt + r; // number of pcs <= sqrtn | |
for i in 0..pcs_to_sqrtn { // for r0..sqrtn primes mark their multiples) | |
let (k, r) = (i/rescnt, i%rescnt); // update resgroup parameters | |
if (prms[k] & (1 << r)) != 0 { continue } // skip pc if not prime | |
let prm_r = RES[r]; // if prime save its residue value | |
let prime = md*k + prm_r; // numerate its value | |
let rem = start_num % prime; // prime's modular distance to start_num | |
if !(prime - rem <= range_size || rem == 0) { continue } // skip prime if no multiple in range | |
let prms_atomic = atomic_slice(&mut prms); // to parallel sieve along bit rows | |
RES.par_iter().for_each (|ri| { // mark prime's multiples in prms in parallel | |
let prod = prm_r * ri - 2; // compute cross-product for prm_r|ri pair | |
let bit_r = BITN[prod % md]; // bit mask value for prod's residue | |
let mut kpm = k * (prime + ri) + prod / md; // resgroup for prime's 1st multiple | |
while kpm < kmax { prms_atomic[kpm].fetch_or(bit_r, Ordering::Relaxed); kpm += prime; }; | |
}); | |
} | |
// prms now contains the prime multiples positions marked for the pcs r0..N | |
// in parallel along each restrack, identify|numerate the primes in each resgroup | |
// return only the primes with a multiple within range (start_num...end_num) | |
let primes = RES.par_iter().enumerate().flat_map_iter( |(i, ri)| { | |
prms.iter().enumerate().filter_map(move |(k, resgroup)| { | |
if resgroup & (1 << i) == 0 { | |
let prime = md * k + ri; | |
let rem = start_num % prime; | |
if (prime >= res_0 && prime <= val) && (prime - rem <= range_size || rem == 0) { | |
return Some(prime); | |
} } None | |
}) }).collect(); | |
primes | |
} | |
fn nextp_init(r_i: usize, kmin: usize, modpg: usize, primes: &[usize], resinvrs: &[usize]) -> Vec<usize> { | |
// Initialize 'nextp' array for residue r_i in 'residues'. | |
// Compute|store 1st prime multiple resgroups for each sieve prime along r_i. | |
let mut nextp = vec![0usize; primes.len()]; // 1st mults array for r_i | |
for (j, prime) in primes.iter().enumerate() { // for each prime r0..sqrt(N) | |
let k = (prime - 2) / modpg; // find the resgroup it's in | |
let r = (prime - 2) % modpg + 2; // and its residue value | |
let r_inv = resinvrs[r]; // and residue inverse | |
let ri = (r_i * r_inv - 2) % modpg + 2; // compute r's ri for r_i | |
let mut ki = k * (prime + ri) + (r * ri - 2) / modpg; // and 1st mult | |
if ki < kmin { ki = (kmin - ki) % prime; if ki > 0 { ki = prime - ki } } | |
else { ki = ki - kmin }; | |
nextp[j] = ki; | |
} | |
nextp | |
} | |
fn primes_sieve(r_i: usize, kmin: usize, kmax: usize, ks: usize, start_num: usize, | |
end_num: usize, modpg: usize, primes: &[usize], resinvrs: &[usize]) -> (usize, usize) { | |
// Perform in thread the ssoz for given prime residue r_i for Kmax resgroups. | |
// Create 'nextp' array of 1st prime mults for r_i, and seg array for ks resgroups. | |
// Mark resgroup bits to '1' that are prime multiples, and update 'nextp' for each prime. | |
// Return the number of primes, and largest one, in the range for restrack r_i. | |
// For speed, disable runtime seg array bounds checking; using 64-bit elem seg array | |
unsafe { // allow fast array indexing | |
type MWord = u64; // mem size for 64-bit cpus | |
const S: usize = 6; // shift value for 64 bits | |
const BMASK: usize = (1 << S) - 1; // bitmask val for 64 bits | |
let (mut sum, mut ki, mut kn) = (0usize, kmin-1, ks);// init these parameters | |
let (mut hi_p, mut k_max) = (0usize, kmax); // max prime|resgroup val | |
let mut seg = vec![0 as MWord; ((ks - 1) >> S) + 1]; // seg array for kb resgroups | |
if r_i < (start_num - 2) % modpg + 2 { ki += 1; } // ensure lo val in range for r_i | |
if r_i > (end_num -2) % modpg + 2 { k_max -= 1; } // ensure hi val in range for r_i | |
let mut nextp = nextp_init(r_i, ki, modpg, primes, resinvrs); // init nextp array | |
while ki < k_max { // for ks size slices upto kmax | |
if ks > (k_max - ki) { kn = k_max - ki } // adjust kn size for last seg | |
for (j, prime) in primes.iter().enumerate() { // for each sieve prime | |
// with multiples along r_i for range | |
let mut k = *nextp.get_unchecked(j); // starting from this resgroup in seg | |
while k < kn { // mark primenth resgrouup bits prime mults | |
*seg.get_unchecked_mut(k >> S) |= 1 << (k & BMASK); | |
k += prime; } // set resgroup for prime's next multiple | |
*nextp.get_unchecked_mut(j) = k - kn; // save 1st resgroup in next eligible seg | |
} // set as nonprime unused bits in last seg[n] | |
// so fast, do for every seg[i] | |
*seg.get_unchecked_mut((kn - 1) >> S) |= (!0u64 << ((kn - 1) & BMASK)) << 1; | |
let mut cnt = 0usize; // count the twinprimes in the segment | |
for &m in &seg[0..=(kn - 1) >> S] { cnt += m.count_zeros() as usize; } | |
if cnt > 0 { // if segment has primes | |
sum += cnt; // add segment count to total range count | |
let mut upk = kn - 1; // from end of seg, count down to largest prime | |
while *seg.get_unchecked(upk >> S) & (1 << (upk & BMASK)) != 0 { upk -= 1 } | |
hi_p = ki + upk; // set resgroup value for largest tp in seg | |
} | |
ki += ks; // set 1st resgroup val of next seg slice | |
if ki < k_max { seg.fill(0); } // set seg to all primes (for >= version 1.50) | |
} // when sieve done, numerate largest prime in range | |
// for small ranges w/o primes, set largest to 0 | |
hi_p = if r_i > end_num || sum == 0 { 0 } else { hi_p * modpg + r_i }; | |
(hi_p, sum) | |
} | |
} | |
fn main() { | |
let mut val = String::new(); // Inputs are 1|2 values < 2**64; w/wo underscores: 12345|12_345 | |
std::io::stdin().read_line(&mut val).expect("Failed to read line"); | |
let mut substr_iter = val.split_whitespace(); | |
let mut next_or_default = |def| -> usize { | |
substr_iter.next().unwrap_or(def).replace('_', "").parse().expect("Input is not a number") | |
}; | |
let mut end_num = std::cmp::max(next_or_default("1"), 1); | |
let mut start_num = std::cmp::max(next_or_default("1"), 1); | |
if start_num > end_num { std::mem::swap(&mut end_num, &mut start_num) } | |
if end_num < 2 && start_num < 2 {start_num = 4; end_num = 4} | |
if end_num > 1 && start_num == 1 {start_num = 2} | |
println!("threads = {}", num_cpus::get()); | |
let ts = SystemTime::now(); // start timing sieve setup execution | |
// select Pn, set sieving params for inputs | |
let (modpg, res_0, ks, kmin, kmax, range, rescnt, residues, resinvrs) = set_sieve_parameters(start_num, end_num); | |
// create sieve primes <= sqrt(end_num), only use those whose multiples within inputs range | |
let primes: Vec<usize> = if end_num < 49 { vec![5] } | |
else { sozp5(end_num.integer_sqrt(), res_0, start_num, end_num) }; | |
println!("each of {} threads has nextp[1 x {}] array", rescnt, primes.len()); | |
print_time("setup time", ts); // sieve setup time | |
let mut primescnt = 0usize; // init primes range count | |
let mut last_prime = 0usize; // init val for largest prime | |
let lo_prime = residues[0]; // first residue|prime for selected PG | |
for prm in [2, 3, 5, 7, 11, 13, 17] { // cnt small primes, keep last one | |
if prm >= start_num && prm < lo_prime { primescnt += 1; last_prime = prm }; | |
} | |
println!("perform primes ssoz sieve"); | |
let t1 = SystemTime::now(); // start timing ssoz sieve execution | |
// sieve each prime restracks in parallel | |
let (lastprimes, cnts): (Vec<_>, Vec<_>) = { // store outputs in these arrays | |
let counter = RelaxedCounter::new(); | |
residues.par_iter().map( |r_i| { | |
let out = primes_sieve(*r_i, kmin, kmax, ks, start_num, end_num, modpg, &primes, &resinvrs); | |
print!("\r{} of {} residues done", counter.increment(), rescnt); | |
out | |
}).unzip() | |
}; | |
for i in 0..rescnt { // find largest twinprime|sum in range | |
primescnt += cnts[i]; | |
if last_prime < lastprimes[i] { last_prime = lastprimes[i]; } | |
} | |
if start_num == 2 && end_num == 2 {primescnt = 1; last_prime = 2} // for both inputs = 2 | |
let mut kn = range % ks; // set number of resgroups in last slice | |
if kn == 0 { kn = ks }; // if multiple of seg size set to seg size | |
print_time("\nsieve time", t1); // ssoz sieve time | |
print_time("total time", ts); // setup + sieve time | |
println!("last segment = {} resgroups; segment slices = {}", kn, (range - 1)/ks + 1); | |
println!("total primes = {}; last prime = {}", primescnt, last_prime); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment