Last active
August 29, 2019 15:34
-
-
Save thehans/0669096bebde08b153672cab34e3c630 to your computer and use it in GitHub Desktop.
POC "PunchWheel" tweak to Kim Walisch's Segmented Sieve example. (First revision is Kim's unedited for comparison)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// @file segmented_bit_sieve.cpp | |
/// @author Kim Walisch, <[email protected]> | |
/// @brief This is an implementation of the segmented sieve of | |
/// Eratosthenes which uses a bit array with 16 numbers per | |
/// byte. It generates the primes below 10^10 in 7.25 seconds | |
/// on an Intel Core i7-6700 3.4 GHz CPU. | |
/// @license Public domain. | |
#include <iostream> | |
#include <algorithm> | |
#include <cmath> | |
#include <vector> | |
#include <cstdlib> | |
#include <cstdint> | |
#include <cassert> | |
#include <cstring> | |
// # of bits shifted to convert between a number <-> byte offset. | |
// Evens are skipped, so 8bits*2 == 16 == (1<<4) | |
constexpr size_t BASESHIFT = 4; | |
// simple helper for below | |
constexpr size_t tobitpow(size_t x) { | |
switch(x) { | |
case 1: return 0; | |
case 2: return 1; | |
case 4: return 2; | |
case 8: return 3; | |
case 16: return 4; | |
case 32: return 5; | |
case 64: return 6; | |
case 128: return 7; | |
case 256: return 8; | |
default: assert(false && "bad input, expected a smallish power of 2"); return 0; | |
}; | |
} | |
// Declare the width of "word" used for PunchWheel and Sieve vectors | |
// And some helper constants to make converting to other widths a little easier | |
typedef uint64_t word_t; | |
constexpr word_t WORDINIT = ~word_t(0); // initialization value with all bits set | |
constexpr size_t WORDBYTES = sizeof(word_t); | |
constexpr size_t WORDSHIFT = tobitpow(WORDBYTES); // # of bits shifted to convert between word count <-> byte count | |
constexpr size_t WORDMASK = WORDBYTES-1; // mask based on WORDSHIFT bits | |
constexpr size_t FULLSHIFT = 4+WORDSHIFT; // # of bits shifted to convert between actual number <-> bit index | |
constexpr size_t FULLMASK = (1<<FULLSHIFT)-1; // mask based on above | |
/// Set your CPU's L1 data cache size (in bytes) here | |
const int64_t L1D_CACHE_SIZE = 32768; | |
/// Bitmasks needed to unset bits corresponding to multiples | |
const uint64_t unset_bit64[128] = | |
{ | |
~(1ull << 0), ~(1ull << 0), | |
~(1ull << 1), ~(1ull << 1), | |
~(1ull << 2), ~(1ull << 2), | |
~(1ull << 3), ~(1ull << 3), | |
~(1ull << 4), ~(1ull << 4), | |
~(1ull << 5), ~(1ull << 5), | |
~(1ull << 6), ~(1ull << 6), | |
~(1ull << 7), ~(1ull << 7), | |
~(1ull << 8), ~(1ull << 8), | |
~(1ull << 9), ~(1ull << 9), | |
~(1ull << 10), ~(1ull << 10), | |
~(1ull << 11), ~(1ull << 11), | |
~(1ull << 12), ~(1ull << 12), | |
~(1ull << 13), ~(1ull << 13), | |
~(1ull << 14), ~(1ull << 14), | |
~(1ull << 15), ~(1ull << 15), | |
~(1ull << 16), ~(1ull << 16), | |
~(1ull << 17), ~(1ull << 17), | |
~(1ull << 18), ~(1ull << 18), | |
~(1ull << 19), ~(1ull << 19), | |
~(1ull << 20), ~(1ull << 20), | |
~(1ull << 21), ~(1ull << 21), | |
~(1ull << 22), ~(1ull << 22), | |
~(1ull << 23), ~(1ull << 23), | |
~(1ull << 24), ~(1ull << 24), | |
~(1ull << 25), ~(1ull << 25), | |
~(1ull << 26), ~(1ull << 26), | |
~(1ull << 27), ~(1ull << 27), | |
~(1ull << 28), ~(1ull << 28), | |
~(1ull << 29), ~(1ull << 29), | |
~(1ull << 30), ~(1ull << 30), | |
~(1ull << 31), ~(1ull << 31), | |
~(1ull << 32), ~(1ull << 32), | |
~(1ull << 33), ~(1ull << 33), | |
~(1ull << 34), ~(1ull << 34), | |
~(1ull << 35), ~(1ull << 35), | |
~(1ull << 36), ~(1ull << 36), | |
~(1ull << 37), ~(1ull << 37), | |
~(1ull << 38), ~(1ull << 38), | |
~(1ull << 39), ~(1ull << 39), | |
~(1ull << 40), ~(1ull << 40), | |
~(1ull << 41), ~(1ull << 41), | |
~(1ull << 42), ~(1ull << 42), | |
~(1ull << 43), ~(1ull << 43), | |
~(1ull << 44), ~(1ull << 44), | |
~(1ull << 45), ~(1ull << 45), | |
~(1ull << 46), ~(1ull << 46), | |
~(1ull << 47), ~(1ull << 47), | |
~(1ull << 48), ~(1ull << 48), | |
~(1ull << 49), ~(1ull << 49), | |
~(1ull << 50), ~(1ull << 50), | |
~(1ull << 51), ~(1ull << 51), | |
~(1ull << 52), ~(1ull << 52), | |
~(1ull << 53), ~(1ull << 53), | |
~(1ull << 54), ~(1ull << 54), | |
~(1ull << 55), ~(1ull << 55), | |
~(1ull << 56), ~(1ull << 56), | |
~(1ull << 57), ~(1ull << 57), | |
~(1ull << 58), ~(1ull << 58), | |
~(1ull << 59), ~(1ull << 59), | |
~(1ull << 60), ~(1ull << 60), | |
~(1ull << 61), ~(1ull << 61), | |
~(1ull << 62), ~(1ull << 62), | |
~(1ull << 63), ~(1ull << 63) | |
}; | |
const int64_t popcnt[256] = | |
{ | |
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, | |
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, | |
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, | |
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, | |
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, | |
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, | |
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, | |
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, | |
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, | |
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 | |
}; | |
// copy (count) bytes from src to src+count, repeating contiguously until n total instances exist. | |
// src must have allocation for n*count bytes | |
void memextend(void* src, const size_t count, size_t n) { | |
if (n>1) { | |
char* s = (char*)src; | |
n--; // use n to track how many remaining to copy; src counts as 1 already | |
char* d = s+count; | |
for(size_t count2=count,scale=1; n>=scale; d+=count2, n-=scale, count2<<=1,scale<<=1) { | |
memcpy(d, s, count2); | |
} | |
if (n) memcpy(d,s,n*count); // fill remainder that doesn't fit power of 2 | |
} | |
} | |
// PunchWheel keeps a vector of words to apply to the sieve. | |
// Each vector begins with a partial word | |
class PunchWheel | |
{ | |
typedef std::vector<word_t> bitvector; | |
size_t max_size; | |
bitvector bits; // vector of words representing the sieve bits for prime(s) in this cycle/wheel | |
bitvector::const_iterator bitp; // keep a pointer to the next wheel word to apply between calls/segments | |
bitvector::const_iterator resetp; // pointer to reset the cycle. Must occur after word containing (p*p) for the largest prime in wheel. | |
size_t cyclestart; // contains the word index of the cycle start (after initial words which may have partial filter bits if p*p starts mid word) | |
const size_t istart; // sieve index which first (header) wheel word corresponds to | |
size_t i; // sieve index to apply from. Initialize to istart for first segment, then 0 for others | |
int64_t prod; // product of primes covered by this wheel | |
public: | |
// bits start from the first word containing p*p. This first word is only used once, after which we skip over it. | |
PunchWheel(size_t max_size, int64_t p) : max_size(max_size), bits(p + 1, WORDINIT), | |
bitp(bits.begin()), resetp(bits.begin()+1), cyclestart(1), | |
istart((p*p) >> FULLSHIFT), i(istart), prod(p) | |
{ | |
// This loop only generates the first cycle of *bytes*, which is then copied to the wider word_t width. | |
for (size_t j = (p*p) & FULLMASK, p2=p<<1; j < (cyclestart<<FULLSHIFT)+(p << BASESHIFT); j += p2) | |
bits[j >> FULLSHIFT] &= unset_bit64[j & FULLMASK]; | |
/************** Since partial bytes from a word_t are read/copied here... *************** | |
*********************** THIS IS LITTLE-ENDIAN ARCHITECTURE DEPENDENT *******************/ | |
memextend((void*)&bits[cyclestart], p, WORDBYTES); // copy repeating bytes into 64bit words | |
std::cout << "created new punchwheel with p=" << p << std::endl; | |
} | |
// Insert another prime into the wheel. | |
// Primes are expected to be added in order from smallest to largest, but consecutive primes are not necessarily required per wheel. | |
// return false and do nothing if max_size would be exceeded | |
bool extend(int64_t p) { | |
// This prime's first filter bit (p*p) could be in a different 64bit word than the first prime in wheel. | |
size_t extrahead = ((p*p)>>FULLSHIFT) - istart + (1 - cyclestart); | |
size_t newsize = (cyclestart + extrahead + prod*p)<<WORDSHIFT; | |
if (newsize > max_size) return false; | |
size_t bitpoff = bitp-bits.begin(); | |
bits.resize(cyclestart+extrahead+prod*p); | |
if (extrahead) { | |
// extend existing cycle to account for new cyclestart | |
memcpy(&bits[cyclestart+prod], &bits[cyclestart], extrahead<<WORDSHIFT); | |
} | |
bitp = bits.begin()+(bitpoff>=cyclestart ? bitpoff+extrahead : bitpoff); | |
cyclestart += extrahead; | |
resetp = bits.begin()+cyclestart; | |
// need enough sieve bits to cover new prime product (but only in terms of bytes) | |
if (p > WORDBYTES) | |
memextend(&bits[cyclestart], prod<<WORDSHIFT, (p>>WORDSHIFT)+1); | |
prod *= p; | |
for (size_t j = ((cyclestart-1)<<FULLSHIFT) + ((p*p) & FULLMASK), p2=p<<1; j < (cyclestart<<FULLSHIFT)+(prod<<BASESHIFT); j += p2) | |
bits[j >> FULLSHIFT] &= unset_bit64[j & FULLMASK]; | |
// extend bytes to word_t | |
memextend(&bits[cyclestart], prod, WORDBYTES); // copy repeating bytes into 64bit words | |
std::cout << "...extended with p=" << p; | |
if(extrahead) std::cout << " (cyclestart extended by " << extrahead << ")"; | |
std::cout << std::endl; | |
return true; | |
} | |
void apply(std::vector<word_t> &sieve) | |
{ | |
while (sieve.size()-i >= bits.end()-bitp) | |
{ | |
for (; bitp != bits.end(); i++) | |
{ | |
sieve[i] &= *(bitp++); | |
} | |
bitp = resetp; | |
} | |
// final (maybe partial) cycle | |
for (; i < sieve.size(); i++) { | |
sieve[i] &= *(bitp++); | |
} | |
if (bitp == bits.end()) bitp = resetp; | |
i = 0; // after first segment we always will go from start of sieve | |
} | |
}; | |
/// Generate primes using the segmented sieve of Eratosthenes. | |
/// This algorithm uses O(n log log n) operations and O(sqrt(n)) space. | |
/// @param limit Sieve primes <= limit. | |
/// | |
void segmented_sieve(int64_t limit) | |
{ | |
int64_t sqrt = (int64_t) std::sqrt(limit); | |
int64_t sieve_size = std::max(sqrt, L1D_CACHE_SIZE); | |
// convert from size in bytes to size in words | |
sieve_size = (sieve_size >> WORDSHIFT) + ((sieve_size & WORDMASK) ? 1 : 0); | |
int64_t segment_size = sieve_size << FULLSHIFT; | |
int64_t count = (limit == 1) ? -1 : 0; | |
// we sieve primes >= 3 | |
int64_t i = 3; | |
int64_t s = 3; | |
// vector used for sieving | |
std::vector<word_t> sieve(sieve_size); | |
std::vector<uint8_t> is_prime(sqrt + 1, true); | |
std::vector<int64_t> primes; | |
std::vector<int64_t> multiples; | |
std::vector<PunchWheel> wheels; | |
// first index of primes which is not handled by a wheel | |
size_t i_no_wheel = 0; | |
for (int64_t low = 0; low <= limit; low += segment_size) | |
{ | |
std::fill(sieve.begin(), sieve.end(), WORDINIT); | |
// current segment = [low, high] | |
int64_t high = low + segment_size - 1; | |
high = std::min(high, limit); | |
sieve_size = ((high - low)>>FULLSHIFT) + 1; | |
// generate sieving primes using simple sieve of Eratosthenes | |
for (; i * i <= high; i += 2) | |
if (is_prime[i]) | |
for (int64_t j = i * i; j <= sqrt; j += i) | |
is_prime[j] = false; | |
// initialize sieving primes for segmented sieve | |
for (; s * s <= high; s += 2) | |
{ | |
if (is_prime[s]) | |
{ | |
primes.push_back(s); | |
multiples.push_back(s * s - low); | |
// The optimal wheel sieve prime limit here should be equal to word_t bit width (or maybe a few less?, try tweaking) | |
if (s < 64) { | |
if (wheels.empty() || !wheels.back().extend(s)) | |
wheels.emplace_back(L1D_CACHE_SIZE, s); | |
i_no_wheel = primes.size(); | |
} | |
} | |
} | |
for (std::size_t i = 0; i < wheels.size(); i++) { | |
wheels[i].apply(sieve); | |
} | |
// sieve segment using bit array | |
for (std::size_t i = i_no_wheel; i < primes.size(); i++) | |
{ | |
int64_t j = multiples[i]; | |
for (int64_t k = primes[i] * 2; j < segment_size; j += k) | |
sieve[j >> FULLSHIFT] &= unset_bit64[j & FULLMASK]; | |
multiples[i] = j - segment_size; | |
} | |
// unset bits > limit | |
if (high == limit) | |
{ | |
word_t bits = WORDINIT << ((limit & FULLMASK) + 1) / 2; | |
sieve[sieve_size - 1] &= ~bits; | |
} | |
for (int64_t n = 0; n < sieve_size; n++) | |
count += __builtin_popcountl(sieve[n]); | |
//uint8_t *sb = (uint8_t*)&sieve[0]; | |
//for (int64_t n = 0; n < sieve_size<<WORDSHIFT; n++) | |
// count += popcnt[sb[n]]; | |
} | |
std::cout << count << " primes found." << std::endl; | |
} | |
/// Usage: ./segmented_sieve n | |
/// @param n Sieve the primes up to n. | |
/// | |
int main(int argc, char** argv) | |
{ | |
if (argc >= 2) | |
segmented_sieve(std::atoll(argv[1])); | |
else | |
segmented_sieve(1000000000); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment