Skip to content

Instantly share code, notes, and snippets.

@thehans
Last active August 29, 2019 15:34
Show Gist options
  • Save thehans/0669096bebde08b153672cab34e3c630 to your computer and use it in GitHub Desktop.
Save thehans/0669096bebde08b153672cab34e3c630 to your computer and use it in GitHub Desktop.
POC "PunchWheel" tweak to Kim Walisch's Segmented Sieve example. (First revision is Kim's unedited for comparison)
/// @file segmented_bit_sieve.cpp
/// @author Kim Walisch, <[email protected]>
/// @brief This is an implementation of the segmented sieve of
/// Eratosthenes which uses a bit array with 16 numbers per
/// byte. It generates the primes below 10^10 in 7.25 seconds
/// on an Intel Core i7-6700 3.4 GHz CPU.
/// @license Public domain.
#include <iostream>
#include <algorithm>
#include <cmath>
#include <vector>
#include <cstdlib>
#include <cstdint>
#include <cassert>
#include <cstring>
// # of bits shifted to convert between a number <-> byte offset.
// Evens are skipped, so 8bits*2 == 16 == (1<<4)
constexpr size_t BASESHIFT = 4;
// simple helper for below
constexpr size_t tobitpow(size_t x) {
switch(x) {
case 1: return 0;
case 2: return 1;
case 4: return 2;
case 8: return 3;
case 16: return 4;
case 32: return 5;
case 64: return 6;
case 128: return 7;
case 256: return 8;
default: assert(false && "bad input, expected a smallish power of 2"); return 0;
};
}
// Declare the width of "word" used for PunchWheel and Sieve vectors
// And some helper constants to make converting to other widths a little easier
typedef uint64_t word_t;
constexpr word_t WORDINIT = ~word_t(0); // initialization value with all bits set
constexpr size_t WORDBYTES = sizeof(word_t);
constexpr size_t WORDSHIFT = tobitpow(WORDBYTES); // # of bits shifted to convert between word count <-> byte count
constexpr size_t WORDMASK = WORDBYTES-1; // mask based on WORDSHIFT bits
constexpr size_t FULLSHIFT = 4+WORDSHIFT; // # of bits shifted to convert between actual number <-> bit index
constexpr size_t FULLMASK = (1<<FULLSHIFT)-1; // mask based on above
/// Set your CPU's L1 data cache size (in bytes) here
const int64_t L1D_CACHE_SIZE = 32768;
/// Bitmasks needed to unset bits corresponding to multiples
const uint64_t unset_bit64[128] =
{
~(1ull << 0), ~(1ull << 0),
~(1ull << 1), ~(1ull << 1),
~(1ull << 2), ~(1ull << 2),
~(1ull << 3), ~(1ull << 3),
~(1ull << 4), ~(1ull << 4),
~(1ull << 5), ~(1ull << 5),
~(1ull << 6), ~(1ull << 6),
~(1ull << 7), ~(1ull << 7),
~(1ull << 8), ~(1ull << 8),
~(1ull << 9), ~(1ull << 9),
~(1ull << 10), ~(1ull << 10),
~(1ull << 11), ~(1ull << 11),
~(1ull << 12), ~(1ull << 12),
~(1ull << 13), ~(1ull << 13),
~(1ull << 14), ~(1ull << 14),
~(1ull << 15), ~(1ull << 15),
~(1ull << 16), ~(1ull << 16),
~(1ull << 17), ~(1ull << 17),
~(1ull << 18), ~(1ull << 18),
~(1ull << 19), ~(1ull << 19),
~(1ull << 20), ~(1ull << 20),
~(1ull << 21), ~(1ull << 21),
~(1ull << 22), ~(1ull << 22),
~(1ull << 23), ~(1ull << 23),
~(1ull << 24), ~(1ull << 24),
~(1ull << 25), ~(1ull << 25),
~(1ull << 26), ~(1ull << 26),
~(1ull << 27), ~(1ull << 27),
~(1ull << 28), ~(1ull << 28),
~(1ull << 29), ~(1ull << 29),
~(1ull << 30), ~(1ull << 30),
~(1ull << 31), ~(1ull << 31),
~(1ull << 32), ~(1ull << 32),
~(1ull << 33), ~(1ull << 33),
~(1ull << 34), ~(1ull << 34),
~(1ull << 35), ~(1ull << 35),
~(1ull << 36), ~(1ull << 36),
~(1ull << 37), ~(1ull << 37),
~(1ull << 38), ~(1ull << 38),
~(1ull << 39), ~(1ull << 39),
~(1ull << 40), ~(1ull << 40),
~(1ull << 41), ~(1ull << 41),
~(1ull << 42), ~(1ull << 42),
~(1ull << 43), ~(1ull << 43),
~(1ull << 44), ~(1ull << 44),
~(1ull << 45), ~(1ull << 45),
~(1ull << 46), ~(1ull << 46),
~(1ull << 47), ~(1ull << 47),
~(1ull << 48), ~(1ull << 48),
~(1ull << 49), ~(1ull << 49),
~(1ull << 50), ~(1ull << 50),
~(1ull << 51), ~(1ull << 51),
~(1ull << 52), ~(1ull << 52),
~(1ull << 53), ~(1ull << 53),
~(1ull << 54), ~(1ull << 54),
~(1ull << 55), ~(1ull << 55),
~(1ull << 56), ~(1ull << 56),
~(1ull << 57), ~(1ull << 57),
~(1ull << 58), ~(1ull << 58),
~(1ull << 59), ~(1ull << 59),
~(1ull << 60), ~(1ull << 60),
~(1ull << 61), ~(1ull << 61),
~(1ull << 62), ~(1ull << 62),
~(1ull << 63), ~(1ull << 63)
};
const int64_t popcnt[256] =
{
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
// copy (count) bytes from src to src+count, repeating contiguously until n total instances exist.
// src must have allocation for n*count bytes
void memextend(void* src, const size_t count, size_t n) {
if (n>1) {
char* s = (char*)src;
n--; // use n to track how many remaining to copy; src counts as 1 already
char* d = s+count;
for(size_t count2=count,scale=1; n>=scale; d+=count2, n-=scale, count2<<=1,scale<<=1) {
memcpy(d, s, count2);
}
if (n) memcpy(d,s,n*count); // fill remainder that doesn't fit power of 2
}
}
// PunchWheel keeps a vector of words to apply to the sieve.
// Each vector begins with a partial word
class PunchWheel
{
typedef std::vector<word_t> bitvector;
size_t max_size;
bitvector bits; // vector of words representing the sieve bits for prime(s) in this cycle/wheel
bitvector::const_iterator bitp; // keep a pointer to the next wheel word to apply between calls/segments
bitvector::const_iterator resetp; // pointer to reset the cycle. Must occur after word containing (p*p) for the largest prime in wheel.
size_t cyclestart; // contains the word index of the cycle start (after initial words which may have partial filter bits if p*p starts mid word)
const size_t istart; // sieve index which first (header) wheel word corresponds to
size_t i; // sieve index to apply from. Initialize to istart for first segment, then 0 for others
int64_t prod; // product of primes covered by this wheel
public:
// bits start from the first word containing p*p. This first word is only used once, after which we skip over it.
PunchWheel(size_t max_size, int64_t p) : max_size(max_size), bits(p + 1, WORDINIT),
bitp(bits.begin()), resetp(bits.begin()+1), cyclestart(1),
istart((p*p) >> FULLSHIFT), i(istart), prod(p)
{
// This loop only generates the first cycle of *bytes*, which is then copied to the wider word_t width.
for (size_t j = (p*p) & FULLMASK, p2=p<<1; j < (cyclestart<<FULLSHIFT)+(p << BASESHIFT); j += p2)
bits[j >> FULLSHIFT] &= unset_bit64[j & FULLMASK];
/************** Since partial bytes from a word_t are read/copied here... ***************
*********************** THIS IS LITTLE-ENDIAN ARCHITECTURE DEPENDENT *******************/
memextend((void*)&bits[cyclestart], p, WORDBYTES); // copy repeating bytes into 64bit words
std::cout << "created new punchwheel with p=" << p << std::endl;
}
// Insert another prime into the wheel.
// Primes are expected to be added in order from smallest to largest, but consecutive primes are not necessarily required per wheel.
// return false and do nothing if max_size would be exceeded
bool extend(int64_t p) {
// This prime's first filter bit (p*p) could be in a different 64bit word than the first prime in wheel.
size_t extrahead = ((p*p)>>FULLSHIFT) - istart + (1 - cyclestart);
size_t newsize = (cyclestart + extrahead + prod*p)<<WORDSHIFT;
if (newsize > max_size) return false;
size_t bitpoff = bitp-bits.begin();
bits.resize(cyclestart+extrahead+prod*p);
if (extrahead) {
// extend existing cycle to account for new cyclestart
memcpy(&bits[cyclestart+prod], &bits[cyclestart], extrahead<<WORDSHIFT);
}
bitp = bits.begin()+(bitpoff>=cyclestart ? bitpoff+extrahead : bitpoff);
cyclestart += extrahead;
resetp = bits.begin()+cyclestart;
// need enough sieve bits to cover new prime product (but only in terms of bytes)
if (p > WORDBYTES)
memextend(&bits[cyclestart], prod<<WORDSHIFT, (p>>WORDSHIFT)+1);
prod *= p;
for (size_t j = ((cyclestart-1)<<FULLSHIFT) + ((p*p) & FULLMASK), p2=p<<1; j < (cyclestart<<FULLSHIFT)+(prod<<BASESHIFT); j += p2)
bits[j >> FULLSHIFT] &= unset_bit64[j & FULLMASK];
// extend bytes to word_t
memextend(&bits[cyclestart], prod, WORDBYTES); // copy repeating bytes into 64bit words
std::cout << "...extended with p=" << p;
if(extrahead) std::cout << " (cyclestart extended by " << extrahead << ")";
std::cout << std::endl;
return true;
}
void apply(std::vector<word_t> &sieve)
{
while (sieve.size()-i >= bits.end()-bitp)
{
for (; bitp != bits.end(); i++)
{
sieve[i] &= *(bitp++);
}
bitp = resetp;
}
// final (maybe partial) cycle
for (; i < sieve.size(); i++) {
sieve[i] &= *(bitp++);
}
if (bitp == bits.end()) bitp = resetp;
i = 0; // after first segment we always will go from start of sieve
}
};
/// Generate primes using the segmented sieve of Eratosthenes.
/// This algorithm uses O(n log log n) operations and O(sqrt(n)) space.
/// @param limit Sieve primes <= limit.
///
void segmented_sieve(int64_t limit)
{
int64_t sqrt = (int64_t) std::sqrt(limit);
int64_t sieve_size = std::max(sqrt, L1D_CACHE_SIZE);
// convert from size in bytes to size in words
sieve_size = (sieve_size >> WORDSHIFT) + ((sieve_size & WORDMASK) ? 1 : 0);
int64_t segment_size = sieve_size << FULLSHIFT;
int64_t count = (limit == 1) ? -1 : 0;
// we sieve primes >= 3
int64_t i = 3;
int64_t s = 3;
// vector used for sieving
std::vector<word_t> sieve(sieve_size);
std::vector<uint8_t> is_prime(sqrt + 1, true);
std::vector<int64_t> primes;
std::vector<int64_t> multiples;
std::vector<PunchWheel> wheels;
// first index of primes which is not handled by a wheel
size_t i_no_wheel = 0;
for (int64_t low = 0; low <= limit; low += segment_size)
{
std::fill(sieve.begin(), sieve.end(), WORDINIT);
// current segment = [low, high]
int64_t high = low + segment_size - 1;
high = std::min(high, limit);
sieve_size = ((high - low)>>FULLSHIFT) + 1;
// generate sieving primes using simple sieve of Eratosthenes
for (; i * i <= high; i += 2)
if (is_prime[i])
for (int64_t j = i * i; j <= sqrt; j += i)
is_prime[j] = false;
// initialize sieving primes for segmented sieve
for (; s * s <= high; s += 2)
{
if (is_prime[s])
{
primes.push_back(s);
multiples.push_back(s * s - low);
// The optimal wheel sieve prime limit here should be equal to word_t bit width (or maybe a few less?, try tweaking)
if (s < 64) {
if (wheels.empty() || !wheels.back().extend(s))
wheels.emplace_back(L1D_CACHE_SIZE, s);
i_no_wheel = primes.size();
}
}
}
for (std::size_t i = 0; i < wheels.size(); i++) {
wheels[i].apply(sieve);
}
// sieve segment using bit array
for (std::size_t i = i_no_wheel; i < primes.size(); i++)
{
int64_t j = multiples[i];
for (int64_t k = primes[i] * 2; j < segment_size; j += k)
sieve[j >> FULLSHIFT] &= unset_bit64[j & FULLMASK];
multiples[i] = j - segment_size;
}
// unset bits > limit
if (high == limit)
{
word_t bits = WORDINIT << ((limit & FULLMASK) + 1) / 2;
sieve[sieve_size - 1] &= ~bits;
}
for (int64_t n = 0; n < sieve_size; n++)
count += __builtin_popcountl(sieve[n]);
//uint8_t *sb = (uint8_t*)&sieve[0];
//for (int64_t n = 0; n < sieve_size<<WORDSHIFT; n++)
// count += popcnt[sb[n]];
}
std::cout << count << " primes found." << std::endl;
}
/// Usage: ./segmented_sieve n
/// @param n Sieve the primes up to n.
///
int main(int argc, char** argv)
{
if (argc >= 2)
segmented_sieve(std::atoll(argv[1]));
else
segmented_sieve(1000000000);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment