thehans · August 29, 2019 15:34
diff --git a/segmented_bit_sieve.cpp b/segmented_bit_sieve.cpp
 /// @file     segmented_bit_sieve.cpp
 /// @author   Kim Walisch, <[email protected]>
 /// @brief    This is an implementation of the segmented sieve of
 ///           Eratosthenes which uses a bit array with 16 numbers per
 ///           byte. It generates the primes below 10^10 in 7.25 seconds
 ///           on an Intel Core i7-6700 3.4 GHz CPU.
 /// @license  Public domain.

 #include <iostream>
 #include <algorithm>
 #include <cmath>
 #include <vector>
 #include <cstdlib>
 #include <cstdint>
 #include <cassert>
 #include <cstring>

 // # of bits shifted to convert between a number <-> byte offset. 
 // Evens are skipped, so 8bits*2 == 16 == (1<<4)
 constexpr size_t BASESHIFT = 4; 

 // simple helper for below
 constexpr size_t tobitpow(size_t x) {
  switch(x) {
    case 1: return 0;
    case 2: return 1;
    case 4: return 2;
    case 8: return 3;
    case 16: return 4;
    case 32: return 5;
    case 64: return 6;
    case 128: return 7;
    case 256: return 8;
    default: assert(false && "bad input, expected a smallish power of 2"); return 0;
  };
 }

 // Declare the width of "word" used for PunchWheel and Sieve vectors
 // And some helper constants to make converting to other widths a little easier
 typedef uint64_t word_t;
 constexpr word_t WORDINIT = ~word_t(0); // initialization value with all bits set
 constexpr size_t WORDBYTES = sizeof(word_t);
 constexpr size_t WORDSHIFT = tobitpow(WORDBYTES); // # of bits shifted to convert between word count <-> byte count
 constexpr size_t WORDMASK = WORDBYTES-1; // mask based on WORDSHIFT bits
 constexpr size_t FULLSHIFT = 4+WORDSHIFT; // # of bits shifted to convert between actual number <-> bit index
 constexpr size_t FULLMASK = (1<<FULLSHIFT)-1; // mask based on above

 /// Set your CPU's L1 data cache size (in bytes) here
 const int64_t L1D_CACHE_SIZE = 32768;

 /// Bitmasks needed to unset bits corresponding to multiples
 const uint64_t unset_bit64[128] =
 {
  ~(1ull << 0), ~(1ull << 0),
  ~(1ull << 1), ~(1ull << 1),
  ~(1ull << 2), ~(1ull << 2),
  ~(1ull << 3), ~(1ull << 3),
  ~(1ull << 4), ~(1ull << 4),
  ~(1ull << 5), ~(1ull << 5),
  ~(1ull << 6), ~(1ull << 6),
  ~(1ull << 7), ~(1ull << 7),
  ~(1ull << 8), ~(1ull << 8),
  ~(1ull << 9), ~(1ull << 9),
  ~(1ull << 10), ~(1ull << 10),
  ~(1ull << 11), ~(1ull << 11),
  ~(1ull << 12), ~(1ull << 12),
  ~(1ull << 13), ~(1ull << 13),
  ~(1ull << 14), ~(1ull << 14),
  ~(1ull << 15), ~(1ull << 15),
  ~(1ull << 16), ~(1ull << 16),
  ~(1ull << 17), ~(1ull << 17),
  ~(1ull << 18), ~(1ull << 18),
  ~(1ull << 19), ~(1ull << 19),
  ~(1ull << 20), ~(1ull << 20),
  ~(1ull << 21), ~(1ull << 21),
  ~(1ull << 22), ~(1ull << 22),
  ~(1ull << 23), ~(1ull << 23),
  ~(1ull << 24), ~(1ull << 24),
  ~(1ull << 25), ~(1ull << 25),
  ~(1ull << 26), ~(1ull << 26),
  ~(1ull << 27), ~(1ull << 27),
  ~(1ull << 28), ~(1ull << 28),
  ~(1ull << 29), ~(1ull << 29),
  ~(1ull << 30), ~(1ull << 30),
  ~(1ull << 31), ~(1ull << 31),
  ~(1ull << 32), ~(1ull << 32),
  ~(1ull << 33), ~(1ull << 33),
  ~(1ull << 34), ~(1ull << 34),
  ~(1ull << 35), ~(1ull << 35),
  ~(1ull << 36), ~(1ull << 36),
  ~(1ull << 37), ~(1ull << 37),
  ~(1ull << 38), ~(1ull << 38),
  ~(1ull << 39), ~(1ull << 39),
  ~(1ull << 40), ~(1ull << 40),
  ~(1ull << 41), ~(1ull << 41),
  ~(1ull << 42), ~(1ull << 42),
  ~(1ull << 43), ~(1ull << 43),
  ~(1ull << 44), ~(1ull << 44),
  ~(1ull << 45), ~(1ull << 45),
  ~(1ull << 46), ~(1ull << 46),
  ~(1ull << 47), ~(1ull << 47),
  ~(1ull << 48), ~(1ull << 48),
  ~(1ull << 49), ~(1ull << 49),
  ~(1ull << 50), ~(1ull << 50),
  ~(1ull << 51), ~(1ull << 51),
  ~(1ull << 52), ~(1ull << 52),
  ~(1ull << 53), ~(1ull << 53),
  ~(1ull << 54), ~(1ull << 54),
  ~(1ull << 55), ~(1ull << 55),
  ~(1ull << 56), ~(1ull << 56),
  ~(1ull << 57), ~(1ull << 57),
  ~(1ull << 58), ~(1ull << 58),
  ~(1ull << 59), ~(1ull << 59),
  ~(1ull << 60), ~(1ull << 60),
  ~(1ull << 61), ~(1ull << 61),
  ~(1ull << 62), ~(1ull << 62),
  ~(1ull << 63), ~(1ull << 63)
 };

 const int64_t popcnt[256] =
 {
  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };

 // copy (count) bytes from src to src+count, repeating contiguously until n total instances exist.
 //    src must have allocation for n*count bytes
 void memextend(void* src, const size_t count, size_t n) {
  if (n>1) {
    char* s = (char*)src;
    n--; // use n to track how many remaining to copy; src counts as 1 already
    char* d = s+count;
    for(size_t count2=count,scale=1; n>=scale; d+=count2, n-=scale, count2<<=1,scale<<=1) {
      memcpy(d, s, count2);
    }
    if (n) memcpy(d,s,n*count); // fill remainder that doesn't fit power of 2
  }
 }

 // PunchWheel keeps a vector of words to apply to the sieve.
 // Each vector begins with a partial word
 class PunchWheel
 {
  typedef std::vector<word_t> bitvector;
  size_t max_size;
  bitvector bits; // vector of words representing the sieve bits for prime(s) in this cycle/wheel
  bitvector::const_iterator bitp; // keep a pointer to the next wheel word to apply between calls/segments
  bitvector::const_iterator resetp; // pointer to reset the cycle.  Must occur after word containing (p*p) for the largest prime in wheel.
  size_t cyclestart; // contains the word index of the cycle start (after initial words which may have partial filter bits if p*p starts mid word)
  const size_t istart; // sieve index which first (header) wheel word corresponds to
  size_t i; // sieve index to apply from.  Initialize to istart for first segment, then 0 for others
  int64_t prod; // product of primes covered by this wheel

 public:
  // bits start from the first word containing p*p. This first word is only used once, after which we skip over it.
  PunchWheel(size_t max_size, int64_t p) : max_size(max_size), bits(p + 1, WORDINIT), 
      bitp(bits.begin()), resetp(bits.begin()+1), cyclestart(1), 
      istart((p*p) >> FULLSHIFT), i(istart), prod(p)
  {
    // This loop only generates the first cycle of *bytes*, which is then copied to the wider word_t width.
    for (size_t j = (p*p) & FULLMASK, p2=p<<1; j < (cyclestart<<FULLSHIFT)+(p << BASESHIFT); j += p2)
      bits[j >> FULLSHIFT] &= unset_bit64[j & FULLMASK];
    /************** Since partial bytes from a word_t are read/copied here... *************** 
     *********************** THIS IS LITTLE-ENDIAN ARCHITECTURE DEPENDENT *******************/
    memextend((void*)&bits[cyclestart], p, WORDBYTES); // copy repeating bytes into 64bit words
    std::cout << "created new punchwheel with p=" << p << std::endl;
  }

  // Insert another prime into the wheel.
  // Primes are expected to be added in order from smallest to largest, but consecutive primes are not necessarily required per wheel.
  //    return false and do nothing if max_size would be exceeded
  bool extend(int64_t p) {
    // This prime's first filter bit (p*p) could be in a different 64bit word than the first prime in wheel.
    size_t extrahead = ((p*p)>>FULLSHIFT) - istart + (1 - cyclestart); 
    size_t newsize = (cyclestart + extrahead + prod*p)<<WORDSHIFT;
    if (newsize > max_size) return false;

    size_t bitpoff = bitp-bits.begin();
    bits.resize(cyclestart+extrahead+prod*p);
    if (extrahead) {
      // extend existing cycle to account for new cyclestart
      memcpy(&bits[cyclestart+prod], &bits[cyclestart], extrahead<<WORDSHIFT);
    }
    bitp = bits.begin()+(bitpoff>=cyclestart ? bitpoff+extrahead : bitpoff);
    cyclestart += extrahead;
    resetp = bits.begin()+cyclestart;

    // need enough sieve bits to cover new prime product (but only in terms of bytes)
    if (p > WORDBYTES) 
      memextend(&bits[cyclestart], prod<<WORDSHIFT, (p>>WORDSHIFT)+1);

    prod *= p;
    for (size_t j = ((cyclestart-1)<<FULLSHIFT) + ((p*p) & FULLMASK), p2=p<<1; j < (cyclestart<<FULLSHIFT)+(prod<<BASESHIFT); j += p2)
      bits[j >> FULLSHIFT] &= unset_bit64[j & FULLMASK];

    // extend bytes to word_t
    memextend(&bits[cyclestart], prod, WORDBYTES); // copy repeating bytes into 64bit words

    std::cout << "...extended with p=" << p;
    if(extrahead) std::cout << " (cyclestart extended by " << extrahead << ")";
    std::cout << std::endl;

    return true;
  }

  void apply(std::vector<word_t> &sieve)
  {
    while (sieve.size()-i >= bits.end()-bitp)
    {
      for (; bitp != bits.end(); i++)
      {
        sieve[i] &= *(bitp++);
      }
      bitp = resetp;
    }
    // final (maybe partial) cycle
    for (; i < sieve.size(); i++) {
      sieve[i] &= *(bitp++);
    }
    if (bitp == bits.end()) bitp = resetp;
    i = 0; // after first segment we always will go from start of sieve
  }
 };

 /// Generate primes using the segmented sieve of Eratosthenes.
 /// This algorithm uses O(n log log n) operations and O(sqrt(n)) space.
 /// @param limit  Sieve primes <= limit.
 ///
 void segmented_sieve(int64_t limit)
 {
  int64_t sqrt = (int64_t) std::sqrt(limit);
  int64_t sieve_size = std::max(sqrt, L1D_CACHE_SIZE);
  // convert from size in bytes to size in words
  sieve_size = (sieve_size >> WORDSHIFT) + ((sieve_size & WORDMASK) ? 1 : 0);
  int64_t segment_size = sieve_size << FULLSHIFT;
  int64_t count = (limit == 1) ? -1 : 0;

  // we sieve primes >= 3
  int64_t i = 3;
  int64_t s = 3;

  // vector used for sieving
  std::vector<word_t> sieve(sieve_size);
  std::vector<uint8_t> is_prime(sqrt + 1, true);
  std::vector<int64_t> primes;
  std::vector<int64_t> multiples;
  std::vector<PunchWheel> wheels;
  // first index of primes which is not handled by a wheel
  size_t i_no_wheel = 0;

  for (int64_t low = 0; low <= limit; low += segment_size)
  {
    std::fill(sieve.begin(), sieve.end(), WORDINIT);

    // current segment = [low, high]
    int64_t high = low + segment_size - 1;
    high = std::min(high, limit);
    sieve_size = ((high - low)>>FULLSHIFT) + 1;

    // generate sieving primes using simple sieve of Eratosthenes
    for (; i * i <= high; i += 2)
      if (is_prime[i])
        for (int64_t j = i * i; j <= sqrt; j += i)
          is_prime[j] = false;

    // initialize sieving primes for segmented sieve
    for (; s * s <= high; s += 2)
    {
      if (is_prime[s])
      {
        primes.push_back(s);
        multiples.push_back(s * s - low);

        // The optimal wheel sieve prime limit here should be equal to word_t bit width (or maybe a few less?, try tweaking)
        if (s < 64) {
            if (wheels.empty() || !wheels.back().extend(s))
              wheels.emplace_back(L1D_CACHE_SIZE, s);
            i_no_wheel = primes.size();
        }
      }
    }

    for (std::size_t i = 0; i < wheels.size(); i++) {
        wheels[i].apply(sieve);
    }

    // sieve segment using bit array
    for (std::size_t i = i_no_wheel; i < primes.size(); i++)
    {
      int64_t j = multiples[i];
      for (int64_t k = primes[i] * 2; j < segment_size; j += k)
        sieve[j >> FULLSHIFT] &= unset_bit64[j & FULLMASK];
      multiples[i] = j - segment_size;
    }

    // unset bits > limit
    if (high == limit)
    {
      word_t bits = WORDINIT << ((limit & FULLMASK) + 1) / 2;
      sieve[sieve_size - 1] &= ~bits;
    }

    for (int64_t n = 0; n < sieve_size; n++)
      count += __builtin_popcountl(sieve[n]);

    //uint8_t *sb = (uint8_t*)&sieve[0];
    //for (int64_t n = 0; n < sieve_size<<WORDSHIFT; n++)
    //  count += popcnt[sb[n]];
  }

  std::cout << count << " primes found." << std::endl;
 }

 /// Usage: ./segmented_sieve n
 /// @param n  Sieve the primes up to n.
 ///
 int main(int argc, char** argv)
 {
  if (argc >= 2)
    segmented_sieve(std::atoll(argv[1]));
  else
    segmented_sieve(1000000000);

  return 0;
 }
	/// @file segmented_bit_sieve.cpp
	/// @author Kim Walisch, <[email protected]>
	/// @brief This is an implementation of the segmented sieve of
	/// Eratosthenes which uses a bit array with 16 numbers per
	/// byte. It generates the primes below 10^10 in 7.25 seconds
	/// on an Intel Core i7-6700 3.4 GHz CPU.
	/// @license Public domain.

	#include <iostream>
	#include <algorithm>
	#include <cmath>
	#include <vector>
	#include <cstdlib>
	#include <cstdint>
	#include <cassert>
	#include <cstring>

	// # of bits shifted to convert between a number <-> byte offset.
	// Evens are skipped, so 8bits*2 == 16 == (1<<4)
	constexpr size_t BASESHIFT = 4;

	// simple helper for below
	constexpr size_t tobitpow(size_t x) {
	switch(x) {
	case 1: return 0;
	case 2: return 1;
	case 4: return 2;
	case 8: return 3;
	case 16: return 4;
	case 32: return 5;
	case 64: return 6;
	case 128: return 7;
	case 256: return 8;
	default: assert(false && "bad input, expected a smallish power of 2"); return 0;
	};
	}

	// Declare the width of "word" used for PunchWheel and Sieve vectors
	// And some helper constants to make converting to other widths a little easier
	typedef uint64_t word_t;
	constexpr word_t WORDINIT = ~word_t(0); // initialization value with all bits set
	constexpr size_t WORDBYTES = sizeof(word_t);
	constexpr size_t WORDSHIFT = tobitpow(WORDBYTES); // # of bits shifted to convert between word count <-> byte count
	constexpr size_t WORDMASK = WORDBYTES-1; // mask based on WORDSHIFT bits
	constexpr size_t FULLSHIFT = 4+WORDSHIFT; // # of bits shifted to convert between actual number <-> bit index
	constexpr size_t FULLMASK = (1<<FULLSHIFT)-1; // mask based on above

	/// Set your CPU's L1 data cache size (in bytes) here
	const int64_t L1D_CACHE_SIZE = 32768;

	/// Bitmasks needed to unset bits corresponding to multiples
	const uint64_t unset_bit64[128] =
	{
	~(1ull << 0), ~(1ull << 0),
	~(1ull << 1), ~(1ull << 1),
	~(1ull << 2), ~(1ull << 2),
	~(1ull << 3), ~(1ull << 3),
	~(1ull << 4), ~(1ull << 4),
	~(1ull << 5), ~(1ull << 5),
	~(1ull << 6), ~(1ull << 6),
	~(1ull << 7), ~(1ull << 7),
	~(1ull << 8), ~(1ull << 8),
	~(1ull << 9), ~(1ull << 9),
	~(1ull << 10), ~(1ull << 10),
	~(1ull << 11), ~(1ull << 11),
	~(1ull << 12), ~(1ull << 12),
	~(1ull << 13), ~(1ull << 13),
	~(1ull << 14), ~(1ull << 14),
	~(1ull << 15), ~(1ull << 15),
	~(1ull << 16), ~(1ull << 16),
	~(1ull << 17), ~(1ull << 17),
	~(1ull << 18), ~(1ull << 18),
	~(1ull << 19), ~(1ull << 19),
	~(1ull << 20), ~(1ull << 20),
	~(1ull << 21), ~(1ull << 21),
	~(1ull << 22), ~(1ull << 22),
	~(1ull << 23), ~(1ull << 23),
	~(1ull << 24), ~(1ull << 24),
	~(1ull << 25), ~(1ull << 25),
	~(1ull << 26), ~(1ull << 26),
	~(1ull << 27), ~(1ull << 27),
	~(1ull << 28), ~(1ull << 28),
	~(1ull << 29), ~(1ull << 29),
	~(1ull << 30), ~(1ull << 30),
	~(1ull << 31), ~(1ull << 31),
	~(1ull << 32), ~(1ull << 32),
	~(1ull << 33), ~(1ull << 33),
	~(1ull << 34), ~(1ull << 34),
	~(1ull << 35), ~(1ull << 35),
	~(1ull << 36), ~(1ull << 36),
	~(1ull << 37), ~(1ull << 37),
	~(1ull << 38), ~(1ull << 38),
	~(1ull << 39), ~(1ull << 39),
	~(1ull << 40), ~(1ull << 40),
	~(1ull << 41), ~(1ull << 41),
	~(1ull << 42), ~(1ull << 42),
	~(1ull << 43), ~(1ull << 43),
	~(1ull << 44), ~(1ull << 44),
	~(1ull << 45), ~(1ull << 45),
	~(1ull << 46), ~(1ull << 46),
	~(1ull << 47), ~(1ull << 47),
	~(1ull << 48), ~(1ull << 48),
	~(1ull << 49), ~(1ull << 49),
	~(1ull << 50), ~(1ull << 50),
	~(1ull << 51), ~(1ull << 51),
	~(1ull << 52), ~(1ull << 52),
	~(1ull << 53), ~(1ull << 53),
	~(1ull << 54), ~(1ull << 54),
	~(1ull << 55), ~(1ull << 55),
	~(1ull << 56), ~(1ull << 56),
	~(1ull << 57), ~(1ull << 57),
	~(1ull << 58), ~(1ull << 58),
	~(1ull << 59), ~(1ull << 59),
	~(1ull << 60), ~(1ull << 60),
	~(1ull << 61), ~(1ull << 61),
	~(1ull << 62), ~(1ull << 62),
	~(1ull << 63), ~(1ull << 63)
	};

	const int64_t popcnt[256] =
	{
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
	};

	// copy (count) bytes from src to src+count, repeating contiguously until n total instances exist.
	// src must have allocation for n*count bytes
	void memextend(void* src, const size_t count, size_t n) {
	if (n>1) {
	char* s = (char*)src;
	n--; // use n to track how many remaining to copy; src counts as 1 already
	char* d = s+count;
	for(size_t count2=count,scale=1; n>=scale; d+=count2, n-=scale, count2<<=1,scale<<=1) {
	memcpy(d, s, count2);
	}
	if (n) memcpy(d,s,n*count); // fill remainder that doesn't fit power of 2
	}
	}

	// PunchWheel keeps a vector of words to apply to the sieve.
	// Each vector begins with a partial word
	class PunchWheel
	{
	typedef std::vector<word_t> bitvector;
	size_t max_size;
	bitvector bits; // vector of words representing the sieve bits for prime(s) in this cycle/wheel
	bitvector::const_iterator bitp; // keep a pointer to the next wheel word to apply between calls/segments
	bitvector::const_iterator resetp; // pointer to reset the cycle. Must occur after word containing (p*p) for the largest prime in wheel.
	size_t cyclestart; // contains the word index of the cycle start (after initial words which may have partial filter bits if p*p starts mid word)
	const size_t istart; // sieve index which first (header) wheel word corresponds to
	size_t i; // sieve index to apply from. Initialize to istart for first segment, then 0 for others
	int64_t prod; // product of primes covered by this wheel

	public:
	// bits start from the first word containing p*p. This first word is only used once, after which we skip over it.
	PunchWheel(size_t max_size, int64_t p) : max_size(max_size), bits(p + 1, WORDINIT),
	bitp(bits.begin()), resetp(bits.begin()+1), cyclestart(1),
	istart((p*p) >> FULLSHIFT), i(istart), prod(p)
	{
	// This loop only generates the first cycle of bytes, which is then copied to the wider word_t width.
	for (size_t j = (p*p) & FULLMASK, p2=p<<1; j < (cyclestart<<FULLSHIFT)+(p << BASESHIFT); j += p2)
	bits[j >> FULLSHIFT] &= unset_bit64[j & FULLMASK];
	/************ Since partial bytes from a word_t are read/copied here... *************
	********************* THIS IS LITTLE-ENDIAN ARCHITECTURE DEPENDENT *****************/
	memextend((void*)&bits[cyclestart], p, WORDBYTES); // copy repeating bytes into 64bit words
	std::cout << "created new punchwheel with p=" << p << std::endl;
	}

	// Insert another prime into the wheel.
	// Primes are expected to be added in order from smallest to largest, but consecutive primes are not necessarily required per wheel.
	// return false and do nothing if max_size would be exceeded
	bool extend(int64_t p) {
	// This prime's first filter bit (p*p) could be in a different 64bit word than the first prime in wheel.
	size_t extrahead = ((p*p)>>FULLSHIFT) - istart + (1 - cyclestart);
	size_t newsize = (cyclestart + extrahead + prod*p)<<WORDSHIFT;
	if (newsize > max_size) return false;

	size_t bitpoff = bitp-bits.begin();
	bits.resize(cyclestart+extrahead+prod*p);
	if (extrahead) {
	// extend existing cycle to account for new cyclestart
	memcpy(&bits[cyclestart+prod], &bits[cyclestart], extrahead<<WORDSHIFT);
	}
	bitp = bits.begin()+(bitpoff>=cyclestart ? bitpoff+extrahead : bitpoff);
	cyclestart += extrahead;
	resetp = bits.begin()+cyclestart;

	// need enough sieve bits to cover new prime product (but only in terms of bytes)
	if (p > WORDBYTES)
	memextend(&bits[cyclestart], prod<<WORDSHIFT, (p>>WORDSHIFT)+1);

	prod *= p;
	for (size_t j = ((cyclestart-1)<<FULLSHIFT) + ((p*p) & FULLMASK), p2=p<<1; j < (cyclestart<<FULLSHIFT)+(prod<<BASESHIFT); j += p2)
	bits[j >> FULLSHIFT] &= unset_bit64[j & FULLMASK];

	// extend bytes to word_t
	memextend(&bits[cyclestart], prod, WORDBYTES); // copy repeating bytes into 64bit words

	std::cout << "...extended with p=" << p;
	if(extrahead) std::cout << " (cyclestart extended by " << extrahead << ")";
	std::cout << std::endl;

	return true;
	}

	void apply(std::vector<word_t> &sieve)
	{
	while (sieve.size()-i >= bits.end()-bitp)
	{
	for (; bitp != bits.end(); i++)
	{
	sieve[i] &= *(bitp++);
	}
	bitp = resetp;
	}
	// final (maybe partial) cycle
	for (; i < sieve.size(); i++) {
	sieve[i] &= *(bitp++);
	}
	if (bitp == bits.end()) bitp = resetp;
	i = 0; // after first segment we always will go from start of sieve
	}
	};

	/// Generate primes using the segmented sieve of Eratosthenes.
	/// This algorithm uses O(n log log n) operations and O(sqrt(n)) space.
	/// @param limit Sieve primes <= limit.
	///
	void segmented_sieve(int64_t limit)
	{
	int64_t sqrt = (int64_t) std::sqrt(limit);
	int64_t sieve_size = std::max(sqrt, L1D_CACHE_SIZE);
	// convert from size in bytes to size in words
	sieve_size = (sieve_size >> WORDSHIFT) + ((sieve_size & WORDMASK) ? 1 : 0);
	int64_t segment_size = sieve_size << FULLSHIFT;
	int64_t count = (limit == 1) ? -1 : 0;

	// we sieve primes >= 3
	int64_t i = 3;
	int64_t s = 3;

	// vector used for sieving
	std::vector<word_t> sieve(sieve_size);
	std::vector<uint8_t> is_prime(sqrt + 1, true);
	std::vector<int64_t> primes;
	std::vector<int64_t> multiples;
	std::vector<PunchWheel> wheels;
	// first index of primes which is not handled by a wheel
	size_t i_no_wheel = 0;

	for (int64_t low = 0; low <= limit; low += segment_size)
	{
	std::fill(sieve.begin(), sieve.end(), WORDINIT);

	// current segment = [low, high]
	int64_t high = low + segment_size - 1;
	high = std::min(high, limit);
	sieve_size = ((high - low)>>FULLSHIFT) + 1;

	// generate sieving primes using simple sieve of Eratosthenes
	for (; i * i <= high; i += 2)
	if (is_prime[i])
	for (int64_t j = i * i; j <= sqrt; j += i)
	is_prime[j] = false;

	// initialize sieving primes for segmented sieve
	for (; s * s <= high; s += 2)
	{
	if (is_prime[s])
	{
	primes.push_back(s);
	multiples.push_back(s * s - low);

	// The optimal wheel sieve prime limit here should be equal to word_t bit width (or maybe a few less?, try tweaking)
	if (s < 64) {
	if (wheels.empty() \|\| !wheels.back().extend(s))
	wheels.emplace_back(L1D_CACHE_SIZE, s);
	i_no_wheel = primes.size();
	}
	}
	}

	for (std::size_t i = 0; i < wheels.size(); i++) {
	wheels[i].apply(sieve);
	}

	// sieve segment using bit array
	for (std::size_t i = i_no_wheel; i < primes.size(); i++)
	{
	int64_t j = multiples[i];
	for (int64_t k = primes[i] * 2; j < segment_size; j += k)
	sieve[j >> FULLSHIFT] &= unset_bit64[j & FULLMASK];
	multiples[i] = j - segment_size;
	}

	// unset bits > limit
	if (high == limit)
	{
	word_t bits = WORDINIT << ((limit & FULLMASK) + 1) / 2;
	sieve[sieve_size - 1] &= ~bits;
	}

	for (int64_t n = 0; n < sieve_size; n++)
	count += __builtin_popcountl(sieve[n]);

	//uint8_t sb = (uint8_t)&sieve[0];
	//for (int64_t n = 0; n < sieve_size<<WORDSHIFT; n++)
	// count += popcnt[sb[n]];
	}

	std::cout << count << " primes found." << std::endl;
	}

	/// Usage: ./segmented_sieve n
	/// @param n Sieve the primes up to n.
	///
	int main(int argc, char** argv)
	{
	if (argc >= 2)
	segmented_sieve(std::atoll(argv[1]));
	else
	segmented_sieve(1000000000);

	return 0;
	}