-
-
Save jakobkogler/e6359ea9ced24fe304f1a8af3c9bee0e to your computer and use it in GitHub Desktop.
>> g++ -std=c++20 -Ofast benchmark_prime_sieve.cpp && ./a.out | |
N = 30000000: | |
norm: bool: 106 ms | |
norm: char: 235 ms | |
norm: bool is 2.217 times FASTER | |
seg: bool: 147 ms | |
seg: char: 38 ms | |
seg: bool is 3.868 times SLOWER | |
N = 60000000: | |
norm: bool: 300 ms | |
norm: char: 527 ms | |
norm: bool is 1.757 times FASTER | |
seg: bool: 311 ms | |
seg: char: 87 ms | |
seg: bool is 3.575 times SLOWER | |
N = 90000000: | |
norm: bool: 498 ms | |
norm: char: 811 ms | |
norm: bool is 1.629 times FASTER | |
seg: bool: 488 ms | |
seg: char: 143 ms | |
seg: bool is 3.413 times SLOWER | |
N = 200000000: | |
norm: bool: 1305 ms | |
norm: char: 1987 ms | |
norm: bool is 1.523 times FASTER | |
seg: bool: 1164 ms | |
seg: char: 380 ms | |
seg: bool is 3.063 times SLOWER | |
N = 500000000: | |
norm: bool: 3731 ms | |
norm: char: 5210 ms | |
norm: bool is 1.396 times FASTER | |
seg: bool: 3268 ms | |
seg: char: 1214 ms | |
seg: bool is 2.692 times SLOWER | |
N = 1000000000: | |
norm: bool: 7868 ms | |
norm: char: 11066 ms | |
norm: bool is 1.406 times FASTER | |
seg: bool: 7273 ms | |
seg: char: 3039 ms | |
seg: bool is 2.393 times SLOWER |
#include "bits/stdc++.h" | |
using namespace std; | |
template <typename T> | |
int count_primes_seg(int n) { | |
const int S = 10'000; | |
vector<int> primes; | |
int nsqrt = sqrt(n); | |
vector<T> is_prime(nsqrt + 1, true); | |
for (int i = 2; i <= nsqrt; i++) { | |
if (is_prime[i]) { | |
primes.push_back(i); | |
for (int j = i * i; j <= nsqrt; j += i) | |
is_prime[j] = false; | |
} | |
} | |
int result = 0; | |
vector<T> block(S); | |
for (int k = 0; k * S <= n; k++) { | |
fill(block.begin(), block.end(), true); | |
int start = k * S; | |
for (int p : primes) { | |
int start_idx = (start + p - 1) / p; | |
int j = max(start_idx, p) * p - start; | |
for (; j < S; j += p) | |
block[j] = false; | |
} | |
if (k == 0) | |
block[0] = block[1] = false; | |
for (int i = 0; i < S && start + i <= n; i++) { | |
if (block[i]) | |
result++; | |
} | |
} | |
return result; | |
} | |
template <typename T> | |
int count_primes(int n) { | |
vector<T> is_prime(n+1, true); | |
is_prime[0] = is_prime[1] = false; | |
for (int i = 2; i * i <= n; i++) { | |
if (is_prime[i]) { | |
for (int j = i * i; j <= n; j += i) | |
is_prime[j] = false; | |
} | |
} | |
int cnt = 0; | |
for (int i = 2; i <= n; i++) { | |
if (is_prime[i]) | |
cnt++; | |
} | |
return cnt; | |
/* return is_prime[n]; */ | |
} | |
template <typename T> | |
long long benchmark(int n, string s) | |
{ | |
auto t1 = std::chrono::high_resolution_clock::now(); | |
int res = count_primes<T>(n); | |
auto t2 = std::chrono::high_resolution_clock::now(); | |
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1); | |
cout << " norm: " << s << ": " << duration.count() << " ms" << endl; | |
return duration.count(); | |
} | |
template <typename T> | |
long long benchmark_seg(int n, string s) | |
{ | |
auto t1 = std::chrono::high_resolution_clock::now(); | |
int res = count_primes_seg<T>(n); | |
auto t2 = std::chrono::high_resolution_clock::now(); | |
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1); | |
cout << " seg: " << s << ": " << duration.count() << " ms" << endl; | |
return duration.count(); | |
} | |
int main() { | |
ios_base::sync_with_stdio(false); | |
cin.tie(nullptr); | |
std::cout << std::fixed << std::setprecision(3); | |
for (int n : {3e7, 6e7, 9e7, 2e8, 5e8, 1e9}) { | |
cout << "N = " << n << ": " << endl; | |
{ | |
long long a = benchmark<bool>(n, "bool"); | |
long long b = benchmark<char>(n, "char"); | |
cout << " norm: bool is "; | |
if (a < b) { | |
cout << ((double)b / a) << " times FASTER" << endl; | |
} else { | |
cout << ((double)a / b) << " times SLOWER" << endl; | |
} | |
} | |
{ | |
long long a = benchmark_seg<bool>(n, "bool"); | |
long long b = benchmark_seg<char>(n, "char"); | |
cout << " seg: bool is "; | |
if (a < b) { | |
cout << ((double)b / a) << " times FASTER" << endl; | |
} else { | |
cout << ((double)a / b) << " times SLOWER" << endl; | |
} | |
} | |
cout << endl; | |
} | |
} |
@jxu
Just did a small experiment. Only for 1e9 though (as bitsets require the size to be known at compile time). Looks like std::bitset
is a bit slower than std::vector<bool>
during the normal sieve, but a bit faster during the segmented sieve. Not sure why.
N = 1000000000:
norm: vector<bool>: 7775 ms
norm: vector<char>: 11992 ms
norm: bitset: 9291 ms
seg: vector<bool>: 7305 ms
seg: vector<char>: 3110 ms
seg: bitset: 6501 ms
Maybe it's possible to optimize the bitset solution a little bit. I basically just replaced std::vector<T> v(size)
with std::bitset<size> v
in the code. But maybe there are some clever bitmask tricks that you can use.
For reference:
int count_primes_seg_bitset(int n) {
if (n != 1000000000)
return 0;
constexpr int S = 10'000;
vector<int> primes;
constexpr int nsqrt = 31623;
auto is_prime = std::move(bitset<nsqrt + 1>{}.set());
for (int i = 2; i <= nsqrt; i++) {
if (is_prime[i]) {
primes.push_back(i);
for (int j = i * i; j <= nsqrt; j += i)
is_prime[j] = false;
}
}
int result = 0;
bitset<S> block;
for (int k = 0; k * S <= n; k++) {
block.set();
int start = k * S;
for (int p : primes) {
int start_idx = (start + p - 1) / p;
int j = max(start_idx, p) * p - start;
for (; j < S; j += p)
block[j] = false;
}
if (k == 0)
block[0] = block[1] = false;
for (int i = 0; i < S && start + i <= n; i++) {
if (block[i])
result++;
}
}
return result;
}
int count_primes_bitset(int n) {
if (n != 1000000000)
return 0;
auto is_prime = std::move(bitset<1000000000>{}.set());
is_prime[0] = is_prime[1] = false;
for (int i = 2; i * i <= n; i++) {
if (is_prime[i]) {
for (int j = i * i; j <= n; j += i)
is_prime[j] = false;
}
}
int cnt = 0;
for (int i = 2; i <= n; i++) {
if (is_prime[i])
cnt++;
}
return cnt;
}
The point of bitmask is not to need clever bitmask tricks I suppose, so if there's no significant difference then vector being dynamic makes it easier to use.
https://github.com/kimwalisch/primesieve/wiki/Segmented-sieve-of-Eratosthenes says the block size should be the L1D cache size. On my processor it's 128 KiB, so 32768 4-byte ints. Segments beyond 10^9 need to use 8-byte long longs right?
@jxu There are no 4-bit or 8-bit ints used in the segmented sieve code above. As I don't store the prime numbers, I just count them.
For the segement I only need to have an array telling me if a number is prime or not. In the case for the segmented sieve I actually use a vector<char>
(1 byte per entry) and not a vector<bool>
(1 bit per entry), because working with chars can be faster (and in this case it is).
If you want to compute the numbers of primes for ranges bigger than 4-bit ints, than you have to do all computations with 8-bit integers, but the block can still remain an array of chars/booleans.
All the memory that has to be loaded at all time is the block vector.
My CPU has 128kb, however each Core has only 32kb.
And indeed, at S ~= 32000 I get the best results.
Notice, that the same implementation with a vector<bool>
is a lot slower than with a vector<char>
, but doesn't have the same jump at 32k, because it only uses 1/8th of memory, so even 100k bits fit easily into L1 cache.
Strangely though, if I compute this graphic even further I don't get a jump at 256 000 (32k * 8). Probably the CPU is then fast enough to reload cache lines (as the sieve of Eratosthenes is pretty predictable when it comes to the access patterns).
Is there an explanation for primesieve stating "The segment size must not be smaller than the square root of the sieving limit otherwise the run-time complexity of the algorithm deteriorates." I'm not sure why. Should that be mentioned in the article?
@jxu My guess is, that if the segment is small, then most of the prime numbers up to the square root, will not have a single multiple in the range. So you do a lot of operations - figuring out if there is a multiple in the range - just for nothing.
Did you try with std::bitset?