Skip to content

Instantly share code, notes, and snippets.

@jamesy0ung
Created November 15, 2024 05:37
Show Gist options
  • Save jamesy0ung/011a6691710edb47a821cb659ff51f7a to your computer and use it in GitHub Desktop.
Save jamesy0ung/011a6691710edb47a821cb659ff51f7a to your computer and use it in GitHub Desktop.
#include <string>
#include <iostream>
#include <thread>
#include <random>
#include <atomic>
#include <vector>
#include <iomanip>
#include <chrono>
#include <algorithm>
#include <array>
/// Multithreaded Monte Carlo estimator for Pi.
///
/// Samples uniformly random points in the unit square across N worker
/// threads and counts how many fall inside the unit quarter circle;
/// pi ~= 4 * inside / total. A separate monitor thread prints progress.
class PiCalculator {
private:
    const long long iterations;        // total samples requested
    const unsigned int nthreads;       // worker thread count, always >= 1
    std::atomic<long long> successCount{ 0 };  // global: points inside the circle
    std::atomic<long long> totalCount{ 0 };    // global: points processed
    std::atomic<bool> shouldStop{ false };     // tells the progress thread to exit
    std::chrono::steady_clock::time_point startTime;

    // One counter per thread, padded to a cache line so adjacent slots
    // do not false-share. alignas(64) alone forces sizeof == 64; the
    // original's manual char padding[] was redundant and is dropped.
    struct alignas(64) PaddedCount {
        long long count{ 0 };
    };
    std::vector<PaddedCount> threadLocalSuccess;
    std::vector<PaddedCount> threadLocalTotal;

    /// Count how many of `batchSize` random points land inside the unit
    /// quarter circle. Pure helper; draws from the caller's RNG.
    /// NOTE: local renamed from `successCount`, which shadowed the member.
    int processBatch(std::mt19937& gen, std::uniform_real_distribution<>& dis, int batchSize) {
        int inside = 0;
        for (int i = 0; i < batchSize; ++i) {
            const double x = dis(gen);
            const double y = dis(gen);
            if (x * x + y * y <= 1.0) {
                ++inside;
            }
        }
        return inside;
    }

    /// Worker body: process `threadIterations` samples, flushing counts to
    /// the shared atomics only once per ATOMIC_BATCH to limit contention.
    void task(int threadId, long long threadIterations) {
        // Thread-local RNG; XOR the seed with the thread id so threads
        // don't collide even if random_device is weak on this platform.
        std::random_device rd;
        std::mt19937 gen(rd() ^ static_cast<unsigned int>(threadId));
        std::uniform_real_distribution<> dis(0.0, 1.0);

        const int BATCH_SIZE = 1024;               // inner-loop granularity
        const long long ATOMIC_BATCH = 1000000LL;  // samples per atomic flush

        long long localSuccess = 0;
        long long localTotal = 0;
        long long processedIterations = 0;

        while (processedIterations < threadIterations && !shouldStop) {
            const long long remainingIterations = threadIterations - processedIterations;
            const long long iterationsThisRound = std::min(remainingIterations, ATOMIC_BATCH);

            // Walk this round in BATCH_SIZE chunks (last chunk may be short).
            const long long batchesInThisRound = (iterationsThisRound + BATCH_SIZE - 1) / BATCH_SIZE;
            for (long long b = 0; b < batchesInThisRound; ++b) {
                const long long remainingInRound = iterationsThisRound - (b * BATCH_SIZE);
                const int currentBatchSize = static_cast<int>(
                    std::min<long long>(remainingInRound, BATCH_SIZE));
                if (currentBatchSize <= 0) break;
                localSuccess += processBatch(gen, dis, currentBatchSize);
                localTotal += currentBatchSize;
            }

            // Per-thread tallies (only this thread writes its slot).
            threadLocalSuccess[threadId].count += localSuccess;
            threadLocalTotal[threadId].count += localTotal;
            // Global tallies; relaxed is fine — these are plain counters.
            successCount.fetch_add(localSuccess, std::memory_order_relaxed);
            totalCount.fetch_add(localTotal, std::memory_order_relaxed);

            processedIterations += localTotal;
            localSuccess = 0;
            localTotal = 0;
        }
    }

public:
    /// @param n total number of Monte Carlo samples to draw.
    /// hardware_concurrency() may legally return 0 — clamp to 1 so we never
    /// divide by zero or size the per-thread vectors to zero.
    PiCalculator(long long n) :
        iterations(n),
        nthreads(std::max(1u, std::thread::hardware_concurrency())),
        threadLocalSuccess(nthreads),
        threadLocalTotal(nthreads) {
    }

    /// Run the simulation to completion and return the Pi estimate.
    /// Prints progress while running and a summary when done.
    double calculate() {
        startTime = std::chrono::steady_clock::now();

        std::vector<std::thread> threads;
        threads.reserve(nthreads);

        // Split the work evenly; the first `remainder` threads take one extra.
        const long long iterationsPerThread = iterations / nthreads;
        const long long remainder = iterations % nthreads;
        for (unsigned int i = 0; i < nthreads; i++) {
            const long long threadIterations =
                iterationsPerThread + (static_cast<long long>(i) < remainder ? 1 : 0);
            threads.emplace_back(&PiCalculator::task, this, static_cast<int>(i), threadIterations);
        }

        // Progress monitor: polls the shared counters every 100 ms.
        std::thread progressThread([this]() {
            while (totalCount < iterations && !shouldStop) {
                const auto now = std::chrono::steady_clock::now();
                const auto secs =
                    std::chrono::duration_cast<std::chrono::seconds>(now - startTime).count();
                if (secs > 0) {
                    const double progress = static_cast<double>(totalCount) / iterations * 100.0;
                    const double pointsPerSecond = static_cast<double>(totalCount) / secs;
                    std::cout << "\rProgress: " << std::fixed << std::setprecision(2)
                              << progress << "% ("
                              << std::setprecision(2) << pointsPerSecond / 1000000.0 << "M points/sec)"
                              << std::flush;
                }
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
            }
        });

        for (auto& thread : threads) {
            thread.join();
        }
        shouldStop = true;
        progressThread.join();

        // Final stats. Use a floating-point elapsed time: the original cast
        // to whole seconds, which is 0 for sub-second runs and caused a
        // division by zero in the points-per-second report.
        const auto endTime = std::chrono::steady_clock::now();
        const double elapsedSec = std::chrono::duration<double>(endTime - startTime).count();
        const long long total = totalCount.load();
        const long long success = successCount.load();

        // Guard total == 0 (e.g. constructed with n == 0) against 0/0.
        const double pi = (total > 0)
            ? 4.0 * static_cast<double>(success) / static_cast<double>(total)
            : 0.0;
        const double megaPointsPerSec = (elapsedSec > 0.0)
            ? static_cast<double>(total) / elapsedSec / 1000000.0
            : 0.0;

        std::cout << "\nCalculation completed in " << std::fixed << std::setprecision(2)
                  << elapsedSec << " seconds" << std::endl;
        std::cout << "Points processed: " << total << std::endl;
        std::cout << "Points inside circle: " << success << std::endl;
        std::cout << "Approximated value of Pi: " << std::setprecision(10) << pi << std::endl;
        std::cout << "Points per second: " << std::fixed << std::setprecision(2)
                  << megaPointsPerSec << "M" << std::endl;
        return pi;
    }
};
/// Entry point: run the Monte Carlo Pi estimation over a fixed sample
/// budget. Returns 0 on success, 1 if the calculator throws.
int main() {
    constexpr long long kIterations = 100000000000LL;  // 100 billion samples
    try {
        PiCalculator calculator{ kIterations };
        calculator.calculate();
        return 0;
    }
    catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment