Skip to content

Instantly share code, notes, and snippets.

@jamesy0ung
Created November 15, 2024 05:37
Show Gist options
  • Save jamesy0ung/011a6691710edb47a821cb659ff51f7a to your computer and use it in GitHub Desktop.
Save jamesy0ung/011a6691710edb47a821cb659ff51f7a to your computer and use it in GitHub Desktop.
#include <string>
#include <iostream>
#include <thread>
#include <random>
#include <atomic>
#include <vector>
#include <iomanip>
#include <chrono>
#include <algorithm>
#include <array>
/// Multithreaded Monte Carlo estimator for Pi.
///
/// Samples uniformly random points in the unit square across N worker
/// threads and counts how many fall inside the unit quarter circle;
/// pi ~= 4 * inside / total. A separate monitor thread prints progress.
class PiCalculator {
private:
    const long long iterations;        // total samples requested
    const unsigned int nthreads;       // worker thread count, always >= 1
    std::atomic<long long> successCount{ 0 };  // global: points inside the circle
    std::atomic<long long> totalCount{ 0 };    // global: points processed
    std::atomic<bool> shouldStop{ false };     // tells the progress thread to exit
    std::chrono::steady_clock::time_point startTime;

    // One counter per thread, padded to a cache line so adjacent slots
    // do not false-share. alignas(64) alone forces sizeof == 64; the
    // original's manual char padding[] was redundant and is dropped.
    struct alignas(64) PaddedCount {
        long long count{ 0 };
    };
    std::vector<PaddedCount> threadLocalSuccess;
    std::vector<PaddedCount> threadLocalTotal;

    /// Count how many of `batchSize` random points land inside the unit
    /// quarter circle. Pure helper; draws from the caller's RNG.
    /// NOTE: local renamed from `successCount`, which shadowed the member.
    int processBatch(std::mt19937& gen, std::uniform_real_distribution<>& dis, int batchSize) {
        int inside = 0;
        for (int i = 0; i < batchSize; ++i) {
            const double x = dis(gen);
            const double y = dis(gen);
            if (x * x + y * y <= 1.0) {
                ++inside;
            }
        }
        return inside;
    }

    /// Worker body: process `threadIterations` samples, flushing counts to
    /// the shared atomics only once per ATOMIC_BATCH to limit contention.
    void task(int threadId, long long threadIterations) {
        // Thread-local RNG; XOR the seed with the thread id so threads
        // don't collide even if random_device is weak on this platform.
        std::random_device rd;
        std::mt19937 gen(rd() ^ static_cast<unsigned int>(threadId));
        std::uniform_real_distribution<> dis(0.0, 1.0);

        const int BATCH_SIZE = 1024;               // inner-loop granularity
        const long long ATOMIC_BATCH = 1000000LL;  // samples per atomic flush

        long long localSuccess = 0;
        long long localTotal = 0;
        long long processedIterations = 0;

        while (processedIterations < threadIterations && !shouldStop) {
            const long long remainingIterations = threadIterations - processedIterations;
            const long long iterationsThisRound = std::min(remainingIterations, ATOMIC_BATCH);

            // Walk this round in BATCH_SIZE chunks (last chunk may be short).
            const long long batchesInThisRound = (iterationsThisRound + BATCH_SIZE - 1) / BATCH_SIZE;
            for (long long b = 0; b < batchesInThisRound; ++b) {
                const long long remainingInRound = iterationsThisRound - (b * BATCH_SIZE);
                const int currentBatchSize = static_cast<int>(
                    std::min<long long>(remainingInRound, BATCH_SIZE));
                if (currentBatchSize <= 0) break;
                localSuccess += processBatch(gen, dis, currentBatchSize);
                localTotal += currentBatchSize;
            }

            // Per-thread tallies (only this thread writes its slot).
            threadLocalSuccess[threadId].count += localSuccess;
            threadLocalTotal[threadId].count += localTotal;
            // Global tallies; relaxed is fine — these are plain counters.
            successCount.fetch_add(localSuccess, std::memory_order_relaxed);
            totalCount.fetch_add(localTotal, std::memory_order_relaxed);

            processedIterations += localTotal;
            localSuccess = 0;
            localTotal = 0;
        }
    }

public:
    /// @param n total number of Monte Carlo samples to draw.
    /// hardware_concurrency() may legally return 0 — clamp to 1 so we never
    /// divide by zero or size the per-thread vectors to zero.
    PiCalculator(long long n) :
        iterations(n),
        nthreads(std::max(1u, std::thread::hardware_concurrency())),
        threadLocalSuccess(nthreads),
        threadLocalTotal(nthreads) {
    }

    /// Run the simulation to completion and return the Pi estimate.
    /// Prints progress while running and a summary when done.
    double calculate() {
        startTime = std::chrono::steady_clock::now();

        std::vector<std::thread> threads;
        threads.reserve(nthreads);

        // Split the work evenly; the first `remainder` threads take one extra.
        const long long iterationsPerThread = iterations / nthreads;
        const long long remainder = iterations % nthreads;
        for (unsigned int i = 0; i < nthreads; i++) {
            const long long threadIterations =
                iterationsPerThread + (static_cast<long long>(i) < remainder ? 1 : 0);
            threads.emplace_back(&PiCalculator::task, this, static_cast<int>(i), threadIterations);
        }

        // Progress monitor: polls the shared counters every 100 ms.
        std::thread progressThread([this]() {
            while (totalCount < iterations && !shouldStop) {
                const auto now = std::chrono::steady_clock::now();
                const auto secs =
                    std::chrono::duration_cast<std::chrono::seconds>(now - startTime).count();
                if (secs > 0) {
                    const double progress = static_cast<double>(totalCount) / iterations * 100.0;
                    const double pointsPerSecond = static_cast<double>(totalCount) / secs;
                    std::cout << "\rProgress: " << std::fixed << std::setprecision(2)
                              << progress << "% ("
                              << std::setprecision(2) << pointsPerSecond / 1000000.0 << "M points/sec)"
                              << std::flush;
                }
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
            }
        });

        for (auto& thread : threads) {
            thread.join();
        }
        shouldStop = true;
        progressThread.join();

        // Final stats. Use a floating-point elapsed time: the original cast
        // to whole seconds, which is 0 for sub-second runs and caused a
        // division by zero in the points-per-second report.
        const auto endTime = std::chrono::steady_clock::now();
        const double elapsedSec = std::chrono::duration<double>(endTime - startTime).count();
        const long long total = totalCount.load();
        const long long success = successCount.load();

        // Guard total == 0 (e.g. constructed with n == 0) against 0/0.
        const double pi = (total > 0)
            ? 4.0 * static_cast<double>(success) / static_cast<double>(total)
            : 0.0;
        const double megaPointsPerSec = (elapsedSec > 0.0)
            ? static_cast<double>(total) / elapsedSec / 1000000.0
            : 0.0;

        std::cout << "\nCalculation completed in " << std::fixed << std::setprecision(2)
                  << elapsedSec << " seconds" << std::endl;
        std::cout << "Points processed: " << total << std::endl;
        std::cout << "Points inside circle: " << success << std::endl;
        std::cout << "Approximated value of Pi: " << std::setprecision(10) << pi << std::endl;
        std::cout << "Points per second: " << std::fixed << std::setprecision(2)
                  << megaPointsPerSec << "M" << std::endl;
        return pi;
    }
};
/// Entry point: run the Monte Carlo Pi estimation over a fixed sample
/// budget. Returns 0 on success, 1 if the calculator throws.
int main() {
    constexpr long long kIterations = 100000000000LL;  // 100 billion samples
    try {
        PiCalculator calculator{ kIterations };
        calculator.calculate();
        return 0;
    }
    catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment