dc1394 · December 31, 2023 13:25
diff --git a/mc_xorshift_vec.cpp b/mc_xorshift_vec.cpp
 #include <cstdint>  // for std::uint32_t
 #include <iomanip> // for std::setprecision
 #include <ios> // for std::ios::fixed, std::ios::floatfield
 #include <iostream> // for std::cout, std::endl
 #define __AVX2_AVAILABLE__
 #include "SIMDInstructionSet.h"
 #include "Xoshiro256Plus.h"

 namespace {
 inline double mcpi();
 }

 int main()
 {
    std::cout.setf(std::ios::fixed, std::ios::floatfield);
    std::cout << "pi = "
              << std::setprecision(16)
              << mcpi()
              << std::endl;
 }

 namespace {
 double mcpi()
 {
    using Xoshiro256PlusAVX2 = SEFUtility::RNG::Xoshiro256Plus<SIMDInstructionSet::AVX2>;

    auto constexpr seed = 20231226;
    auto constexpr num_points = 1000000000;
    auto num_inside = 0;
    Xoshiro256PlusAVX2 avx_rng(seed);

    auto const loopnum = num_points / 4;
    for (auto i = 0; i < loopnum; i++) {
        // Load 8 random numbers (4 for x and 4 for y) using AVX
        auto const next4_avx1 = avx_rng.dnext4();
        auto const next4_avx2 = avx_rng.dnext4();

        // Separate x and y components
        auto const x = _mm256_permute_pd(next4_avx1, 0b0000);
        auto const y = _mm256_permute_pd(next4_avx2, 0b1111);

        // Calculate r2 for the first 4 points
        auto const r2 = _mm256_add_pd(_mm256_mul_pd(x, x), _mm256_mul_pd(y, y));

        // Compare r2 with 1.0 and increment num_inside accordingly
        auto const mask = _mm256_cmp_pd(r2, _mm256_set1_pd(1.0), _CMP_LT_OS);
        std::uint32_t const mask_int = _mm256_movemask_pd(mask);
        num_inside += _mm_popcnt_u32(mask_int);
    }
    return 4.0 * static_cast<double>(num_inside) / static_cast<double>(num_points);
 }
 }
	#include <cstdint> // for std::uint32_t
	#include <iomanip> // for std::setprecision
	#include <ios> // for std::ios::fixed, std::ios::floatfield
	#include <iostream> // for std::cout, std::endl
	#define __AVX2_AVAILABLE__
	#include "SIMDInstructionSet.h"
	#include "Xoshiro256Plus.h"

	namespace {
	inline double mcpi();
	}

	int main()
	{
	std::cout.setf(std::ios::fixed, std::ios::floatfield);
	std::cout << "pi = "
	<< std::setprecision(16)
	<< mcpi()
	<< std::endl;
	}

	namespace {
	double mcpi()
	{
	using Xoshiro256PlusAVX2 = SEFUtility::RNG::Xoshiro256Plus<SIMDInstructionSet::AVX2>;

	auto constexpr seed = 20231226;
	auto constexpr num_points = 1000000000;
	auto num_inside = 0;
	Xoshiro256PlusAVX2 avx_rng(seed);

	auto const loopnum = num_points / 4;
	for (auto i = 0; i < loopnum; i++) {
	// Load 8 random numbers (4 for x and 4 for y) using AVX
	auto const next4_avx1 = avx_rng.dnext4();
	auto const next4_avx2 = avx_rng.dnext4();

	// Separate x and y components
	auto const x = _mm256_permute_pd(next4_avx1, 0b0000);
	auto const y = _mm256_permute_pd(next4_avx2, 0b1111);

	// Calculate r2 for the first 4 points
	auto const r2 = _mm256_add_pd(_mm256_mul_pd(x, x), _mm256_mul_pd(y, y));

	// Compare r2 with 1.0 and increment num_inside accordingly
	auto const mask = _mm256_cmp_pd(r2, _mm256_set1_pd(1.0), _CMP_LT_OS);
	std::uint32_t const mask_int = _mm256_movemask_pd(mask);
	num_inside += _mm_popcnt_u32(mask_int);
	}
	return 4.0 * static_cast<double>(num_inside) / static_cast<double>(num_points);
	}
	}