Last active
January 4, 2024 09:19
-
-
Save dc1394/6984fa72e03c02ffb28e1a37c88c3d29 to your computer and use it in GitHub Desktop.
Twitterのモンテカルロ法のC++版の速度比較コード(dSFMT-AVX512使用+手動でSIMD (AVX-512)ベクトル化)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <array> // for std::array | |
#include <iomanip> // for std::setprecision | |
#include <ios> // for std::ios::fixed, std::ios::floatfield | |
#include <iostream> // for std::cout, std::endl | |
#define HAVE_SSE2 | |
#define HAVE_AVX2 | |
#include "dSFMT.h" | |
#include "dSFMT-2203-avx512.h" | |
namespace { | |
static auto constexpr ARRAYSIZE = 64; | |
static auto constexpr RANDSIZE = ARRAYSIZE / 2; | |
inline double mcpi(); | |
} | |
int main() | |
{ | |
std::cout.setf(std::ios::fixed, std::ios::floatfield); | |
std::cout << "pi = " | |
<< std::setprecision(16) | |
<< mcpi() | |
<< std::endl; | |
return 0; | |
} | |
namespace { | |
double mcpi() | |
{ | |
auto constexpr seed = 20231226; | |
auto constexpr num_points = 1000000000; | |
auto num_inside = 0; | |
dsfmt_t dsfmt; | |
dsfmt_init_gen_rand(&dsfmt, seed); | |
alignas(64) std::array<double, ARRAYSIZE> randarray; | |
auto const loopnum = num_points / (RANDSIZE / 2); | |
for (auto i = 0; i < loopnum; i++) { | |
dsfmt_fill_array_close_open(&dsfmt, randarray.data(), RANDSIZE); | |
auto mask = _mm512_set1_epi32(0); | |
auto const x_values = _mm512_load_pd(randarray.data()); | |
auto const y_values = _mm512_load_pd(&randarray[8]); | |
auto const x_squared = _mm512_mul_pd(x_values, x_values); | |
auto const y_squared = _mm512_mul_pd(y_values, y_values); | |
auto const inside_mask = _mm512_cmp_pd_mask(_mm512_add_pd(x_squared, y_squared), _mm512_set1_pd(1.0), _MM_CMPINT_LT); | |
// Update mask with the result | |
mask = _mm512_mask_add_epi32(mask, inside_mask, mask, _mm512_set1_epi32(1)); | |
// Sum the counts in the mask | |
num_inside += _mm512_reduce_add_epi32(mask); | |
auto mask2 = _mm512_set1_epi32(0); | |
auto const x_values2 = _mm512_load_pd(&randarray[16]); | |
auto const y_values2 = _mm512_load_pd(&randarray[24]); | |
auto const x_squared2 = _mm512_mul_pd(x_values2, x_values2); | |
auto const y_squared2 = _mm512_mul_pd(y_values2, y_values2); | |
auto const inside_mask2 = _mm512_cmp_pd_mask(_mm512_add_pd(x_squared2, y_squared2), _mm512_set1_pd(1.0), _MM_CMPINT_LT); | |
// Update mask with the result | |
mask2 = _mm512_mask_add_epi32(mask2, inside_mask2, mask2, _mm512_set1_epi32(1)); | |
// Sum the counts in the mask | |
num_inside += _mm512_reduce_add_epi32(mask2); | |
} | |
return 4.0 * static_cast<double>(num_inside) / static_cast<double>(num_points); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment