-
-
Save guicho271828/2ec0a2553e36568178117668842c9add to your computer and use it in GitHub Desktop.
random number generator speed benchmark for https://cppbenchmarks.wordpress.com/2020/10/11/speed-comparison-of-random-number-generators/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*~ | |
random_generators_speed_* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
all: run | |
run: run0 | |
run%: random_generators_speed_% | |
./$< | |
random_generators_speed_0: random_generators_speed.cpp random.h Makefile | |
g++ -march=native -O0 -o $@ $< | |
random_generators_speed_1: random_generators_speed.cpp random.h Makefile | |
g++ -march=native -O1 -o $@ $< | |
random_generators_speed_2: random_generators_speed.cpp random.h Makefile | |
g++ -march=native -O2 -o $@ $< | |
random_generators_speed_3: random_generators_speed.cpp random.h Makefile | |
g++ -march=native -O3 -o $@ $< | |
clean: | |
-rm random_generators_speed_* | |
disassemble: random_generators_speed_0 | |
objdump -d -M att -CSl --no-show-raw-insn $< | awk -v RS= '/operator\(\)\(\)>:/' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2023 by Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
#include <strings.h> | |
/* This is a Marsaglia multiply-with-carry generator with period | |
approximately 2^127. It is close in speed to a scrambled linear | |
generator, as its only 128-bit operations are a multiplication and sum; | |
it is an excellent generator based on congruential arithmetic. | |
As all MWC generators, it simulates a multiplicative LCG with | |
prime modulus m = 0xffebb71d94fcdaf8ffffffffffffffff and | |
multiplier given by the inverse of 2^64 modulo m. The modulus has a | |
particular form, which creates some theoretical issues, but at this | |
size a generator of this kind passes all known statistical tests, | |
except for very large size birthday-spacings tests in three dimensions | |
(as predicted from the theory). These failures are unlikely to have | |
any impact in practice. | |
For a generator of the same type with stronger theoretical guarantees, | |
consider a Goresky-Klapper generalized multiply-with-carry generator. | |
Alternatively, applying a xorshift to the result (i.e., returning | |
x ^ (x << 32) instead of x) will eliminate the birthday-spacings | |
failures. | |
Note that a previous version had a different MWC_A1. Moreover, now | |
the result is computed using the current state. | |
*/ | |
#define MWC_A1 0xffebb71d94fcdaf9 | |
/* The state must be initialized so that 0 < c < MWC_A1 - 1. | |
For simplicity, we suggest to set c = 1 and x to a 64-bit seed. */ | |
uint64_t x, c; | |
uint64_t inline next() { | |
const uint64_t result = x; // Or, result = x ^ (x << 32) (see above) | |
const __uint128_t t = MWC_A1 * (__uint128_t)x + c; | |
x = t; | |
c = t >> 64; | |
return result; | |
} | |
/* The following jump functions use a minimal multiprecision library. */ | |
#define MP_SIZE 3 | |
#include "mp.c" | |
static uint64_t mod[MP_SIZE] = { 0xffffffffffffffff, MWC_A1 - 1 }; | |
/* This is the jump function for the generator. It is equivalent | |
to 2^64 calls to next(); it can be used to generate 2^64 | |
non-overlapping subsequences for parallel computations. | |
Equivalent C++ Boost multiprecision code: | |
cpp_int b = cpp_int(1) << 64; | |
cpp_int m = MWC_A1 * b - 1; | |
cpp_int r = cpp_int("0x2f65fed2e8400983a72f9a3547208003"); | |
cpp_int s = ((x + c * b) * r) % m; | |
x = uint64_t(s); | |
c = uint64_t(s >> 64); | |
*/ | |
void jump(void) { | |
static uint64_t jump[MP_SIZE] = { 0xa72f9a3547208003, 0x2f65fed2e8400983 }; | |
uint64_t state[MP_SIZE] = { x, c }; | |
mul(state, jump, mod); | |
x = state[0]; | |
c = state[1]; | |
} | |
/* This is the long-jump function for the generator. It is equivalent to | |
2^96 calls to next(); it can be used to generate 2^32 starting points, | |
from each of which jump() will generate 2^32 non-overlapping | |
subsequences for parallel distributed computations. | |
Equivalent C++ Boost multiprecision code: | |
cpp_int b = cpp_int(1) << 64; | |
cpp_int m = MWC_A1 * b - 1; | |
cpp_int r = cpp_int("0x394649cfd6769c91e6f7814467f3fcdd"); | |
cpp_int s = ((x + c * b) * r) % m; | |
x = uint64_t(s); | |
c = uint64_t(s >> 64); | |
*/ | |
void long_jump(void) { | |
static uint64_t long_jump[MP_SIZE] = { 0xe6f7814467f3fcdd, 0x394649cfd6769c91 }; | |
uint64_t state[MP_SIZE] = { x, c }; | |
mul(state, long_jump, mod); | |
x = state[0]; | |
c = state[1]; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2023 by Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
#include <strings.h> | |
/* This is a Marsaglia multiply-with-carry generator with period | |
approximately 2^191. It is faster than a scrambled linear | |
generator, as its only 128-bit operations are a multiplication and sum; | |
it is an excellent generator based on congruential arithmetic. | |
As all MWC generators, it simulates a multiplicative LCG with prime | |
modulus m = 0xffa04e67b3c95d85ffffffffffffffffffffffffffffffff | |
and multiplier given by the inverse of 2^64 modulo m. The modulus has a | |
particular form, which creates some theoretical issues, but at this | |
size a generator of this kind passes all known statistical tests. For a | |
generator of the same type with stronger theoretical guarantees | |
consider a Goresky-Klapper generalized multiply-with-carry generator. | |
*/ | |
#define MWC_A2 0xffa04e67b3c95d86 | |
/* The state must be initialized so that 0 < c < MWC_A2 - 1. | |
For simplicity, we suggest to set c = 1 and x, y to a 128-bit seed. */ | |
uint64_t x, y, c; | |
uint64_t inline next() { | |
const uint64_t result = y; | |
const __uint128_t t = MWC_A2 * (__uint128_t)x + c; | |
x = y; | |
y = t; | |
c = t >> 64; | |
return result; | |
} | |
/* The following jump functions use a minimal multiprecision library. */ | |
#define MP_SIZE 4 | |
#include "mp.c" | |
static uint64_t mod[MP_SIZE] = { 0xffffffffffffffff, 0xffffffffffffffff, MWC_A2 - 1 }; | |
/* This is the jump function for the generator. It is equivalent | |
to 2^96 calls to next(); it can be used to generate 2^96 | |
non-overlapping subsequences for parallel computations. | |
Equivalent C++ Boost multiprecision code: | |
cpp_int b = cpp_int(1) << 64; | |
cpp_int b2 = b * b; | |
cpp_int m = MWC_A2 * b2 - 1; | |
cpp_int r = cpp_int("0xdc2be36e4bd21a2afc217e3b9edf985d94fb8d87c7c6437"); | |
cpp_int s = ((x + y * b + c * b2) * r) % m; | |
x = uint64_t(s); | |
y = uint64_t(s >> 64); | |
c = uint64_t(s >> 128); | |
*/ | |
void jump(void) { | |
static uint64_t jump[MP_SIZE] = { 0xd94fb8d87c7c6437, 0xafc217e3b9edf985, 0xdc2be36e4bd21a2 }; | |
uint64_t state[MP_SIZE] = { x, y, c }; | |
mul(state, jump, mod); | |
x = state[0]; | |
y = state[1]; | |
c = state[2]; | |
} | |
/* This is the long-jump function for the generator. It is equivalent to | |
2^144 calls to next(); it can be used to generate 2^96 starting points, | |
from each of which jump() will generate 2^96 non-overlapping | |
subsequences for parallel distributed computations. | |
Equivalent C++ Boost multiprecision code: | |
cpp_int b = cpp_int(1) << 64; | |
cpp_int b2 = b * b; | |
cpp_int m = MWC_A2 * b2 - 1; | |
cpp_int r = cpp_int("0x3c6528aaead6bbddec956c3909137b2dd0e7cedd16a0758e"); | |
cpp_int s = ((x + y * b + c * b2) * r) % m; | |
x = uint64_t(s); | |
y = uint64_t(s >> 64); | |
c = uint64_t(s >> 128); | |
*/ | |
void long_jump(void) { | |
static uint64_t long_jump[MP_SIZE] = { 0xd0e7cedd16a0758e, 0xec956c3909137b2d, 0x3c6528aaead6bbdd }; | |
uint64_t state[MP_SIZE] = { x, y, c }; | |
mul(state, long_jump, mod); | |
x = state[0]; | |
y = state[1]; | |
c = state[2]; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2021 by Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
#include <strings.h> | |
/* This is a Marsaglia multiply-with-carry generator with period | |
approximately 2^255. It is faster than a scrambled linear | |
generator, as its only 128-bit operations are a multiplication and sum; | |
it is an excellent generator based on congruential arithmetic. | |
As all MWC generators, it simulates a multiplicative LCG with prime | |
modulus m = 0xfff62cf2ccc0cdaeffffffffffffffffffffffffffffffffffffffffffffffff | |
and multiplier given by the inverse of 2^64 modulo m. The modulus has a | |
particular form, which creates some theoretical issues, but at this | |
size a generator of this kind passes all known statistical tests. For a | |
generator of the same type with stronger theoretical guarantees | |
consider a Goresky-Klapper generalized multiply-with-carry generator. | |
Note that a previous version had a different MWC_A3. Moreover, now | |
the result is computed using the current state. | |
*/ | |
#define MWC_A3 0xfff62cf2ccc0cdaf | |
/* The state must be initialized so that 0 < c < MWC_A3 - 1. | |
For simplicity, we suggest to set c = 1 and x, y, z to a 192-bit seed. */ | |
uint64_t x, y, z, c; | |
uint64_t inline next() { | |
const uint64_t result = z; | |
const __uint128_t t = MWC_A3 * (__uint128_t)x + c; | |
x = y; | |
y = z; | |
z = t; | |
c = t >> 64; | |
return result; | |
} | |
/* The following jump functions use a minimal multiprecision library. */ | |
#define MP_SIZE 5 | |
#include "mp.c" | |
static uint64_t mod[MP_SIZE] = { 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, MWC_A3 - 1 }; | |
/* This is the jump function for the generator. It is equivalent | |
to 2^128 calls to next(); it can be used to generate 2^128 | |
non-overlapping subsequences for parallel computations. | |
Equivalent C++ Boost multiprecision code: | |
cpp_int b = cpp_int(1) << 64; | |
cpp_int b3 = b * b * b; | |
cpp_int m = MWC_A3 * b3 - 1; | |
cpp_int r = cpp_int("0x4b89aa2cd51c37b9f6f8c3fd02ec98fbfe88c291203b225428c3ff11313847eb"); | |
cpp_int s = ((x + y * b + z * b * b + c * b3) * r) % m; | |
x = uint64_t(s); | |
y = uint64_t(s >> 64); | |
z = uint64_t(s >> 128); | |
c = uint64_t(s >> 192); | |
*/ | |
void jump(void) { | |
static uint64_t jump[MP_SIZE] = { 0x28c3ff11313847eb, 0xfe88c291203b2254, 0xf6f8c3fd02ec98fb, 0x4b89aa2cd51c37b9 }; | |
uint64_t state[MP_SIZE] = { x, y, z, c }; | |
mul(state, jump, mod); | |
x = state[0]; | |
y = state[1]; | |
z = state[2]; | |
c = state[3]; | |
} | |
/* This is the long-jump function for the generator. It is equivalent to | |
2^192 calls to next(); it can be used to generate 2^64 starting points, | |
from each of which jump() will generate 2^64 non-overlapping | |
subsequences for parallel distributed computations. | |
Equivalent C++ Boost multiprecision code: | |
cpp_int b = cpp_int(1) << 64; | |
cpp_int b3 = b * b * b; | |
cpp_int m = MWC_A3 * b3 - 1; | |
cpp_int r = cpp_int("0xaf5ca22408cdc8306c40ce860e0d702f95382f758ac987764c6e39cf92f77a4"); | |
cpp_int s = ((x + y * b + z * b * b + c * b3) * r) % m; | |
x = uint64_t(s); | |
y = uint64_t(s >> 64); | |
z = uint64_t(s >> 128); | |
c = uint64_t(s >> 192); | |
*/ | |
void long_jump(void) { | |
static uint64_t long_jump[MP_SIZE] = { 0x64c6e39cf92f77a4, 0xf95382f758ac9877, 0x6c40ce860e0d702, 0xaf5ca22408cdc83 }; | |
uint64_t state[MP_SIZE] = { x, y, z, c }; | |
mul(state, long_jump, mod); | |
x = state[0]; | |
y = state[1]; | |
z = state[2]; | |
c = state[3]; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Copyright (c) 2018 Arvid Gerstmann. */ | |
/* Copyright (c) 2024 Masataro Asai. */ | |
/* This code is licensed under MIT license. */ | |
#ifndef AG_RANDOM_H | |
#define AG_RANDOM_H | |
#include <immintrin.h> | |
class generator | |
{ | |
public: | |
virtual ~generator() = default; | |
using result_type = uint32_t; | |
static constexpr result_type (min)() { return 0; } | |
static constexpr result_type (max)() { return UINT32_MAX; } | |
void seed() | |
{ | |
std::random_device rd; // non-deterministic source if possible; otherwise a PRNG | |
seed(rd); | |
} | |
void seed(std::random_device &rd) | |
{ | |
seed(rd()); | |
} | |
void seed(std::uint32_t s) | |
{ | |
std::seed_seq seq{s}; | |
seed(seq); | |
} | |
void seed(std::seed_seq &seq) | |
{ | |
std::vector<std::uint32_t> seeds(32); // support up to 1024 bit state | |
seq.generate(seeds.begin(), seeds.end()); | |
seed(seeds); | |
} | |
result_type operator()(); | |
void discard(unsigned long long n) | |
{ | |
for (unsigned long long i = 0; i < n; ++i) | |
operator()(); | |
} | |
protected: | |
virtual void seed(std::vector<std::uint32_t> &seeds) = 0; | |
}; | |
class generator64 : public generator | |
{ | |
public: | |
virtual ~generator64() = default; | |
using generator::seed; | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
m_seed = uint64_t(seeds[0]) << 31 | uint64_t(seeds[1]); | |
} | |
uint64_t m_seed; | |
}; | |
class generator64x2 : public generator | |
{ | |
public: | |
virtual ~generator64x2() = default; | |
using generator::seed; | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
for (int i = 0; i < 2; i++){ | |
s[i] = uint64_t(seeds[2*i]) << 31 | uint64_t(seeds[2*i+1]); | |
} | |
} | |
uint64_t s[2]; | |
}; | |
class generator64x4 : public generator | |
{ | |
public: | |
virtual ~generator64x4() = default; | |
using generator::seed; | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
for (int i = 0; i < 4; i++){ | |
s[i] = uint64_t(seeds[2*i]) << 31 | uint64_t(seeds[2*i+1]); | |
} | |
} | |
uint64_t s[4]; | |
}; | |
class generator64x8 : public generator | |
{ | |
public: | |
virtual ~generator64x8() = default; | |
using generator::seed; | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
for (int i = 0; i < 8; i++){ | |
s[i] = uint64_t(seeds[2*i]) << 31 | uint64_t(seeds[2*i+1]); | |
} | |
} | |
uint64_t s[8]; | |
}; | |
class generator64x16 : public generator | |
{ | |
public: | |
virtual ~generator64x16() = default; | |
using generator::seed; | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
for (int i = 0; i < 16; i++){ | |
s[i] = uint64_t(seeds[2*i]) << 31 | uint64_t(seeds[2*i+1]); | |
} | |
} | |
uint64_t s[16]; | |
}; | |
class generator32 : public generator | |
{ | |
public: | |
virtual ~generator32() = default; | |
using generator::seed; | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
m_seed = seeds[0]; | |
} | |
uint32_t m_seed; | |
}; | |
class generator32x2 : public generator | |
{ | |
public: | |
virtual ~generator32x2() = default; | |
using generator::seed; | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
for (int i = 0; i < 2; i++){ | |
s[i] = seeds[i]; | |
} | |
} | |
uint32_t s[2]; | |
}; | |
class generator32x4 : public generator | |
{ | |
public: | |
virtual ~generator32x4() = default; | |
using generator::seed; | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
for (int i = 0; i < 4; i++){ | |
s[i] = seeds[i]; | |
} | |
} | |
uint32_t s[4]; | |
}; | |
class generator32x8 : public generator | |
{ | |
public: | |
virtual ~generator32x8() = default; | |
using generator::seed; | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
for (int i = 0; i < 8; i++){ | |
s[i] = seeds[i]; | |
} | |
} | |
uint32_t s[8]; | |
}; | |
class generator32x16 : public generator | |
{ | |
public: | |
virtual ~generator32x16() = default; | |
using generator::seed; | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
for (int i = 0; i < 16; i++){ | |
s[i] = seeds[i]; | |
} | |
} | |
uint32_t s[16]; | |
}; | |
class generator32x32 : public generator | |
{ | |
public: | |
virtual ~generator32x32() = default; | |
using generator::seed; | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
for (int i = 0; i < 32; i++){ | |
s[i] = seeds[i]; | |
} | |
} | |
uint32_t s[32]; | |
}; | |
class splitmix : public generator64 | |
{ | |
public: | |
virtual ~splitmix() = default; | |
using generator::seed; | |
splitmix() { seed(); } | |
splitmix(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
uint64_t z = (m_seed += UINT64_C(0x9E3779B97F4A7C15)); | |
z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9); | |
z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB); | |
return result_type((z ^ (z >> 31)) >> 31); | |
} | |
}; | |
class trunc_xorshift32star : public generator64 | |
{ | |
public: | |
virtual ~trunc_xorshift32star() = default; | |
using generator::seed; | |
trunc_xorshift32star() { seed(); } | |
trunc_xorshift32star(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
uint64_t result = m_seed * 0xd989bcacc137dcd5ull; | |
m_seed ^= m_seed >> 11; | |
m_seed ^= m_seed << 31; | |
m_seed ^= m_seed >> 18; | |
return uint32_t(result >> 32ull); | |
} | |
}; | |
class xorshift32star : public generator64 | |
{ | |
public: | |
virtual ~xorshift32star() = default; | |
using generator::seed; | |
xorshift32star() { seed(); } | |
xorshift32star(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
uint64_t x; | |
uint32_t answer; | |
x = m_seed; | |
x = x ^ (x >> 12); | |
x = x ^ (x << 25); | |
x = x ^ (x >> 27); | |
m_seed = x; | |
answer = ((x * MAGIC) >> 32); | |
return answer; | |
} | |
private: | |
const uint64_t MAGIC = 0x2545F4914F6CDD1D; | |
}; | |
class xorshift32 : public generator32 | |
{ | |
public: | |
virtual ~xorshift32() = default; | |
using generator::seed; | |
xorshift32() { seed(); } | |
xorshift32(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
uint64_t x = m_seed; | |
x ^= x << 13; | |
x ^= x >> 17; | |
x ^= x << 5; | |
return m_seed = x; | |
} | |
}; | |
class xorshift64 : public generator64 | |
{ | |
public: | |
virtual ~xorshift64() = default; | |
using generator::seed; | |
xorshift64() { seed(); } | |
xorshift64(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
uint64_t x = m_seed; | |
x ^= x << 13; | |
x ^= x >> 7; | |
x ^= x << 17; | |
return m_seed = x; | |
} | |
}; | |
// compute 4 numbers at once with SSE. | |
class xorshift32x4 : public generator32x4 | |
{ | |
int counter = -1; | |
public: | |
virtual ~xorshift32x4() = default; | |
using generator::seed; | |
xorshift32x4() { seed(); } | |
xorshift32x4(std::uint32_t seed) {generator::seed(seed);} | |
result_type operator()() | |
{ | |
counter++; | |
int cycle = counter % 4; | |
result_type result = s[cycle]; | |
if (cycle == 3){ | |
// uint64_t x = m_seed; | |
__m128i x = _mm_loadu_si128((__m128i*) s); | |
// x ^= x << 13; | |
x = _mm_xor_epi32(x, _mm_slli_epi32(x, 13)); | |
// x ^= x >> 7; | |
x = _mm_xor_epi32(x, _mm_srli_epi32(x, 17)); | |
// x ^= x << 17; | |
x = _mm_xor_epi32(x, _mm_slli_epi32(x, 5)); | |
// return m_seed = x; | |
_mm_storeu_si128((__m128i*) s, x); | |
} | |
return result; | |
} | |
}; | |
// compute 4 numbers at once with SSE. | |
class xorshift64x4 : public generator64x4 | |
{ | |
int counter = -1; | |
public: | |
virtual ~xorshift64x4() = default; | |
using generator::seed; | |
xorshift64x4() { seed(); } | |
xorshift64x4(std::uint32_t seed) { generator::seed(seed); } | |
result_type operator()() | |
{ | |
counter++; | |
int cycle = counter % 4; | |
result_type result = s[cycle]; | |
if (cycle == 3){ | |
// uint64_t x = m_seed; | |
__m256i x = _mm256_loadu_si256((__m256i*) s); | |
// x ^= x << 13; | |
x = _mm256_xor_epi64(x, _mm256_slli_epi64(x, 13)); | |
// x ^= x >> 7; | |
x = _mm256_xor_epi64(x, _mm256_srli_epi64(x, 7)); | |
// x ^= x << 17; | |
x = _mm256_xor_epi64(x, _mm256_slli_epi64(x, 17)); | |
// return m_seed = x; | |
_mm256_storeu_si256((__m256i*) s, x); | |
} | |
return result; | |
} | |
}; | |
// compute 8 numbers at once with SSE. | |
class xorshift32x8 : public generator32x8 | |
{ | |
int counter = -1; | |
public: | |
virtual ~xorshift32x8() = default; | |
using generator::seed; | |
xorshift32x8() { seed(); } | |
xorshift32x8(std::uint32_t seed) { generator::seed(seed); } | |
result_type operator()() | |
{ | |
counter++; | |
int cycle = counter % 8; | |
result_type result = s[cycle]; | |
if (cycle == 7){ | |
// uint64_t x = m_seed; | |
__m256i x = _mm256_loadu_si256((__m256i*) s); | |
// x ^= x << 13; | |
x = _mm256_xor_epi32(x, _mm256_slli_epi32(x, 13)); | |
// x ^= x >> 7; | |
x = _mm256_xor_epi32(x, _mm256_srli_epi32(x, 17)); | |
// x ^= x << 17; | |
x = _mm256_xor_epi32(x, _mm256_slli_epi32(x, 5)); | |
// return m_seed = x; | |
_mm256_storeu_si256((__m256i*) s, x); | |
} | |
return result; | |
} | |
}; | |
// compute 16 numbers at once with AVX512. | |
class xorshift32x16 : public generator32x16 | |
{ | |
int counter = -1; | |
public: | |
virtual ~xorshift32x16() = default; | |
using generator::seed; | |
xorshift32x16() { seed(); } | |
xorshift32x16(std::uint32_t seed) { generator::seed(seed); } | |
result_type operator()() | |
{ | |
counter++; | |
int cycle = counter % 16; | |
result_type result = s[cycle]; | |
if (cycle == 15){ | |
// uint64_t x = m_seed; | |
__m512i x = _mm512_loadu_si512((__m512i*) s); | |
// x ^= x << 13; | |
x = _mm512_xor_epi32(x, _mm512_slli_epi32(x, 13)); | |
// x ^= x >> 7; | |
x = _mm512_xor_epi32(x, _mm512_srli_epi32(x, 17)); | |
// x ^= x << 17; | |
x = _mm512_xor_epi32(x, _mm512_slli_epi32(x, 5)); | |
// return m_seed = x; | |
_mm512_storeu_si512((__m512i*) s, x); | |
} | |
return result; | |
} | |
}; | |
// https://www.pcg-random.org/download.html#minimal-c-implementation | |
class pcg : public generator64x2 | |
{ | |
public: | |
virtual ~pcg() = default; | |
using generator::seed; | |
pcg() { seed();} | |
pcg(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
uint64_t oldstate = s[0]; | |
s[0] = oldstate * 6364136223846793005ULL + s[1]; | |
uint32_t xorshifted = uint32_t(((oldstate >> 18u) ^ oldstate) >> 27u); | |
int rot = oldstate >> 59u; | |
return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); | |
} | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
// proper seeding | |
uint64_t s0 = uint64_t(seeds[0]) << 31 | uint64_t(seeds[1]); | |
uint64_t s1 = uint64_t(seeds[2]) << 31 | uint64_t(seeds[3]); | |
s[0] = 0; | |
s[1] = (s1 << 1) | 1; | |
operator()(); | |
s[0] += s0; | |
operator()(); | |
} | |
}; | |
static uint32_t rotl(const uint32_t x, int k) __attribute__((always_inline)); | |
static inline uint32_t rotl(const uint32_t x, int k) { | |
return (x << k) | (x >> (32 - k)); | |
} | |
class xoshiro128plusplus : public generator32x4 | |
{ | |
public: | |
virtual ~xoshiro128plusplus() = default; | |
using generator::seed; | |
xoshiro128plusplus() { generator::seed(1); } | |
xoshiro128plusplus(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
const uint32_t result = rotl(s[0] + s[3], 7) + s[0]; | |
const uint32_t t = s[1] << 9; | |
s[2] ^= s[0]; | |
s[3] ^= s[1]; | |
s[1] ^= s[2]; | |
s[0] ^= s[3]; | |
s[2] ^= t; | |
s[3] = rotl(s[3], 11); | |
return result; | |
} | |
}; | |
class xoshiro128starstar : public generator32x4 | |
{ | |
public: | |
virtual ~xoshiro128starstar() = default; | |
using generator::seed; | |
xoshiro128starstar() { generator::seed(1); } | |
xoshiro128starstar(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
const uint32_t result = rotl(s[1] * 5, 7) * 9; | |
const uint32_t t = s[1] << 9; | |
s[2] ^= s[0]; | |
s[3] ^= s[1]; | |
s[1] ^= s[2]; | |
s[0] ^= s[3]; | |
s[2] ^= t; | |
s[3] = rotl(s[3], 11); | |
return result; | |
} | |
}; | |
// note : dispatching. | |
static __m128i rotl(const __m128i x, int k) __attribute__((always_inline)); | |
static inline __m128i rotl(const __m128i x, const int k) { | |
return _mm_or_epi32(_mm_slli_epi32(x, k), _mm_slli_epi32(x, (32 - k))); | |
} | |
class xoshiro128plusplusx4 : public generator32x16 | |
{ | |
int counter = -1; | |
result_type buffer[4]; | |
public: | |
virtual ~xoshiro128plusplusx4() = default; | |
using generator::seed; | |
xoshiro128plusplusx4() { generator::seed(1); } | |
xoshiro128plusplusx4(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
counter++; | |
int cycle = counter % 4; | |
if (cycle == 0){ | |
__m128i s0 = _mm_loadu_si128((__m128i*) s); | |
__m128i s1 = _mm_loadu_si128(((__m128i*) s)+1); | |
__m128i s2 = _mm_loadu_si128(((__m128i*) s)+2); | |
__m128i s3 = _mm_loadu_si128(((__m128i*) s)+3); | |
// const uint32_t result = rotl(s[0] + s[3], 7) + s[0]; | |
const __m128i results = _mm_add_epi32(rotl(_mm_add_epi32(s0, s3), 7), s0); | |
// const uint32_t t = s[1] << 9; | |
const __m128i t = _mm_slli_epi32(s1, 9); | |
// s[2] ^= s[0]; | |
s2 = _mm_xor_epi32(s2, s0); | |
// s[3] ^= s[1]; | |
s3 = _mm_xor_epi32(s3, s1); | |
// s[1] ^= s[2]; | |
s1 = _mm_xor_epi32(s1, s2); | |
// s[0] ^= s[3]; | |
s0 = _mm_xor_epi32(s0, s3); | |
// s[2] ^= t; | |
s2 = _mm_xor_epi32(s2, t); | |
// s[3] = rotl(s[3], 11); | |
s3 = rotl(s3, 11); | |
_mm_storeu_si128((__m128i*) s, s0); | |
_mm_storeu_si128(((__m128i*) s)+1, s1); | |
_mm_storeu_si128(((__m128i*) s)+2, s2); | |
_mm_storeu_si128(((__m128i*) s)+3, s3); | |
_mm_storeu_si128((__m128i*) buffer, results); | |
} | |
return buffer[cycle]; | |
} | |
}; | |
class xoshiro128starstarx4 : public generator32x16 | |
{ | |
int counter = -1; | |
result_type buffer[4]; | |
public: | |
virtual ~xoshiro128starstarx4() = default; | |
using generator::seed; | |
xoshiro128starstarx4() { generator::seed(1); } | |
xoshiro128starstarx4(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
counter++; | |
int cycle = counter % 4; | |
if (cycle == 0){ | |
__m128i s0 = _mm_loadu_si128((__m128i*) s); | |
__m128i s1 = _mm_loadu_si128(((__m128i*) s)+1); | |
__m128i s2 = _mm_loadu_si128(((__m128i*) s)+2); | |
__m128i s3 = _mm_loadu_si128(((__m128i*) s)+3); | |
const __m128i c5 = _mm_set1_epi32(5); | |
const __m128i c9 = _mm_set1_epi32(9); | |
// const uint32_t result = rotl(s[1] * 5, 7) * 9; | |
const __m128i results = _mm_mul_epi32(rotl(_mm_mul_epi32(s1, c5), 7), c9); | |
// const uint32_t t = s[1] << 9; | |
const __m128i t = _mm_slli_epi32(s1, 9); | |
// s[2] ^= s[0]; | |
s2 = _mm_xor_epi32(s2, s0); | |
// s[3] ^= s[1]; | |
s3 = _mm_xor_epi32(s3, s1); | |
// s[1] ^= s[2]; | |
s1 = _mm_xor_epi32(s1, s2); | |
// s[0] ^= s[3]; | |
s0 = _mm_xor_epi32(s0, s3); | |
// s[2] ^= t; | |
s2 = _mm_xor_epi32(s2, t); | |
// s[3] = rotl(s[3], 11); | |
s3 = rotl(s3, 11); | |
_mm_storeu_si128((__m128i*) s, s0); | |
_mm_storeu_si128(((__m128i*) s)+1, s1); | |
_mm_storeu_si128(((__m128i*) s)+2, s2); | |
_mm_storeu_si128(((__m128i*) s)+3, s3); | |
_mm_storeu_si128((__m128i*) buffer, results); | |
} | |
return buffer[cycle]; | |
} | |
}; | |
// note : dispatching. | |
static __m256i rotl(const __m256i x, int k) __attribute__((always_inline)); | |
static inline __m256i rotl(const __m256i x, const int k) { | |
return _mm256_or_epi32(_mm256_slli_epi32(x, k), _mm256_slli_epi32(x, (32 - k))); | |
} | |
class xoshiro128plusplusx8 : public generator32x32 | |
{ | |
int counter = -1; | |
result_type buffer[4]; | |
public: | |
virtual ~xoshiro128plusplusx8() = default; | |
using generator::seed; | |
xoshiro128plusplusx8() { generator::seed(1); } | |
xoshiro128plusplusx8(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
counter++; | |
int cycle = counter % 8; | |
if (cycle == 0){ | |
__m256i s0 = _mm256_loadu_si256((__m256i*) s); | |
__m256i s1 = _mm256_loadu_si256(((__m256i*) s)+1); | |
__m256i s2 = _mm256_loadu_si256(((__m256i*) s)+2); | |
__m256i s3 = _mm256_loadu_si256(((__m256i*) s)+3); | |
// const uint32_t result = rotl(s[0] + s[3], 7) + s[0]; | |
const __m256i results = _mm256_add_epi32(rotl(_mm256_add_epi32(s0, s3), 7), s0); | |
// const uint32_t t = s[1] << 9; | |
const __m256i t = _mm256_slli_epi32(s1, 9); | |
// s[2] ^= s[0]; | |
s2 = _mm256_xor_epi32(s2, s0); | |
// s[3] ^= s[1]; | |
s3 = _mm256_xor_epi32(s3, s1); | |
// s[1] ^= s[2]; | |
s1 = _mm256_xor_epi32(s1, s2); | |
// s[0] ^= s[3]; | |
s0 = _mm256_xor_epi32(s0, s3); | |
// s[2] ^= t; | |
s2 = _mm256_xor_epi32(s2, t); | |
// s[3] = rotl(s[3], 11); | |
s3 = rotl(s3, 11); | |
_mm256_storeu_si256((__m256i*) s, s0); | |
_mm256_storeu_si256(((__m256i*) s)+1, s1); | |
_mm256_storeu_si256(((__m256i*) s)+2, s2); | |
_mm256_storeu_si256(((__m256i*) s)+3, s3); | |
_mm256_storeu_si256((__m256i*) buffer, results); | |
} | |
return buffer[cycle]; | |
} | |
}; | |
class xoshiro128starstarx8 : public generator32x32 | |
{ | |
int counter = -1; | |
result_type buffer[4]; | |
public: | |
virtual ~xoshiro128starstarx8() = default; | |
using generator::seed; | |
xoshiro128starstarx8() { generator::seed(1); } | |
xoshiro128starstarx8(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
counter++; | |
int cycle = counter % 8; | |
if (cycle == 0){ | |
__m256i s0 = _mm256_loadu_si256((__m256i*) s); | |
__m256i s1 = _mm256_loadu_si256(((__m256i*) s)+1); | |
__m256i s2 = _mm256_loadu_si256(((__m256i*) s)+2); | |
__m256i s3 = _mm256_loadu_si256(((__m256i*) s)+3); | |
const __m256i c5 = _mm256_set1_epi32(5); | |
const __m256i c9 = _mm256_set1_epi32(9); | |
// const uint32_t result = rotl(s[1] * 5, 7) * 9; | |
const __m256i results = _mm256_mul_epi32(rotl(_mm256_mul_epi32(s1, c5), 7), c9); | |
// const uint32_t t = s[1] << 9; | |
const __m256i t = _mm256_slli_epi32(s1, 9); | |
// s[2] ^= s[0]; | |
s2 = _mm256_xor_epi32(s2, s0); | |
// s[3] ^= s[1]; | |
s3 = _mm256_xor_epi32(s3, s1); | |
// s[1] ^= s[2]; | |
s1 = _mm256_xor_epi32(s1, s2); | |
// s[0] ^= s[3]; | |
s0 = _mm256_xor_epi32(s0, s3); | |
// s[2] ^= t; | |
s2 = _mm256_xor_epi32(s2, t); | |
// s[3] = rotl(s[3], 11); | |
s3 = rotl(s3, 11); | |
_mm256_storeu_si256((__m256i*) s, s0); | |
_mm256_storeu_si256(((__m256i*) s)+1, s1); | |
_mm256_storeu_si256(((__m256i*) s)+2, s2); | |
_mm256_storeu_si256(((__m256i*) s)+3, s3); | |
_mm256_storeu_si256((__m256i*) buffer, results); | |
} | |
return buffer[cycle]; | |
} | |
}; | |
// from George Marsaglia (1994) | |
// https://groups.google.com/g/sci.stat.math/c/p7aLW3TsJys/m/QGb1kti6kN0J | |
#define MWC_A0 1111111464 | |
class MWC64 : public generator32x2 | |
{ | |
public: | |
virtual ~MWC64() = default; | |
using generator::seed; | |
/* The state must be initialized so that 0 < c < MWC_A0 - 1. | |
For simplicity, we suggest to set c = 1 and x to a 32-bit seed. */ | |
MWC64() { seed(); } | |
MWC64(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
const uint32_t result = s[0]; | |
const uint64_t t = MWC_A0 * (uint64_t)s[0] + s[1]; | |
s[0] = t; | |
s[1] = t >> 32; | |
return result; | |
} | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
s[0] = seeds[0]; | |
s[1] = seeds[1] / (MWC_A0 - 1); | |
} | |
}; | |
#define MWC_A1 0xffebb71d94fcdaf9 | |
class MWC128 : public generator64x2 | |
{ | |
public: | |
virtual ~MWC128() = default; | |
using generator::seed; | |
/* The state must be initialized so that 0 < c < MWC_A1 - 1. | |
For simplicity, we suggest to set c = 1 and x to a 64-bit seed. */ | |
MWC128() { seed(); } | |
MWC128(std::uint32_t s) {seed(s);} | |
result_type operator()() | |
{ | |
const uint64_t result = s[0]; | |
const __uint128_t t = MWC_A1 * (__uint128_t)s[0] + s[1]; | |
s[0] = t; | |
s[1] = t >> 64; | |
return result; | |
} | |
protected: | |
void seed(std::vector<std::uint32_t> &seeds) | |
{ | |
s[0] = uint64_t(seeds[0]) << 31 | uint64_t(seeds[1]); | |
s[1] = uint64_t(seeds[2]) << 31 | uint64_t(seeds[3]); | |
s[1] = s[1] / (MWC_A1 - 1); | |
} | |
}; | |
#endif /* AG_RANDOM_H */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <random> | |
#include <chrono> | |
#include <iomanip> | |
#include "random.h" | |
template <typename RandomEngineType> void benchmark_random_generator(const std::string &name) { | |
RandomEngineType generator(42); | |
generator.seed(5); | |
constexpr int nIterations = 300000000; | |
std::cout << std::setw(20) << name << std::setw(0); | |
// check sequence | |
// std::cout << " first 8: " ; | |
// for (int i = 0; i < 8; ++i) { | |
// std::cout << std::setw(0) << " 0x" | |
// << std::hex | |
// << std::setw(8) << std::setfill('0') | |
// << generator() | |
// << std::setw(0) << std::setfill(' '); | |
// | |
// } | |
const auto start = std::chrono::steady_clock::now(); | |
for (int i = 0; i < nIterations; ++i) { | |
volatile int n = generator(); | |
} | |
const auto end = std::chrono::steady_clock::now(); | |
std::cout << std::setprecision(3) | |
<< " | " | |
<< std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() / static_cast<double>(nIterations) | |
<< " ns / num\n"; | |
}; | |
int main() { | |
benchmark_random_generator<trunc_xorshift32star>("trunc_xorshift32star"); | |
benchmark_random_generator<xorshift32star>("xorshift32star"); | |
benchmark_random_generator<xorshift64>("xorshift32"); | |
benchmark_random_generator<xorshift64>("xorshift64"); | |
benchmark_random_generator<xorshift32x4>("xorshift32x4"); | |
benchmark_random_generator<xorshift64x4>("xorshift64x4"); | |
benchmark_random_generator<xorshift32x8>("xorshift32x8"); | |
benchmark_random_generator<xorshift32x16>("xorshift32x16"); | |
benchmark_random_generator<splitmix>("splitmix"); | |
benchmark_random_generator<pcg>("pcg"); | |
benchmark_random_generator<xoshiro128plusplus>("xoshiro128plusplus"); | |
benchmark_random_generator<xoshiro128starstar>("xoshiro128starstar"); | |
benchmark_random_generator<xoshiro128plusplusx4>("xoshiro128plusplusx4"); | |
benchmark_random_generator<xoshiro128starstarx4>("xoshiro128starstarx4"); | |
benchmark_random_generator<xoshiro128plusplusx8>("xoshiro128plusplusx8"); | |
benchmark_random_generator<xoshiro128starstarx8>("xoshiro128starstarx8"); | |
benchmark_random_generator<MWC128>("MWC128"); | |
benchmark_random_generator<MWC64>("MWC64"); | |
benchmark_random_generator<std::minstd_rand>("minstd_rand"); | |
benchmark_random_generator<std::minstd_rand0>("minstd_rand0"); | |
benchmark_random_generator<std::mt19937>("mt19937"); | |
benchmark_random_generator<std::mt19937_64>("mt19937_64"); | |
benchmark_random_generator<std::ranlux24_base>("ranlux24_base"); | |
benchmark_random_generator<std::ranlux48_base>("ranlux48_base"); | |
// benchmark_random_generator<std::ranlux24>("ranlux24"); // very slow | |
// benchmark_random_generator<std::ranlux48>("ranlux48"); // very slow | |
benchmark_random_generator<std::knuth_b>("knuth_b"); | |
return EXIT_SUCCESS; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2016-2018 by David Blackman and Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
/* This is xoroshiro128+ 1.0, our best and fastest small-state generator | |
for floating-point numbers, but its state space is large enough only | |
for mild parallelism. We suggest to use its upper bits for | |
floating-point generation, as it is slightly faster than | |
xoroshiro128++/xoroshiro128**. It passes all tests we are aware of | |
except for the four lower bits, which might fail linearity tests (and | |
just those), so if low linear complexity is not considered an issue (as | |
it is usually the case) it can be used to generate 64-bit outputs, too; | |
moreover, this generator has a very mild Hamming-weight dependency | |
making our test (http://prng.di.unimi.it/hwd.php) fail after 5 TB of | |
output; we believe this slight bias cannot affect any application. If | |
you are concerned, use xoroshiro128++, xoroshiro128** or xoshiro256+. | |
We suggest to use a sign test to extract a random Boolean value, and | |
right shifts to extract subsets of bits. | |
The state must be seeded so that it is not everywhere zero. If you have | |
a 64-bit seed, we suggest to seed a splitmix64 generator and use its | |
output to fill s. | |
NOTE: the parameters (a=24, b=16, b=37) of this version give slightly | |
better results in our test than the 2016 version (a=55, b=14, c=36). | |
*/ | |
static inline uint64_t rotl(const uint64_t x, int k) { | |
return (x << k) | (x >> (64 - k)); | |
} | |
static uint64_t s[2]; | |
uint64_t next(void) { | |
const uint64_t s0 = s[0]; | |
uint64_t s1 = s[1]; | |
const uint64_t result = s0 + s1; | |
s1 ^= s0; | |
s[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16); // a, b | |
s[1] = rotl(s1, 37); // c | |
return result; | |
} | |
/* This is the jump function for the generator. It is equivalent | |
to 2^64 calls to next(); it can be used to generate 2^64 | |
non-overlapping subsequences for parallel computations. */ | |
void jump(void) { | |
static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc }; | |
uint64_t s0 = 0; | |
uint64_t s1 = 0; | |
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) | |
for(int b = 0; b < 64; b++) { | |
if (JUMP[i] & UINT64_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
} | |
/* This is the long-jump function for the generator. It is equivalent to | |
2^96 calls to next(); it can be used to generate 2^32 starting points, | |
from each of which jump() will generate 2^32 non-overlapping | |
subsequences for parallel distributed computations. */ | |
void long_jump(void) { | |
static const uint64_t LONG_JUMP[] = { 0xd2a98b26625eee7b, 0xdddf9b1090aa7ac1 }; | |
uint64_t s0 = 0; | |
uint64_t s1 = 0; | |
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++) | |
for(int b = 0; b < 64; b++) { | |
if (LONG_JUMP[i] & UINT64_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2019 by David Blackman and Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
/* This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid, | |
small-state generators. It is extremely (sub-ns) fast and it passes all | |
tests we are aware of, but its state space is large enough only for | |
mild parallelism. | |
For generating just floating-point numbers, xoroshiro128+ is even | |
faster (but it has a very mild bias, see notes in the comments). | |
The state must be seeded so that it is not everywhere zero. If you have | |
a 64-bit seed, we suggest to seed a splitmix64 generator and use its | |
output to fill s. */ | |
static inline uint64_t rotl(const uint64_t x, int k) { | |
return (x << k) | (x >> (64 - k)); | |
} | |
static uint64_t s[2]; | |
uint64_t next(void) { | |
const uint64_t s0 = s[0]; | |
uint64_t s1 = s[1]; | |
const uint64_t result = rotl(s0 + s1, 17) + s0; | |
s1 ^= s0; | |
s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b | |
s[1] = rotl(s1, 28); // c | |
return result; | |
} | |
/* This is the jump function for the generator. It is equivalent | |
to 2^64 calls to next(); it can be used to generate 2^64 | |
non-overlapping subsequences for parallel computations. */ | |
void jump(void) { | |
static const uint64_t JUMP[] = { 0x2bd7a6a6e99c2ddc, 0x0992ccaf6a6fca05 }; | |
uint64_t s0 = 0; | |
uint64_t s1 = 0; | |
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) | |
for(int b = 0; b < 64; b++) { | |
if (JUMP[i] & UINT64_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
} | |
/* This is the long-jump function for the generator. It is equivalent to | |
2^96 calls to next(); it can be used to generate 2^32 starting points, | |
from each of which jump() will generate 2^32 non-overlapping | |
subsequences for parallel distributed computations. */ | |
void long_jump(void) { | |
static const uint64_t LONG_JUMP[] = { 0x360fd5f2cf8d5d99, 0x9c6e6877736c46e3 }; | |
uint64_t s0 = 0; | |
uint64_t s1 = 0; | |
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++) | |
for(int b = 0; b < 64; b++) { | |
if (LONG_JUMP[i] & UINT64_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2018 by David Blackman and Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
/* This is xoroshiro128** 1.0, one of our all-purpose, rock-solid, | |
small-state generators. It is extremely (sub-ns) fast and it passes all | |
tests we are aware of, but its state space is large enough only for | |
mild parallelism. | |
For generating just floating-point numbers, xoroshiro128+ is even | |
faster (but it has a very mild bias, see notes in the comments). | |
The state must be seeded so that it is not everywhere zero. If you have | |
a 64-bit seed, we suggest to seed a splitmix64 generator and use its | |
output to fill s. */ | |
static inline uint64_t rotl(const uint64_t x, int k) { | |
return (x << k) | (x >> (64 - k)); | |
} | |
static uint64_t s[2]; | |
uint64_t next(void) { | |
const uint64_t s0 = s[0]; | |
uint64_t s1 = s[1]; | |
const uint64_t result = rotl(s0 * 5, 7) * 9; | |
s1 ^= s0; | |
s[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16); // a, b | |
s[1] = rotl(s1, 37); // c | |
return result; | |
} | |
/* This is the jump function for the generator. It is equivalent | |
to 2^64 calls to next(); it can be used to generate 2^64 | |
non-overlapping subsequences for parallel computations. */ | |
void jump(void) { | |
static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc }; | |
uint64_t s0 = 0; | |
uint64_t s1 = 0; | |
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) | |
for(int b = 0; b < 64; b++) { | |
if (JUMP[i] & UINT64_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
} | |
/* This is the long-jump function for the generator. It is equivalent to | |
2^96 calls to next(); it can be used to generate 2^32 starting points, | |
from each of which jump() will generate 2^32 non-overlapping | |
subsequences for parallel distributed computations. */ | |
void long_jump(void) { | |
static const uint64_t LONG_JUMP[] = { 0xd2a98b26625eee7b, 0xdddf9b1090aa7ac1 }; | |
uint64_t s0 = 0; | |
uint64_t s1 = 0; | |
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++) | |
for(int b = 0; b < 64; b++) { | |
if (LONG_JUMP[i] & UINT64_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2016 by David Blackman and Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
/* This is xoroshiro64* 1.0, our best and fastest 32-bit small-state | |
generator for 32-bit floating-point numbers. We suggest to use its | |
upper bits for floating-point generation, as it is slightly faster than | |
xoroshiro64**. It passes all tests we are aware of except for linearity | |
tests, as the lowest six bits have low linear complexity, so if low | |
linear complexity is not considered an issue (as it is usually the | |
case) it can be used to generate 32-bit outputs, too. | |
We suggest to use a sign test to extract a random Boolean value, and | |
right shifts to extract subsets of bits. | |
The state must be seeded so that it is not everywhere zero. */ | |
static inline uint32_t rotl(const uint32_t x, int k) { | |
return (x << k) | (x >> (32 - k)); | |
} | |
static uint32_t s[2]; | |
uint32_t next(void) { | |
const uint32_t s0 = s[0]; | |
uint32_t s1 = s[1]; | |
const uint32_t result = s0 * 0x9E3779BB; | |
s1 ^= s0; | |
s[0] = rotl(s0, 26) ^ s1 ^ (s1 << 9); // a, b | |
s[1] = rotl(s1, 13); // c | |
return result; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2018 by David Blackman and Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
/* This is xoroshiro64** 1.0, our 32-bit all-purpose, rock-solid, | |
small-state generator. It is extremely fast and it passes all tests we | |
are aware of, but its state space is not large enough for any parallel | |
application. | |
For generating just single-precision (i.e., 32-bit) floating-point | |
numbers, xoroshiro64* is even faster. | |
The state must be seeded so that it is not everywhere zero. */ | |
static inline uint32_t rotl(const uint32_t x, int k) { | |
return (x << k) | (x >> (32 - k)); | |
} | |
static uint32_t s[2]; | |
uint32_t next(void) { | |
const uint32_t s0 = s[0]; | |
uint32_t s1 = s[1]; | |
const uint32_t result = rotl(s0 * 0x9E3779BB, 5) * 5; | |
s1 ^= s0; | |
s[0] = rotl(s0, 26) ^ s1 ^ (s1 << 9); // a, b | |
s[1] = rotl(s1, 13); // c | |
return result; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2018 by David Blackman and Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
/* This is xoshiro128+ 1.0, our best and fastest 32-bit generator for 32-bit | |
floating-point numbers. We suggest to use its upper bits for | |
floating-point generation, as it is slightly faster than xoshiro128**. | |
It passes all tests we are aware of except for | |
linearity tests, as the lowest four bits have low linear complexity, so | |
if low linear complexity is not considered an issue (as it is usually | |
the case) it can be used to generate 32-bit outputs, too. | |
We suggest to use a sign test to extract a random Boolean value, and | |
right shifts to extract subsets of bits. | |
The state must be seeded so that it is not everywhere zero. */ | |
static inline uint32_t rotl(const uint32_t x, int k) { | |
return (x << k) | (x >> (32 - k)); | |
} | |
static uint32_t s[4]; | |
uint32_t next(void) { | |
const uint32_t result = s[0] + s[3]; | |
const uint32_t t = s[1] << 9; | |
s[2] ^= s[0]; | |
s[3] ^= s[1]; | |
s[1] ^= s[2]; | |
s[0] ^= s[3]; | |
s[2] ^= t; | |
s[3] = rotl(s[3], 11); | |
return result; | |
} | |
/* This is the jump function for the generator. It is equivalent | |
to 2^64 calls to next(); it can be used to generate 2^64 | |
non-overlapping subsequences for parallel computations. */ | |
void jump(void) { | |
static const uint32_t JUMP[] = { 0x8764000b, 0xf542d2d3, 0x6fa035c3, 0x77f2db5b }; | |
uint32_t s0 = 0; | |
uint32_t s1 = 0; | |
uint32_t s2 = 0; | |
uint32_t s3 = 0; | |
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) | |
for(int b = 0; b < 32; b++) { | |
if (JUMP[i] & UINT32_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
s2 ^= s[2]; | |
s3 ^= s[3]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
s[2] = s2; | |
s[3] = s3; | |
} | |
/* This is the long-jump function for the generator. It is equivalent to | |
2^96 calls to next(); it can be used to generate 2^32 starting points, | |
from each of which jump() will generate 2^32 non-overlapping | |
subsequences for parallel distributed computations. */ | |
void long_jump(void) { | |
static const uint32_t LONG_JUMP[] = { 0xb523952e, 0x0b6f099f, 0xccf5a0ef, 0x1c580662 }; | |
uint32_t s0 = 0; | |
uint32_t s1 = 0; | |
uint32_t s2 = 0; | |
uint32_t s3 = 0; | |
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++) | |
for(int b = 0; b < 32; b++) { | |
if (LONG_JUMP[i] & UINT32_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
s2 ^= s[2]; | |
s3 ^= s[3]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
s[2] = s2; | |
s[3] = s3; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2019 by David Blackman and Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
/* This is xoshiro128++ 1.0, one of our 32-bit all-purpose, rock-solid | |
generators. It has excellent speed, a state size (128 bits) that is | |
large enough for mild parallelism, and it passes all tests we are aware | |
of. | |
For generating just single-precision (i.e., 32-bit) floating-point | |
numbers, xoshiro128+ is even faster. | |
The state must be seeded so that it is not everywhere zero. */ | |
static inline uint32_t rotl(const uint32_t x, int k) { | |
return (x << k) | (x >> (32 - k)); | |
} | |
static uint32_t s[4]; | |
uint32_t next(void) { | |
const uint32_t result = rotl(s[0] + s[3], 7) + s[0]; | |
const uint32_t t = s[1] << 9; | |
s[2] ^= s[0]; | |
s[3] ^= s[1]; | |
s[1] ^= s[2]; | |
s[0] ^= s[3]; | |
s[2] ^= t; | |
s[3] = rotl(s[3], 11); | |
return result; | |
} | |
/* This is the jump function for the generator. It is equivalent | |
to 2^64 calls to next(); it can be used to generate 2^64 | |
non-overlapping subsequences for parallel computations. */ | |
void jump(void) { | |
static const uint32_t JUMP[] = { 0x8764000b, 0xf542d2d3, 0x6fa035c3, 0x77f2db5b }; | |
uint32_t s0 = 0; | |
uint32_t s1 = 0; | |
uint32_t s2 = 0; | |
uint32_t s3 = 0; | |
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) | |
for(int b = 0; b < 32; b++) { | |
if (JUMP[i] & UINT32_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
s2 ^= s[2]; | |
s3 ^= s[3]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
s[2] = s2; | |
s[3] = s3; | |
} | |
/* This is the long-jump function for the generator. It is equivalent to | |
2^96 calls to next(); it can be used to generate 2^32 starting points, | |
from each of which jump() will generate 2^32 non-overlapping | |
subsequences for parallel distributed computations. */ | |
void long_jump(void) { | |
static const uint32_t LONG_JUMP[] = { 0xb523952e, 0x0b6f099f, 0xccf5a0ef, 0x1c580662 }; | |
uint32_t s0 = 0; | |
uint32_t s1 = 0; | |
uint32_t s2 = 0; | |
uint32_t s3 = 0; | |
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++) | |
for(int b = 0; b < 32; b++) { | |
if (LONG_JUMP[i] & UINT32_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
s2 ^= s[2]; | |
s3 ^= s[3]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
s[2] = s2; | |
s[3] = s3; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2018 by David Blackman and Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
/* This is xoshiro128** 1.1, one of our 32-bit all-purpose, rock-solid | |
generators. It has excellent speed, a state size (128 bits) that is | |
large enough for mild parallelism, and it passes all tests we are aware | |
of. | |
Note that version 1.0 had mistakenly s[0] instead of s[1] as state | |
word passed to the scrambler. | |
For generating just single-precision (i.e., 32-bit) floating-point | |
numbers, xoshiro128+ is even faster. | |
The state must be seeded so that it is not everywhere zero. */ | |
static inline uint32_t rotl(const uint32_t x, int k) { | |
return (x << k) | (x >> (32 - k)); | |
} | |
static uint32_t s[4]; | |
uint32_t next(void) { | |
const uint32_t result = rotl(s[1] * 5, 7) * 9; | |
const uint32_t t = s[1] << 9; | |
s[2] ^= s[0]; | |
s[3] ^= s[1]; | |
s[1] ^= s[2]; | |
s[0] ^= s[3]; | |
s[2] ^= t; | |
s[3] = rotl(s[3], 11); | |
return result; | |
} | |
/* This is the jump function for the generator. It is equivalent | |
to 2^64 calls to next(); it can be used to generate 2^64 | |
non-overlapping subsequences for parallel computations. */ | |
void jump(void) { | |
static const uint32_t JUMP[] = { 0x8764000b, 0xf542d2d3, 0x6fa035c3, 0x77f2db5b }; | |
uint32_t s0 = 0; | |
uint32_t s1 = 0; | |
uint32_t s2 = 0; | |
uint32_t s3 = 0; | |
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) | |
for(int b = 0; b < 32; b++) { | |
if (JUMP[i] & UINT32_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
s2 ^= s[2]; | |
s3 ^= s[3]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
s[2] = s2; | |
s[3] = s3; | |
} | |
/* This is the long-jump function for the generator. It is equivalent to | |
2^96 calls to next(); it can be used to generate 2^32 starting points, | |
from each of which jump() will generate 2^32 non-overlapping | |
subsequences for parallel distributed computations. */ | |
void long_jump(void) { | |
static const uint32_t LONG_JUMP[] = { 0xb523952e, 0x0b6f099f, 0xccf5a0ef, 0x1c580662 }; | |
uint32_t s0 = 0; | |
uint32_t s1 = 0; | |
uint32_t s2 = 0; | |
uint32_t s3 = 0; | |
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++) | |
for(int b = 0; b < 32; b++) { | |
if (LONG_JUMP[i] & UINT32_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
s2 ^= s[2]; | |
s3 ^= s[3]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
s[2] = s2; | |
s[3] = s3; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2018 by David Blackman and Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
/* This is xoshiro256+ 1.0, our best and fastest generator for floating-point | |
numbers. We suggest to use its upper bits for floating-point | |
generation, as it is slightly faster than xoshiro256++/xoshiro256**. It | |
passes all tests we are aware of except for the lowest three bits, | |
which might fail linearity tests (and just those), so if low linear | |
complexity is not considered an issue (as it is usually the case) it | |
can be used to generate 64-bit outputs, too. | |
We suggest to use a sign test to extract a random Boolean value, and | |
right shifts to extract subsets of bits. | |
The state must be seeded so that it is not everywhere zero. If you have | |
a 64-bit seed, we suggest to seed a splitmix64 generator and use its | |
output to fill s. */ | |
static inline uint64_t rotl(const uint64_t x, int k) { | |
return (x << k) | (x >> (64 - k)); | |
} | |
static uint64_t s[4]; | |
uint64_t next(void) { | |
const uint64_t result = s[0] + s[3]; | |
const uint64_t t = s[1] << 17; | |
s[2] ^= s[0]; | |
s[3] ^= s[1]; | |
s[1] ^= s[2]; | |
s[0] ^= s[3]; | |
s[2] ^= t; | |
s[3] = rotl(s[3], 45); | |
return result; | |
} | |
/* This is the jump function for the generator. It is equivalent | |
to 2^128 calls to next(); it can be used to generate 2^128 | |
non-overlapping subsequences for parallel computations. */ | |
void jump(void) { | |
static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c }; | |
uint64_t s0 = 0; | |
uint64_t s1 = 0; | |
uint64_t s2 = 0; | |
uint64_t s3 = 0; | |
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) | |
for(int b = 0; b < 64; b++) { | |
if (JUMP[i] & UINT64_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
s2 ^= s[2]; | |
s3 ^= s[3]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
s[2] = s2; | |
s[3] = s3; | |
} | |
/* This is the long-jump function for the generator. It is equivalent to | |
2^192 calls to next(); it can be used to generate 2^64 starting points, | |
from each of which jump() will generate 2^64 non-overlapping | |
subsequences for parallel distributed computations. */ | |
void long_jump(void) { | |
static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 }; | |
uint64_t s0 = 0; | |
uint64_t s1 = 0; | |
uint64_t s2 = 0; | |
uint64_t s3 = 0; | |
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++) | |
for(int b = 0; b < 64; b++) { | |
if (LONG_JUMP[i] & UINT64_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
s2 ^= s[2]; | |
s3 ^= s[3]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
s[2] = s2; | |
s[3] = s3; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2019 by David Blackman and Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
/* This is xoshiro256++ 1.0, one of our all-purpose, rock-solid generators. | |
It has excellent (sub-ns) speed, a state (256 bits) that is large | |
enough for any parallel application, and it passes all tests we are | |
aware of. | |
For generating just floating-point numbers, xoshiro256+ is even faster. | |
The state must be seeded so that it is not everywhere zero. If you have | |
a 64-bit seed, we suggest to seed a splitmix64 generator and use its | |
output to fill s. */ | |
static inline uint64_t rotl(const uint64_t x, int k) { | |
return (x << k) | (x >> (64 - k)); | |
} | |
static uint64_t s[4]; | |
uint64_t next(void) { | |
const uint64_t result = rotl(s[0] + s[3], 23) + s[0]; | |
const uint64_t t = s[1] << 17; | |
s[2] ^= s[0]; | |
s[3] ^= s[1]; | |
s[1] ^= s[2]; | |
s[0] ^= s[3]; | |
s[2] ^= t; | |
s[3] = rotl(s[3], 45); | |
return result; | |
} | |
/* This is the jump function for the generator. It is equivalent | |
to 2^128 calls to next(); it can be used to generate 2^128 | |
non-overlapping subsequences for parallel computations. */ | |
void jump(void) { | |
static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c }; | |
uint64_t s0 = 0; | |
uint64_t s1 = 0; | |
uint64_t s2 = 0; | |
uint64_t s3 = 0; | |
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) | |
for(int b = 0; b < 64; b++) { | |
if (JUMP[i] & UINT64_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
s2 ^= s[2]; | |
s3 ^= s[3]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
s[2] = s2; | |
s[3] = s3; | |
} | |
/* This is the long-jump function for the generator. It is equivalent to | |
2^192 calls to next(); it can be used to generate 2^64 starting points, | |
from each of which jump() will generate 2^64 non-overlapping | |
subsequences for parallel distributed computations. */ | |
void long_jump(void) { | |
static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 }; | |
uint64_t s0 = 0; | |
uint64_t s1 = 0; | |
uint64_t s2 = 0; | |
uint64_t s3 = 0; | |
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++) | |
for(int b = 0; b < 64; b++) { | |
if (LONG_JUMP[i] & UINT64_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
s2 ^= s[2]; | |
s3 ^= s[3]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
s[2] = s2; | |
s[3] = s3; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Written in 2018 by David Blackman and Sebastiano Vigna ([email protected]) | |
To the extent possible under law, the author has dedicated all copyright | |
and related and neighboring rights to this software to the public domain | |
worldwide. This software is distributed without any warranty. | |
See <http://creativecommons.org/publicdomain/zero/1.0/>. */ | |
#include <stdint.h> | |
/* This is xoshiro256** 1.0, one of our all-purpose, rock-solid | |
generators. It has excellent (sub-ns) speed, a state (256 bits) that is | |
large enough for any parallel application, and it passes all tests we | |
are aware of. | |
For generating just floating-point numbers, xoshiro256+ is even faster. | |
The state must be seeded so that it is not everywhere zero. If you have | |
a 64-bit seed, we suggest to seed a splitmix64 generator and use its | |
output to fill s. */ | |
static inline uint64_t rotl(const uint64_t x, int k) { | |
return (x << k) | (x >> (64 - k)); | |
} | |
static uint64_t s[4]; | |
uint64_t next(void) { | |
const uint64_t result = rotl(s[1] * 5, 7) * 9; | |
const uint64_t t = s[1] << 17; | |
s[2] ^= s[0]; | |
s[3] ^= s[1]; | |
s[1] ^= s[2]; | |
s[0] ^= s[3]; | |
s[2] ^= t; | |
s[3] = rotl(s[3], 45); | |
return result; | |
} | |
/* This is the jump function for the generator. It is equivalent | |
to 2^128 calls to next(); it can be used to generate 2^128 | |
non-overlapping subsequences for parallel computations. */ | |
void jump(void) { | |
static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c }; | |
uint64_t s0 = 0; | |
uint64_t s1 = 0; | |
uint64_t s2 = 0; | |
uint64_t s3 = 0; | |
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) | |
for(int b = 0; b < 64; b++) { | |
if (JUMP[i] & UINT64_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
s2 ^= s[2]; | |
s3 ^= s[3]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
s[2] = s2; | |
s[3] = s3; | |
} | |
/* This is the long-jump function for the generator. It is equivalent to | |
2^192 calls to next(); it can be used to generate 2^64 starting points, | |
from each of which jump() will generate 2^64 non-overlapping | |
subsequences for parallel distributed computations. */ | |
void long_jump(void) { | |
static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 }; | |
uint64_t s0 = 0; | |
uint64_t s1 = 0; | |
uint64_t s2 = 0; | |
uint64_t s3 = 0; | |
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++) | |
for(int b = 0; b < 64; b++) { | |
if (LONG_JUMP[i] & UINT64_C(1) << b) { | |
s0 ^= s[0]; | |
s1 ^= s[1]; | |
s2 ^= s[2]; | |
s3 ^= s[3]; | |
} | |
next(); | |
} | |
s[0] = s0; | |
s[1] = s1; | |
s[2] = s2; | |
s[3] = s3; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment