Skip to content

Instantly share code, notes, and snippets.

@guicho271828
Forked from kd9f9/random_generators_speed.cpp
Last active April 19, 2024 00:59
Show Gist options
  • Save guicho271828/2ec0a2553e36568178117668842c9add to your computer and use it in GitHub Desktop.
Save guicho271828/2ec0a2553e36568178117668842c9add to your computer and use it in GitHub Desktop.
*~
random_generators_speed_*
all: run
run: run0
run%: random_generators_speed_%
./$<
random_generators_speed_0: random_generators_speed.cpp random.h Makefile
g++ -march=native -O0 -o $@ $<
random_generators_speed_1: random_generators_speed.cpp random.h Makefile
g++ -march=native -O1 -o $@ $<
random_generators_speed_2: random_generators_speed.cpp random.h Makefile
g++ -march=native -O2 -o $@ $<
random_generators_speed_3: random_generators_speed.cpp random.h Makefile
g++ -march=native -O3 -o $@ $<
clean:
-rm random_generators_speed_*
disassemble: random_generators_speed_0
objdump -d -M att -CSl --no-show-raw-insn $< | awk -v RS= '/operator\(\)\(\)>:/'
/* Written in 2023 by Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
#include <strings.h>
/* This is a Marsaglia multiply-with-carry generator with period
approximately 2^127. It is close in speed to a scrambled linear
generator, as its only 128-bit operations are a multiplication and sum;
it is an excellent generator based on congruential arithmetic.
As all MWC generators, it simulates a multiplicative LCG with
prime modulus m = 0xffebb71d94fcdaf8ffffffffffffffff and
multiplier given by the inverse of 2^64 modulo m. The modulus has a
particular form, which creates some theoretical issues, but at this
size a generator of this kind passes all known statistical tests,
except for very large size birthday-spacings tests in three dimensions
(as predicted from the theory). These failures are unlikely to have
any impact in practice.
For a generator of the same type with stronger theoretical guarantees,
consider a Goresky-Klapper generalized multiply-with-carry generator.
Alternatively, applying a xorshift to the result (i.e., returning
x ^ (x << 32) instead of x) will eliminate the birthday-spacings
failures.
Note that a previous version had a different MWC_A1. Moreover, now
the result is computed using the current state.
*/
#define MWC_A1 0xffebb71d94fcdaf9
/* The state must be initialized so that 0 < c < MWC_A1 - 1.
For simplicity, we suggest to set c = 1 and x to a 64-bit seed. */
uint64_t x, c;
uint64_t inline next() {
const uint64_t result = x; // Or, result = x ^ (x << 32) (see above)
const __uint128_t t = MWC_A1 * (__uint128_t)x + c;
x = t;
c = t >> 64;
return result;
}
/* The following jump functions use a minimal multiprecision library. */
#define MP_SIZE 3
#include "mp.c"
static uint64_t mod[MP_SIZE] = { 0xffffffffffffffff, MWC_A1 - 1 };
/* This is the jump function for the generator. It is equivalent
to 2^64 calls to next(); it can be used to generate 2^64
non-overlapping subsequences for parallel computations.
Equivalent C++ Boost multiprecision code:
cpp_int b = cpp_int(1) << 64;
cpp_int m = MWC_A1 * b - 1;
cpp_int r = cpp_int("0x2f65fed2e8400983a72f9a3547208003");
cpp_int s = ((x + c * b) * r) % m;
x = uint64_t(s);
c = uint64_t(s >> 64);
*/
void jump(void) {
static uint64_t jump[MP_SIZE] = { 0xa72f9a3547208003, 0x2f65fed2e8400983 };
uint64_t state[MP_SIZE] = { x, c };
mul(state, jump, mod);
x = state[0];
c = state[1];
}
/* This is the long-jump function for the generator. It is equivalent to
2^96 calls to next(); it can be used to generate 2^32 starting points,
from each of which jump() will generate 2^32 non-overlapping
subsequences for parallel distributed computations.
Equivalent C++ Boost multiprecision code:
cpp_int b = cpp_int(1) << 64;
cpp_int m = MWC_A1 * b - 1;
cpp_int r = cpp_int("0x394649cfd6769c91e6f7814467f3fcdd");
cpp_int s = ((x + c * b) * r) % m;
x = uint64_t(s);
c = uint64_t(s >> 64);
*/
void long_jump(void) {
static uint64_t long_jump[MP_SIZE] = { 0xe6f7814467f3fcdd, 0x394649cfd6769c91 };
uint64_t state[MP_SIZE] = { x, c };
mul(state, long_jump, mod);
x = state[0];
c = state[1];
}
/* Written in 2023 by Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
#include <strings.h>
/* This is a Marsaglia multiply-with-carry generator with period
approximately 2^191. It is faster than a scrambled linear
generator, as its only 128-bit operations are a multiplication and sum;
it is an excellent generator based on congruential arithmetic.
As all MWC generators, it simulates a multiplicative LCG with prime
modulus m = 0xffa04e67b3c95d85ffffffffffffffffffffffffffffffff
and multiplier given by the inverse of 2^64 modulo m. The modulus has a
particular form, which creates some theoretical issues, but at this
size a generator of this kind passes all known statistical tests. For a
generator of the same type with stronger theoretical guarantees
consider a Goresky-Klapper generalized multiply-with-carry generator.
*/
#define MWC_A2 0xffa04e67b3c95d86
/* The state must be initialized so that 0 < c < MWC_A2 - 1.
For simplicity, we suggest to set c = 1 and x, y to a 128-bit seed. */
uint64_t x, y, c;
uint64_t inline next() {
const uint64_t result = y;
const __uint128_t t = MWC_A2 * (__uint128_t)x + c;
x = y;
y = t;
c = t >> 64;
return result;
}
/* The following jump functions use a minimal multiprecision library. */
#define MP_SIZE 4
#include "mp.c"
static uint64_t mod[MP_SIZE] = { 0xffffffffffffffff, 0xffffffffffffffff, MWC_A2 - 1 };
/* This is the jump function for the generator. It is equivalent
to 2^96 calls to next(); it can be used to generate 2^96
non-overlapping subsequences for parallel computations.
Equivalent C++ Boost multiprecision code:
cpp_int b = cpp_int(1) << 64;
cpp_int b2 = b * b;
cpp_int m = MWC_A2 * b2 - 1;
cpp_int r = cpp_int("0xdc2be36e4bd21a2afc217e3b9edf985d94fb8d87c7c6437");
cpp_int s = ((x + y * b + c * b2) * r) % m;
x = uint64_t(s);
y = uint64_t(s >> 64);
c = uint64_t(s >> 128);
*/
void jump(void) {
static uint64_t jump[MP_SIZE] = { 0xd94fb8d87c7c6437, 0xafc217e3b9edf985, 0xdc2be36e4bd21a2 };
uint64_t state[MP_SIZE] = { x, y, c };
mul(state, jump, mod);
x = state[0];
y = state[1];
c = state[2];
}
/* This is the long-jump function for the generator. It is equivalent to
2^144 calls to next(); it can be used to generate 2^96 starting points,
from each of which jump() will generate 2^96 non-overlapping
subsequences for parallel distributed computations.
Equivalent C++ Boost multiprecision code:
cpp_int b = cpp_int(1) << 64;
cpp_int b2 = b * b;
cpp_int m = MWC_A2 * b2 - 1;
cpp_int r = cpp_int("0x3c6528aaead6bbddec956c3909137b2dd0e7cedd16a0758e");
cpp_int s = ((x + y * b + c * b2) * r) % m;
x = uint64_t(s);
y = uint64_t(s >> 64);
c = uint64_t(s >> 128);
*/
void long_jump(void) {
static uint64_t long_jump[MP_SIZE] = { 0xd0e7cedd16a0758e, 0xec956c3909137b2d, 0x3c6528aaead6bbdd };
uint64_t state[MP_SIZE] = { x, y, c };
mul(state, long_jump, mod);
x = state[0];
y = state[1];
c = state[2];
}
/* Written in 2021 by Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
#include <strings.h>
/* This is a Marsaglia multiply-with-carry generator with period
approximately 2^255. It is faster than a scrambled linear
generator, as its only 128-bit operations are a multiplication and sum;
it is an excellent generator based on congruential arithmetic.
As all MWC generators, it simulates a multiplicative LCG with prime
modulus m = 0xfff62cf2ccc0cdaeffffffffffffffffffffffffffffffffffffffffffffffff
and multiplier given by the inverse of 2^64 modulo m. The modulus has a
particular form, which creates some theoretical issues, but at this
size a generator of this kind passes all known statistical tests. For a
generator of the same type with stronger theoretical guarantees
consider a Goresky-Klapper generalized multiply-with-carry generator.
Note that a previous version had a different MWC_A3. Moreover, now
the result is computed using the current state.
*/
#define MWC_A3 0xfff62cf2ccc0cdaf
/* The state must be initialized so that 0 < c < MWC_A3 - 1.
For simplicity, we suggest to set c = 1 and x, y, z to a 192-bit seed. */
uint64_t x, y, z, c;
uint64_t inline next() {
const uint64_t result = z;
const __uint128_t t = MWC_A3 * (__uint128_t)x + c;
x = y;
y = z;
z = t;
c = t >> 64;
return result;
}
/* The following jump functions use a minimal multiprecision library. */
#define MP_SIZE 5
#include "mp.c"
static uint64_t mod[MP_SIZE] = { 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, MWC_A3 - 1 };
/* This is the jump function for the generator. It is equivalent
to 2^128 calls to next(); it can be used to generate 2^128
non-overlapping subsequences for parallel computations.
Equivalent C++ Boost multiprecision code:
cpp_int b = cpp_int(1) << 64;
cpp_int b3 = b * b * b;
cpp_int m = MWC_A3 * b3 - 1;
cpp_int r = cpp_int("0x4b89aa2cd51c37b9f6f8c3fd02ec98fbfe88c291203b225428c3ff11313847eb");
cpp_int s = ((x + y * b + z * b * b + c * b3) * r) % m;
x = uint64_t(s);
y = uint64_t(s >> 64);
z = uint64_t(s >> 128);
c = uint64_t(s >> 192);
*/
void jump(void) {
static uint64_t jump[MP_SIZE] = { 0x28c3ff11313847eb, 0xfe88c291203b2254, 0xf6f8c3fd02ec98fb, 0x4b89aa2cd51c37b9 };
uint64_t state[MP_SIZE] = { x, y, z, c };
mul(state, jump, mod);
x = state[0];
y = state[1];
z = state[2];
c = state[3];
}
/* This is the long-jump function for the generator. It is equivalent to
2^192 calls to next(); it can be used to generate 2^64 starting points,
from each of which jump() will generate 2^64 non-overlapping
subsequences for parallel distributed computations.
Equivalent C++ Boost multiprecision code:
cpp_int b = cpp_int(1) << 64;
cpp_int b3 = b * b * b;
cpp_int m = MWC_A3 * b3 - 1;
cpp_int r = cpp_int("0xaf5ca22408cdc8306c40ce860e0d702f95382f758ac987764c6e39cf92f77a4");
cpp_int s = ((x + y * b + z * b * b + c * b3) * r) % m;
x = uint64_t(s);
y = uint64_t(s >> 64);
z = uint64_t(s >> 128);
c = uint64_t(s >> 192);
*/
void long_jump(void) {
static uint64_t long_jump[MP_SIZE] = { 0x64c6e39cf92f77a4, 0xf95382f758ac9877, 0x6c40ce860e0d702, 0xaf5ca22408cdc83 };
uint64_t state[MP_SIZE] = { x, y, z, c };
mul(state, long_jump, mod);
x = state[0];
y = state[1];
z = state[2];
c = state[3];
}
/* Copyright (c) 2018 Arvid Gerstmann. */
/* Copyright (c) 2024 Masataro Asai. */
/* This code is licensed under MIT license. */
#ifndef AG_RANDOM_H
#define AG_RANDOM_H
#include <immintrin.h>
class generator
{
public:
virtual ~generator() = default;
using result_type = uint32_t;
static constexpr result_type (min)() { return 0; }
static constexpr result_type (max)() { return UINT32_MAX; }
void seed()
{
std::random_device rd; // non-deterministic source if possible; otherwise a PRNG
seed(rd);
}
void seed(std::random_device &rd)
{
seed(rd());
}
void seed(std::uint32_t s)
{
std::seed_seq seq{s};
seed(seq);
}
void seed(std::seed_seq &seq)
{
std::vector<std::uint32_t> seeds(32); // support up to 1024 bit state
seq.generate(seeds.begin(), seeds.end());
seed(seeds);
}
result_type operator()();
void discard(unsigned long long n)
{
for (unsigned long long i = 0; i < n; ++i)
operator()();
}
protected:
virtual void seed(std::vector<std::uint32_t> &seeds) = 0;
};
class generator64 : public generator
{
public:
virtual ~generator64() = default;
using generator::seed;
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
m_seed = uint64_t(seeds[0]) << 31 | uint64_t(seeds[1]);
}
uint64_t m_seed;
};
class generator64x2 : public generator
{
public:
virtual ~generator64x2() = default;
using generator::seed;
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
for (int i = 0; i < 2; i++){
s[i] = uint64_t(seeds[2*i]) << 31 | uint64_t(seeds[2*i+1]);
}
}
uint64_t s[2];
};
class generator64x4 : public generator
{
public:
virtual ~generator64x4() = default;
using generator::seed;
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
for (int i = 0; i < 4; i++){
s[i] = uint64_t(seeds[2*i]) << 31 | uint64_t(seeds[2*i+1]);
}
}
uint64_t s[4];
};
class generator64x8 : public generator
{
public:
virtual ~generator64x8() = default;
using generator::seed;
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
for (int i = 0; i < 8; i++){
s[i] = uint64_t(seeds[2*i]) << 31 | uint64_t(seeds[2*i+1]);
}
}
uint64_t s[8];
};
class generator64x16 : public generator
{
public:
virtual ~generator64x16() = default;
using generator::seed;
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
for (int i = 0; i < 16; i++){
s[i] = uint64_t(seeds[2*i]) << 31 | uint64_t(seeds[2*i+1]);
}
}
uint64_t s[16];
};
class generator32 : public generator
{
public:
virtual ~generator32() = default;
using generator::seed;
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
m_seed = seeds[0];
}
uint32_t m_seed;
};
class generator32x2 : public generator
{
public:
virtual ~generator32x2() = default;
using generator::seed;
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
for (int i = 0; i < 2; i++){
s[i] = seeds[i];
}
}
uint32_t s[2];
};
class generator32x4 : public generator
{
public:
virtual ~generator32x4() = default;
using generator::seed;
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
for (int i = 0; i < 4; i++){
s[i] = seeds[i];
}
}
uint32_t s[4];
};
class generator32x8 : public generator
{
public:
virtual ~generator32x8() = default;
using generator::seed;
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
for (int i = 0; i < 8; i++){
s[i] = seeds[i];
}
}
uint32_t s[8];
};
class generator32x16 : public generator
{
public:
virtual ~generator32x16() = default;
using generator::seed;
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
for (int i = 0; i < 16; i++){
s[i] = seeds[i];
}
}
uint32_t s[16];
};
class generator32x32 : public generator
{
public:
virtual ~generator32x32() = default;
using generator::seed;
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
for (int i = 0; i < 32; i++){
s[i] = seeds[i];
}
}
uint32_t s[32];
};
class splitmix : public generator64
{
public:
virtual ~splitmix() = default;
using generator::seed;
splitmix() { seed(); }
splitmix(std::uint32_t s) {seed(s);}
result_type operator()()
{
uint64_t z = (m_seed += UINT64_C(0x9E3779B97F4A7C15));
z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9);
z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB);
return result_type((z ^ (z >> 31)) >> 31);
}
};
class trunc_xorshift32star : public generator64
{
public:
virtual ~trunc_xorshift32star() = default;
using generator::seed;
trunc_xorshift32star() { seed(); }
trunc_xorshift32star(std::uint32_t s) {seed(s);}
result_type operator()()
{
uint64_t result = m_seed * 0xd989bcacc137dcd5ull;
m_seed ^= m_seed >> 11;
m_seed ^= m_seed << 31;
m_seed ^= m_seed >> 18;
return uint32_t(result >> 32ull);
}
};
class xorshift32star : public generator64
{
public:
virtual ~xorshift32star() = default;
using generator::seed;
xorshift32star() { seed(); }
xorshift32star(std::uint32_t s) {seed(s);}
result_type operator()()
{
uint64_t x;
uint32_t answer;
x = m_seed;
x = x ^ (x >> 12);
x = x ^ (x << 25);
x = x ^ (x >> 27);
m_seed = x;
answer = ((x * MAGIC) >> 32);
return answer;
}
private:
const uint64_t MAGIC = 0x2545F4914F6CDD1D;
};
class xorshift32 : public generator32
{
public:
virtual ~xorshift32() = default;
using generator::seed;
xorshift32() { seed(); }
xorshift32(std::uint32_t s) {seed(s);}
result_type operator()()
{
uint64_t x = m_seed;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return m_seed = x;
}
};
class xorshift64 : public generator64
{
public:
virtual ~xorshift64() = default;
using generator::seed;
xorshift64() { seed(); }
xorshift64(std::uint32_t s) {seed(s);}
result_type operator()()
{
uint64_t x = m_seed;
x ^= x << 13;
x ^= x >> 7;
x ^= x << 17;
return m_seed = x;
}
};
// compute 4 numbers at once with SSE.
class xorshift32x4 : public generator32x4
{
int counter = -1;
public:
virtual ~xorshift32x4() = default;
using generator::seed;
xorshift32x4() { seed(); }
xorshift32x4(std::uint32_t seed) {generator::seed(seed);}
result_type operator()()
{
counter++;
int cycle = counter % 4;
result_type result = s[cycle];
if (cycle == 3){
// uint64_t x = m_seed;
__m128i x = _mm_loadu_si128((__m128i*) s);
// x ^= x << 13;
x = _mm_xor_epi32(x, _mm_slli_epi32(x, 13));
// x ^= x >> 7;
x = _mm_xor_epi32(x, _mm_srli_epi32(x, 17));
// x ^= x << 17;
x = _mm_xor_epi32(x, _mm_slli_epi32(x, 5));
// return m_seed = x;
_mm_storeu_si128((__m128i*) s, x);
}
return result;
}
};
// compute 4 numbers at once with SSE.
class xorshift64x4 : public generator64x4
{
int counter = -1;
public:
virtual ~xorshift64x4() = default;
using generator::seed;
xorshift64x4() { seed(); }
xorshift64x4(std::uint32_t seed) { generator::seed(seed); }
result_type operator()()
{
counter++;
int cycle = counter % 4;
result_type result = s[cycle];
if (cycle == 3){
// uint64_t x = m_seed;
__m256i x = _mm256_loadu_si256((__m256i*) s);
// x ^= x << 13;
x = _mm256_xor_epi64(x, _mm256_slli_epi64(x, 13));
// x ^= x >> 7;
x = _mm256_xor_epi64(x, _mm256_srli_epi64(x, 7));
// x ^= x << 17;
x = _mm256_xor_epi64(x, _mm256_slli_epi64(x, 17));
// return m_seed = x;
_mm256_storeu_si256((__m256i*) s, x);
}
return result;
}
};
// compute 8 numbers at once with SSE.
class xorshift32x8 : public generator32x8
{
int counter = -1;
public:
virtual ~xorshift32x8() = default;
using generator::seed;
xorshift32x8() { seed(); }
xorshift32x8(std::uint32_t seed) { generator::seed(seed); }
result_type operator()()
{
counter++;
int cycle = counter % 8;
result_type result = s[cycle];
if (cycle == 7){
// uint64_t x = m_seed;
__m256i x = _mm256_loadu_si256((__m256i*) s);
// x ^= x << 13;
x = _mm256_xor_epi32(x, _mm256_slli_epi32(x, 13));
// x ^= x >> 7;
x = _mm256_xor_epi32(x, _mm256_srli_epi32(x, 17));
// x ^= x << 17;
x = _mm256_xor_epi32(x, _mm256_slli_epi32(x, 5));
// return m_seed = x;
_mm256_storeu_si256((__m256i*) s, x);
}
return result;
}
};
// compute 16 numbers at once with AVX512.
class xorshift32x16 : public generator32x16
{
int counter = -1;
public:
virtual ~xorshift32x16() = default;
using generator::seed;
xorshift32x16() { seed(); }
xorshift32x16(std::uint32_t seed) { generator::seed(seed); }
result_type operator()()
{
counter++;
int cycle = counter % 16;
result_type result = s[cycle];
if (cycle == 15){
// uint64_t x = m_seed;
__m512i x = _mm512_loadu_si512((__m512i*) s);
// x ^= x << 13;
x = _mm512_xor_epi32(x, _mm512_slli_epi32(x, 13));
// x ^= x >> 7;
x = _mm512_xor_epi32(x, _mm512_srli_epi32(x, 17));
// x ^= x << 17;
x = _mm512_xor_epi32(x, _mm512_slli_epi32(x, 5));
// return m_seed = x;
_mm512_storeu_si512((__m512i*) s, x);
}
return result;
}
};
// https://www.pcg-random.org/download.html#minimal-c-implementation
class pcg : public generator64x2
{
public:
virtual ~pcg() = default;
using generator::seed;
pcg() { seed();}
pcg(std::uint32_t s) {seed(s);}
result_type operator()()
{
uint64_t oldstate = s[0];
s[0] = oldstate * 6364136223846793005ULL + s[1];
uint32_t xorshifted = uint32_t(((oldstate >> 18u) ^ oldstate) >> 27u);
int rot = oldstate >> 59u;
return (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
}
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
// proper seeding
uint64_t s0 = uint64_t(seeds[0]) << 31 | uint64_t(seeds[1]);
uint64_t s1 = uint64_t(seeds[2]) << 31 | uint64_t(seeds[3]);
s[0] = 0;
s[1] = (s1 << 1) | 1;
operator()();
s[0] += s0;
operator()();
}
};
static uint32_t rotl(const uint32_t x, int k) __attribute__((always_inline));
static inline uint32_t rotl(const uint32_t x, int k) {
return (x << k) | (x >> (32 - k));
}
class xoshiro128plusplus : public generator32x4
{
public:
virtual ~xoshiro128plusplus() = default;
using generator::seed;
xoshiro128plusplus() { generator::seed(1); }
xoshiro128plusplus(std::uint32_t s) {seed(s);}
result_type operator()()
{
const uint32_t result = rotl(s[0] + s[3], 7) + s[0];
const uint32_t t = s[1] << 9;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rotl(s[3], 11);
return result;
}
};
class xoshiro128starstar : public generator32x4
{
public:
virtual ~xoshiro128starstar() = default;
using generator::seed;
xoshiro128starstar() { generator::seed(1); }
xoshiro128starstar(std::uint32_t s) {seed(s);}
result_type operator()()
{
const uint32_t result = rotl(s[1] * 5, 7) * 9;
const uint32_t t = s[1] << 9;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rotl(s[3], 11);
return result;
}
};
// note : dispatching.
static __m128i rotl(const __m128i x, int k) __attribute__((always_inline));
static inline __m128i rotl(const __m128i x, const int k) {
return _mm_or_epi32(_mm_slli_epi32(x, k), _mm_slli_epi32(x, (32 - k)));
}
class xoshiro128plusplusx4 : public generator32x16
{
int counter = -1;
result_type buffer[4];
public:
virtual ~xoshiro128plusplusx4() = default;
using generator::seed;
xoshiro128plusplusx4() { generator::seed(1); }
xoshiro128plusplusx4(std::uint32_t s) {seed(s);}
result_type operator()()
{
counter++;
int cycle = counter % 4;
if (cycle == 0){
__m128i s0 = _mm_loadu_si128((__m128i*) s);
__m128i s1 = _mm_loadu_si128(((__m128i*) s)+1);
__m128i s2 = _mm_loadu_si128(((__m128i*) s)+2);
__m128i s3 = _mm_loadu_si128(((__m128i*) s)+3);
// const uint32_t result = rotl(s[0] + s[3], 7) + s[0];
const __m128i results = _mm_add_epi32(rotl(_mm_add_epi32(s0, s3), 7), s0);
// const uint32_t t = s[1] << 9;
const __m128i t = _mm_slli_epi32(s1, 9);
// s[2] ^= s[0];
s2 = _mm_xor_epi32(s2, s0);
// s[3] ^= s[1];
s3 = _mm_xor_epi32(s3, s1);
// s[1] ^= s[2];
s1 = _mm_xor_epi32(s1, s2);
// s[0] ^= s[3];
s0 = _mm_xor_epi32(s0, s3);
// s[2] ^= t;
s2 = _mm_xor_epi32(s2, t);
// s[3] = rotl(s[3], 11);
s3 = rotl(s3, 11);
_mm_storeu_si128((__m128i*) s, s0);
_mm_storeu_si128(((__m128i*) s)+1, s1);
_mm_storeu_si128(((__m128i*) s)+2, s2);
_mm_storeu_si128(((__m128i*) s)+3, s3);
_mm_storeu_si128((__m128i*) buffer, results);
}
return buffer[cycle];
}
};
class xoshiro128starstarx4 : public generator32x16
{
int counter = -1;
result_type buffer[4];
public:
virtual ~xoshiro128starstarx4() = default;
using generator::seed;
xoshiro128starstarx4() { generator::seed(1); }
xoshiro128starstarx4(std::uint32_t s) {seed(s);}
result_type operator()()
{
counter++;
int cycle = counter % 4;
if (cycle == 0){
__m128i s0 = _mm_loadu_si128((__m128i*) s);
__m128i s1 = _mm_loadu_si128(((__m128i*) s)+1);
__m128i s2 = _mm_loadu_si128(((__m128i*) s)+2);
__m128i s3 = _mm_loadu_si128(((__m128i*) s)+3);
const __m128i c5 = _mm_set1_epi32(5);
const __m128i c9 = _mm_set1_epi32(9);
// const uint32_t result = rotl(s[1] * 5, 7) * 9;
const __m128i results = _mm_mul_epi32(rotl(_mm_mul_epi32(s1, c5), 7), c9);
// const uint32_t t = s[1] << 9;
const __m128i t = _mm_slli_epi32(s1, 9);
// s[2] ^= s[0];
s2 = _mm_xor_epi32(s2, s0);
// s[3] ^= s[1];
s3 = _mm_xor_epi32(s3, s1);
// s[1] ^= s[2];
s1 = _mm_xor_epi32(s1, s2);
// s[0] ^= s[3];
s0 = _mm_xor_epi32(s0, s3);
// s[2] ^= t;
s2 = _mm_xor_epi32(s2, t);
// s[3] = rotl(s[3], 11);
s3 = rotl(s3, 11);
_mm_storeu_si128((__m128i*) s, s0);
_mm_storeu_si128(((__m128i*) s)+1, s1);
_mm_storeu_si128(((__m128i*) s)+2, s2);
_mm_storeu_si128(((__m128i*) s)+3, s3);
_mm_storeu_si128((__m128i*) buffer, results);
}
return buffer[cycle];
}
};
// note : dispatching.
static __m256i rotl(const __m256i x, int k) __attribute__((always_inline));
static inline __m256i rotl(const __m256i x, const int k) {
return _mm256_or_epi32(_mm256_slli_epi32(x, k), _mm256_slli_epi32(x, (32 - k)));
}
class xoshiro128plusplusx8 : public generator32x32
{
int counter = -1;
result_type buffer[4];
public:
virtual ~xoshiro128plusplusx8() = default;
using generator::seed;
xoshiro128plusplusx8() { generator::seed(1); }
xoshiro128plusplusx8(std::uint32_t s) {seed(s);}
result_type operator()()
{
counter++;
int cycle = counter % 8;
if (cycle == 0){
__m256i s0 = _mm256_loadu_si256((__m256i*) s);
__m256i s1 = _mm256_loadu_si256(((__m256i*) s)+1);
__m256i s2 = _mm256_loadu_si256(((__m256i*) s)+2);
__m256i s3 = _mm256_loadu_si256(((__m256i*) s)+3);
// const uint32_t result = rotl(s[0] + s[3], 7) + s[0];
const __m256i results = _mm256_add_epi32(rotl(_mm256_add_epi32(s0, s3), 7), s0);
// const uint32_t t = s[1] << 9;
const __m256i t = _mm256_slli_epi32(s1, 9);
// s[2] ^= s[0];
s2 = _mm256_xor_epi32(s2, s0);
// s[3] ^= s[1];
s3 = _mm256_xor_epi32(s3, s1);
// s[1] ^= s[2];
s1 = _mm256_xor_epi32(s1, s2);
// s[0] ^= s[3];
s0 = _mm256_xor_epi32(s0, s3);
// s[2] ^= t;
s2 = _mm256_xor_epi32(s2, t);
// s[3] = rotl(s[3], 11);
s3 = rotl(s3, 11);
_mm256_storeu_si256((__m256i*) s, s0);
_mm256_storeu_si256(((__m256i*) s)+1, s1);
_mm256_storeu_si256(((__m256i*) s)+2, s2);
_mm256_storeu_si256(((__m256i*) s)+3, s3);
_mm256_storeu_si256((__m256i*) buffer, results);
}
return buffer[cycle];
}
};
class xoshiro128starstarx8 : public generator32x32
{
int counter = -1;
result_type buffer[4];
public:
virtual ~xoshiro128starstarx8() = default;
using generator::seed;
xoshiro128starstarx8() { generator::seed(1); }
xoshiro128starstarx8(std::uint32_t s) {seed(s);}
result_type operator()()
{
counter++;
int cycle = counter % 8;
if (cycle == 0){
__m256i s0 = _mm256_loadu_si256((__m256i*) s);
__m256i s1 = _mm256_loadu_si256(((__m256i*) s)+1);
__m256i s2 = _mm256_loadu_si256(((__m256i*) s)+2);
__m256i s3 = _mm256_loadu_si256(((__m256i*) s)+3);
const __m256i c5 = _mm256_set1_epi32(5);
const __m256i c9 = _mm256_set1_epi32(9);
// const uint32_t result = rotl(s[1] * 5, 7) * 9;
const __m256i results = _mm256_mul_epi32(rotl(_mm256_mul_epi32(s1, c5), 7), c9);
// const uint32_t t = s[1] << 9;
const __m256i t = _mm256_slli_epi32(s1, 9);
// s[2] ^= s[0];
s2 = _mm256_xor_epi32(s2, s0);
// s[3] ^= s[1];
s3 = _mm256_xor_epi32(s3, s1);
// s[1] ^= s[2];
s1 = _mm256_xor_epi32(s1, s2);
// s[0] ^= s[3];
s0 = _mm256_xor_epi32(s0, s3);
// s[2] ^= t;
s2 = _mm256_xor_epi32(s2, t);
// s[3] = rotl(s[3], 11);
s3 = rotl(s3, 11);
_mm256_storeu_si256((__m256i*) s, s0);
_mm256_storeu_si256(((__m256i*) s)+1, s1);
_mm256_storeu_si256(((__m256i*) s)+2, s2);
_mm256_storeu_si256(((__m256i*) s)+3, s3);
_mm256_storeu_si256((__m256i*) buffer, results);
}
return buffer[cycle];
}
};
// from George Marsaglia (1994)
// https://groups.google.com/g/sci.stat.math/c/p7aLW3TsJys/m/QGb1kti6kN0J
#define MWC_A0 1111111464
class MWC64 : public generator32x2
{
public:
virtual ~MWC64() = default;
using generator::seed;
/* The state must be initialized so that 0 < c < MWC_A0 - 1.
For simplicity, we suggest to set c = 1 and x to a 32-bit seed. */
MWC64() { seed(); }
MWC64(std::uint32_t s) {seed(s);}
result_type operator()()
{
const uint32_t result = s[0];
const uint64_t t = MWC_A0 * (uint64_t)s[0] + s[1];
s[0] = t;
s[1] = t >> 32;
return result;
}
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
s[0] = seeds[0];
s[1] = seeds[1] / (MWC_A0 - 1);
}
};
#define MWC_A1 0xffebb71d94fcdaf9
class MWC128 : public generator64x2
{
public:
virtual ~MWC128() = default;
using generator::seed;
/* The state must be initialized so that 0 < c < MWC_A1 - 1.
For simplicity, we suggest to set c = 1 and x to a 64-bit seed. */
MWC128() { seed(); }
MWC128(std::uint32_t s) {seed(s);}
result_type operator()()
{
const uint64_t result = s[0];
const __uint128_t t = MWC_A1 * (__uint128_t)s[0] + s[1];
s[0] = t;
s[1] = t >> 64;
return result;
}
protected:
void seed(std::vector<std::uint32_t> &seeds)
{
s[0] = uint64_t(seeds[0]) << 31 | uint64_t(seeds[1]);
s[1] = uint64_t(seeds[2]) << 31 | uint64_t(seeds[3]);
s[1] = s[1] / (MWC_A1 - 1);
}
};
#endif /* AG_RANDOM_H */
#include <iostream>
#include <random>
#include <chrono>
#include <iomanip>
#include "random.h"
template <typename RandomEngineType> void benchmark_random_generator(const std::string &name) {
RandomEngineType generator(42);
generator.seed(5);
constexpr int nIterations = 300000000;
std::cout << std::setw(20) << name << std::setw(0);
// check sequence
// std::cout << " first 8: " ;
// for (int i = 0; i < 8; ++i) {
// std::cout << std::setw(0) << " 0x"
// << std::hex
// << std::setw(8) << std::setfill('0')
// << generator()
// << std::setw(0) << std::setfill(' ');
//
// }
const auto start = std::chrono::steady_clock::now();
for (int i = 0; i < nIterations; ++i) {
volatile int n = generator();
}
const auto end = std::chrono::steady_clock::now();
std::cout << std::setprecision(3)
<< " | "
<< std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() / static_cast<double>(nIterations)
<< " ns / num\n";
};
int main() {
benchmark_random_generator<trunc_xorshift32star>("trunc_xorshift32star");
benchmark_random_generator<xorshift32star>("xorshift32star");
benchmark_random_generator<xorshift64>("xorshift32");
benchmark_random_generator<xorshift64>("xorshift64");
benchmark_random_generator<xorshift32x4>("xorshift32x4");
benchmark_random_generator<xorshift64x4>("xorshift64x4");
benchmark_random_generator<xorshift32x8>("xorshift32x8");
benchmark_random_generator<xorshift32x16>("xorshift32x16");
benchmark_random_generator<splitmix>("splitmix");
benchmark_random_generator<pcg>("pcg");
benchmark_random_generator<xoshiro128plusplus>("xoshiro128plusplus");
benchmark_random_generator<xoshiro128starstar>("xoshiro128starstar");
benchmark_random_generator<xoshiro128plusplusx4>("xoshiro128plusplusx4");
benchmark_random_generator<xoshiro128starstarx4>("xoshiro128starstarx4");
benchmark_random_generator<xoshiro128plusplusx8>("xoshiro128plusplusx8");
benchmark_random_generator<xoshiro128starstarx8>("xoshiro128starstarx8");
benchmark_random_generator<MWC128>("MWC128");
benchmark_random_generator<MWC64>("MWC64");
benchmark_random_generator<std::minstd_rand>("minstd_rand");
benchmark_random_generator<std::minstd_rand0>("minstd_rand0");
benchmark_random_generator<std::mt19937>("mt19937");
benchmark_random_generator<std::mt19937_64>("mt19937_64");
benchmark_random_generator<std::ranlux24_base>("ranlux24_base");
benchmark_random_generator<std::ranlux48_base>("ranlux48_base");
// benchmark_random_generator<std::ranlux24>("ranlux24"); // very slow
// benchmark_random_generator<std::ranlux48>("ranlux48"); // very slow
benchmark_random_generator<std::knuth_b>("knuth_b");
return EXIT_SUCCESS;
}
/* Written in 2016-2018 by David Blackman and Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
/* This is xoroshiro128+ 1.0, our best and fastest small-state generator
for floating-point numbers, but its state space is large enough only
for mild parallelism. We suggest to use its upper bits for
floating-point generation, as it is slightly faster than
xoroshiro128++/xoroshiro128**. It passes all tests we are aware of
except for the four lower bits, which might fail linearity tests (and
just those), so if low linear complexity is not considered an issue (as
it is usually the case) it can be used to generate 64-bit outputs, too;
moreover, this generator has a very mild Hamming-weight dependency
making our test (http://prng.di.unimi.it/hwd.php) fail after 5 TB of
output; we believe this slight bias cannot affect any application. If
you are concerned, use xoroshiro128++, xoroshiro128** or xoshiro256+.
We suggest to use a sign test to extract a random Boolean value, and
right shifts to extract subsets of bits.
The state must be seeded so that it is not everywhere zero. If you have
a 64-bit seed, we suggest to seed a splitmix64 generator and use its
output to fill s.
NOTE: the parameters (a=24, b=16, b=37) of this version give slightly
better results in our test than the 2016 version (a=55, b=14, c=36).
*/
static inline uint64_t rotl(const uint64_t x, int k) {
return (x << k) | (x >> (64 - k));
}
static uint64_t s[2];
uint64_t next(void) {
const uint64_t s0 = s[0];
uint64_t s1 = s[1];
const uint64_t result = s0 + s1;
s1 ^= s0;
s[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16); // a, b
s[1] = rotl(s1, 37); // c
return result;
}
/* This is the jump function for the generator. It is equivalent
to 2^64 calls to next(); it can be used to generate 2^64
non-overlapping subsequences for parallel computations. */
void jump(void) {
static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc };
uint64_t s0 = 0;
uint64_t s1 = 0;
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
for(int b = 0; b < 64; b++) {
if (JUMP[i] & UINT64_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
}
next();
}
s[0] = s0;
s[1] = s1;
}
/* This is the long-jump function for the generator. It is equivalent to
2^96 calls to next(); it can be used to generate 2^32 starting points,
from each of which jump() will generate 2^32 non-overlapping
subsequences for parallel distributed computations. */
void long_jump(void) {
static const uint64_t LONG_JUMP[] = { 0xd2a98b26625eee7b, 0xdddf9b1090aa7ac1 };
uint64_t s0 = 0;
uint64_t s1 = 0;
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
for(int b = 0; b < 64; b++) {
if (LONG_JUMP[i] & UINT64_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
}
next();
}
s[0] = s0;
s[1] = s1;
}
/* Written in 2019 by David Blackman and Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
/* This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid,
small-state generators. It is extremely (sub-ns) fast and it passes all
tests we are aware of, but its state space is large enough only for
mild parallelism.
For generating just floating-point numbers, xoroshiro128+ is even
faster (but it has a very mild bias, see notes in the comments).
The state must be seeded so that it is not everywhere zero. If you have
a 64-bit seed, we suggest to seed a splitmix64 generator and use its
output to fill s. */
static inline uint64_t rotl(const uint64_t x, int k) {
return (x << k) | (x >> (64 - k));
}
static uint64_t s[2];
uint64_t next(void) {
const uint64_t s0 = s[0];
uint64_t s1 = s[1];
const uint64_t result = rotl(s0 + s1, 17) + s0;
s1 ^= s0;
s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b
s[1] = rotl(s1, 28); // c
return result;
}
/* This is the jump function for the generator. It is equivalent
to 2^64 calls to next(); it can be used to generate 2^64
non-overlapping subsequences for parallel computations. */
void jump(void) {
static const uint64_t JUMP[] = { 0x2bd7a6a6e99c2ddc, 0x0992ccaf6a6fca05 };
uint64_t s0 = 0;
uint64_t s1 = 0;
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
for(int b = 0; b < 64; b++) {
if (JUMP[i] & UINT64_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
}
next();
}
s[0] = s0;
s[1] = s1;
}
/* This is the long-jump function for the generator. It is equivalent to
2^96 calls to next(); it can be used to generate 2^32 starting points,
from each of which jump() will generate 2^32 non-overlapping
subsequences for parallel distributed computations. */
void long_jump(void) {
static const uint64_t LONG_JUMP[] = { 0x360fd5f2cf8d5d99, 0x9c6e6877736c46e3 };
uint64_t s0 = 0;
uint64_t s1 = 0;
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
for(int b = 0; b < 64; b++) {
if (LONG_JUMP[i] & UINT64_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
}
next();
}
s[0] = s0;
s[1] = s1;
}
/* Written in 2018 by David Blackman and Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
/* This is xoroshiro128** 1.0, one of our all-purpose, rock-solid,
small-state generators. It is extremely (sub-ns) fast and it passes all
tests we are aware of, but its state space is large enough only for
mild parallelism.
For generating just floating-point numbers, xoroshiro128+ is even
faster (but it has a very mild bias, see notes in the comments).
The state must be seeded so that it is not everywhere zero. If you have
a 64-bit seed, we suggest to seed a splitmix64 generator and use its
output to fill s. */
static inline uint64_t rotl(const uint64_t x, int k) {
return (x << k) | (x >> (64 - k));
}
static uint64_t s[2];
uint64_t next(void) {
const uint64_t s0 = s[0];
uint64_t s1 = s[1];
const uint64_t result = rotl(s0 * 5, 7) * 9;
s1 ^= s0;
s[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16); // a, b
s[1] = rotl(s1, 37); // c
return result;
}
/* This is the jump function for the generator. It is equivalent
to 2^64 calls to next(); it can be used to generate 2^64
non-overlapping subsequences for parallel computations. */
void jump(void) {
static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc };
uint64_t s0 = 0;
uint64_t s1 = 0;
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
for(int b = 0; b < 64; b++) {
if (JUMP[i] & UINT64_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
}
next();
}
s[0] = s0;
s[1] = s1;
}
/* This is the long-jump function for the generator. It is equivalent to
2^96 calls to next(); it can be used to generate 2^32 starting points,
from each of which jump() will generate 2^32 non-overlapping
subsequences for parallel distributed computations. */
void long_jump(void) {
static const uint64_t LONG_JUMP[] = { 0xd2a98b26625eee7b, 0xdddf9b1090aa7ac1 };
uint64_t s0 = 0;
uint64_t s1 = 0;
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
for(int b = 0; b < 64; b++) {
if (LONG_JUMP[i] & UINT64_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
}
next();
}
s[0] = s0;
s[1] = s1;
}
/* Written in 2016 by David Blackman and Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
/* This is xoroshiro64* 1.0, our best and fastest 32-bit small-state
generator for 32-bit floating-point numbers. We suggest to use its
upper bits for floating-point generation, as it is slightly faster than
xoroshiro64**. It passes all tests we are aware of except for linearity
tests, as the lowest six bits have low linear complexity, so if low
linear complexity is not considered an issue (as it is usually the
case) it can be used to generate 32-bit outputs, too.
We suggest to use a sign test to extract a random Boolean value, and
right shifts to extract subsets of bits.
The state must be seeded so that it is not everywhere zero. */
static inline uint32_t rotl(const uint32_t x, int k) {
return (x << k) | (x >> (32 - k));
}
static uint32_t s[2];
uint32_t next(void) {
const uint32_t s0 = s[0];
uint32_t s1 = s[1];
const uint32_t result = s0 * 0x9E3779BB;
s1 ^= s0;
s[0] = rotl(s0, 26) ^ s1 ^ (s1 << 9); // a, b
s[1] = rotl(s1, 13); // c
return result;
}
/* Written in 2018 by David Blackman and Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
/* This is xoroshiro64** 1.0, our 32-bit all-purpose, rock-solid,
small-state generator. It is extremely fast and it passes all tests we
are aware of, but its state space is not large enough for any parallel
application.
For generating just single-precision (i.e., 32-bit) floating-point
numbers, xoroshiro64* is even faster.
The state must be seeded so that it is not everywhere zero. */
static inline uint32_t rotl(const uint32_t x, int k) {
return (x << k) | (x >> (32 - k));
}
static uint32_t s[2];
uint32_t next(void) {
const uint32_t s0 = s[0];
uint32_t s1 = s[1];
const uint32_t result = rotl(s0 * 0x9E3779BB, 5) * 5;
s1 ^= s0;
s[0] = rotl(s0, 26) ^ s1 ^ (s1 << 9); // a, b
s[1] = rotl(s1, 13); // c
return result;
}
/* Written in 2018 by David Blackman and Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
/* This is xoshiro128+ 1.0, our best and fastest 32-bit generator for 32-bit
floating-point numbers. We suggest to use its upper bits for
floating-point generation, as it is slightly faster than xoshiro128**.
It passes all tests we are aware of except for
linearity tests, as the lowest four bits have low linear complexity, so
if low linear complexity is not considered an issue (as it is usually
the case) it can be used to generate 32-bit outputs, too.
We suggest to use a sign test to extract a random Boolean value, and
right shifts to extract subsets of bits.
The state must be seeded so that it is not everywhere zero. */
static inline uint32_t rotl(const uint32_t x, int k) {
return (x << k) | (x >> (32 - k));
}
static uint32_t s[4];
uint32_t next(void) {
const uint32_t result = s[0] + s[3];
const uint32_t t = s[1] << 9;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rotl(s[3], 11);
return result;
}
/* This is the jump function for the generator. It is equivalent
to 2^64 calls to next(); it can be used to generate 2^64
non-overlapping subsequences for parallel computations. */
void jump(void) {
static const uint32_t JUMP[] = { 0x8764000b, 0xf542d2d3, 0x6fa035c3, 0x77f2db5b };
uint32_t s0 = 0;
uint32_t s1 = 0;
uint32_t s2 = 0;
uint32_t s3 = 0;
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
for(int b = 0; b < 32; b++) {
if (JUMP[i] & UINT32_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
s2 ^= s[2];
s3 ^= s[3];
}
next();
}
s[0] = s0;
s[1] = s1;
s[2] = s2;
s[3] = s3;
}
/* This is the long-jump function for the generator. It is equivalent to
2^96 calls to next(); it can be used to generate 2^32 starting points,
from each of which jump() will generate 2^32 non-overlapping
subsequences for parallel distributed computations. */
void long_jump(void) {
static const uint32_t LONG_JUMP[] = { 0xb523952e, 0x0b6f099f, 0xccf5a0ef, 0x1c580662 };
uint32_t s0 = 0;
uint32_t s1 = 0;
uint32_t s2 = 0;
uint32_t s3 = 0;
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
for(int b = 0; b < 32; b++) {
if (LONG_JUMP[i] & UINT32_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
s2 ^= s[2];
s3 ^= s[3];
}
next();
}
s[0] = s0;
s[1] = s1;
s[2] = s2;
s[3] = s3;
}
/* Written in 2019 by David Blackman and Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
/* This is xoshiro128++ 1.0, one of our 32-bit all-purpose, rock-solid
generators. It has excellent speed, a state size (128 bits) that is
large enough for mild parallelism, and it passes all tests we are aware
of.
For generating just single-precision (i.e., 32-bit) floating-point
numbers, xoshiro128+ is even faster.
The state must be seeded so that it is not everywhere zero. */
static inline uint32_t rotl(const uint32_t x, int k) {
return (x << k) | (x >> (32 - k));
}
static uint32_t s[4];
uint32_t next(void) {
const uint32_t result = rotl(s[0] + s[3], 7) + s[0];
const uint32_t t = s[1] << 9;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rotl(s[3], 11);
return result;
}
/* This is the jump function for the generator. It is equivalent
to 2^64 calls to next(); it can be used to generate 2^64
non-overlapping subsequences for parallel computations. */
void jump(void) {
static const uint32_t JUMP[] = { 0x8764000b, 0xf542d2d3, 0x6fa035c3, 0x77f2db5b };
uint32_t s0 = 0;
uint32_t s1 = 0;
uint32_t s2 = 0;
uint32_t s3 = 0;
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
for(int b = 0; b < 32; b++) {
if (JUMP[i] & UINT32_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
s2 ^= s[2];
s3 ^= s[3];
}
next();
}
s[0] = s0;
s[1] = s1;
s[2] = s2;
s[3] = s3;
}
/* This is the long-jump function for the generator. It is equivalent to
2^96 calls to next(); it can be used to generate 2^32 starting points,
from each of which jump() will generate 2^32 non-overlapping
subsequences for parallel distributed computations. */
void long_jump(void) {
static const uint32_t LONG_JUMP[] = { 0xb523952e, 0x0b6f099f, 0xccf5a0ef, 0x1c580662 };
uint32_t s0 = 0;
uint32_t s1 = 0;
uint32_t s2 = 0;
uint32_t s3 = 0;
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
for(int b = 0; b < 32; b++) {
if (LONG_JUMP[i] & UINT32_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
s2 ^= s[2];
s3 ^= s[3];
}
next();
}
s[0] = s0;
s[1] = s1;
s[2] = s2;
s[3] = s3;
}
/* Written in 2018 by David Blackman and Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
/* This is xoshiro128** 1.1, one of our 32-bit all-purpose, rock-solid
generators. It has excellent speed, a state size (128 bits) that is
large enough for mild parallelism, and it passes all tests we are aware
of.
Note that version 1.0 had mistakenly s[0] instead of s[1] as state
word passed to the scrambler.
For generating just single-precision (i.e., 32-bit) floating-point
numbers, xoshiro128+ is even faster.
The state must be seeded so that it is not everywhere zero. */
static inline uint32_t rotl(const uint32_t x, int k) {
return (x << k) | (x >> (32 - k));
}
static uint32_t s[4];
uint32_t next(void) {
const uint32_t result = rotl(s[1] * 5, 7) * 9;
const uint32_t t = s[1] << 9;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rotl(s[3], 11);
return result;
}
/* This is the jump function for the generator. It is equivalent
to 2^64 calls to next(); it can be used to generate 2^64
non-overlapping subsequences for parallel computations. */
void jump(void) {
static const uint32_t JUMP[] = { 0x8764000b, 0xf542d2d3, 0x6fa035c3, 0x77f2db5b };
uint32_t s0 = 0;
uint32_t s1 = 0;
uint32_t s2 = 0;
uint32_t s3 = 0;
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
for(int b = 0; b < 32; b++) {
if (JUMP[i] & UINT32_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
s2 ^= s[2];
s3 ^= s[3];
}
next();
}
s[0] = s0;
s[1] = s1;
s[2] = s2;
s[3] = s3;
}
/* This is the long-jump function for the generator. It is equivalent to
2^96 calls to next(); it can be used to generate 2^32 starting points,
from each of which jump() will generate 2^32 non-overlapping
subsequences for parallel distributed computations. */
void long_jump(void) {
static const uint32_t LONG_JUMP[] = { 0xb523952e, 0x0b6f099f, 0xccf5a0ef, 0x1c580662 };
uint32_t s0 = 0;
uint32_t s1 = 0;
uint32_t s2 = 0;
uint32_t s3 = 0;
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
for(int b = 0; b < 32; b++) {
if (LONG_JUMP[i] & UINT32_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
s2 ^= s[2];
s3 ^= s[3];
}
next();
}
s[0] = s0;
s[1] = s1;
s[2] = s2;
s[3] = s3;
}
/* Written in 2018 by David Blackman and Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
/* This is xoshiro256+ 1.0, our best and fastest generator for floating-point
numbers. We suggest to use its upper bits for floating-point
generation, as it is slightly faster than xoshiro256++/xoshiro256**. It
passes all tests we are aware of except for the lowest three bits,
which might fail linearity tests (and just those), so if low linear
complexity is not considered an issue (as it is usually the case) it
can be used to generate 64-bit outputs, too.
We suggest to use a sign test to extract a random Boolean value, and
right shifts to extract subsets of bits.
The state must be seeded so that it is not everywhere zero. If you have
a 64-bit seed, we suggest to seed a splitmix64 generator and use its
output to fill s. */
static inline uint64_t rotl(const uint64_t x, int k) {
return (x << k) | (x >> (64 - k));
}
static uint64_t s[4];
uint64_t next(void) {
const uint64_t result = s[0] + s[3];
const uint64_t t = s[1] << 17;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rotl(s[3], 45);
return result;
}
/* This is the jump function for the generator. It is equivalent
to 2^128 calls to next(); it can be used to generate 2^128
non-overlapping subsequences for parallel computations. */
void jump(void) {
static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c };
uint64_t s0 = 0;
uint64_t s1 = 0;
uint64_t s2 = 0;
uint64_t s3 = 0;
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
for(int b = 0; b < 64; b++) {
if (JUMP[i] & UINT64_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
s2 ^= s[2];
s3 ^= s[3];
}
next();
}
s[0] = s0;
s[1] = s1;
s[2] = s2;
s[3] = s3;
}
/* This is the long-jump function for the generator. It is equivalent to
2^192 calls to next(); it can be used to generate 2^64 starting points,
from each of which jump() will generate 2^64 non-overlapping
subsequences for parallel distributed computations. */
void long_jump(void) {
static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 };
uint64_t s0 = 0;
uint64_t s1 = 0;
uint64_t s2 = 0;
uint64_t s3 = 0;
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
for(int b = 0; b < 64; b++) {
if (LONG_JUMP[i] & UINT64_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
s2 ^= s[2];
s3 ^= s[3];
}
next();
}
s[0] = s0;
s[1] = s1;
s[2] = s2;
s[3] = s3;
}
/* Written in 2019 by David Blackman and Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
/* This is xoshiro256++ 1.0, one of our all-purpose, rock-solid generators.
It has excellent (sub-ns) speed, a state (256 bits) that is large
enough for any parallel application, and it passes all tests we are
aware of.
For generating just floating-point numbers, xoshiro256+ is even faster.
The state must be seeded so that it is not everywhere zero. If you have
a 64-bit seed, we suggest to seed a splitmix64 generator and use its
output to fill s. */
static inline uint64_t rotl(const uint64_t x, int k) {
return (x << k) | (x >> (64 - k));
}
static uint64_t s[4];
uint64_t next(void) {
const uint64_t result = rotl(s[0] + s[3], 23) + s[0];
const uint64_t t = s[1] << 17;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rotl(s[3], 45);
return result;
}
/* This is the jump function for the generator. It is equivalent
to 2^128 calls to next(); it can be used to generate 2^128
non-overlapping subsequences for parallel computations. */
void jump(void) {
static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c };
uint64_t s0 = 0;
uint64_t s1 = 0;
uint64_t s2 = 0;
uint64_t s3 = 0;
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
for(int b = 0; b < 64; b++) {
if (JUMP[i] & UINT64_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
s2 ^= s[2];
s3 ^= s[3];
}
next();
}
s[0] = s0;
s[1] = s1;
s[2] = s2;
s[3] = s3;
}
/* This is the long-jump function for the generator. It is equivalent to
2^192 calls to next(); it can be used to generate 2^64 starting points,
from each of which jump() will generate 2^64 non-overlapping
subsequences for parallel distributed computations. */
void long_jump(void) {
static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 };
uint64_t s0 = 0;
uint64_t s1 = 0;
uint64_t s2 = 0;
uint64_t s3 = 0;
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
for(int b = 0; b < 64; b++) {
if (LONG_JUMP[i] & UINT64_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
s2 ^= s[2];
s3 ^= s[3];
}
next();
}
s[0] = s0;
s[1] = s1;
s[2] = s2;
s[3] = s3;
}
/* Written in 2018 by David Blackman and Sebastiano Vigna ([email protected])
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <stdint.h>
/* This is xoshiro256** 1.0, one of our all-purpose, rock-solid
generators. It has excellent (sub-ns) speed, a state (256 bits) that is
large enough for any parallel application, and it passes all tests we
are aware of.
For generating just floating-point numbers, xoshiro256+ is even faster.
The state must be seeded so that it is not everywhere zero. If you have
a 64-bit seed, we suggest to seed a splitmix64 generator and use its
output to fill s. */
static inline uint64_t rotl(const uint64_t x, int k) {
return (x << k) | (x >> (64 - k));
}
static uint64_t s[4];
uint64_t next(void) {
const uint64_t result = rotl(s[1] * 5, 7) * 9;
const uint64_t t = s[1] << 17;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rotl(s[3], 45);
return result;
}
/* This is the jump function for the generator. It is equivalent
to 2^128 calls to next(); it can be used to generate 2^128
non-overlapping subsequences for parallel computations. */
void jump(void) {
static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c };
uint64_t s0 = 0;
uint64_t s1 = 0;
uint64_t s2 = 0;
uint64_t s3 = 0;
for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
for(int b = 0; b < 64; b++) {
if (JUMP[i] & UINT64_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
s2 ^= s[2];
s3 ^= s[3];
}
next();
}
s[0] = s0;
s[1] = s1;
s[2] = s2;
s[3] = s3;
}
/* This is the long-jump function for the generator. It is equivalent to
2^192 calls to next(); it can be used to generate 2^64 starting points,
from each of which jump() will generate 2^64 non-overlapping
subsequences for parallel distributed computations. */
void long_jump(void) {
static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 };
uint64_t s0 = 0;
uint64_t s1 = 0;
uint64_t s2 = 0;
uint64_t s3 = 0;
for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
for(int b = 0; b < 64; b++) {
if (LONG_JUMP[i] & UINT64_C(1) << b) {
s0 ^= s[0];
s1 ^= s[1];
s2 ^= s[2];
s3 ^= s[3];
}
next();
}
s[0] = s0;
s[1] = s1;
s[2] = s2;
s[3] = s3;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment