Skip to content

Instantly share code, notes, and snippets.

@grafi-tt
Created April 30, 2019 08:11
Show Gist options
  • Save grafi-tt/478d41538e1da13352382e27fe992d78 to your computer and use it in GitHub Desktop.
Save grafi-tt/478d41538e1da13352382e27fe992d78 to your computer and use it in GitHub Desktop.
// Applies one mixing round to the state pair (*u, *v), updating both in place.
//
// The round alternates add / xor-with-rotated-word / multiply steps between
// the two words. The additive constant injected into *u is rotated by this
// thread's position within its 4-thread group (threadIdx.x & 3), so each of
// the four cooperating threads perturbs its state differently.
//
// u, v: pointers to the two 32-bit state words; both are read and written.
//
// NOTE(review): multfly_device_rotl_ is defined outside this view; it is
// presumably a 32-bit rotate-left -- confirm against its definition.
__device__ static inline void multfly_device_gen_round_(uint32_t *u, uint32_t *v) {
  const uint32_t kMulU = UINT32_C(2718281829);
  const uint32_t kMulV = UINT32_C(3141592653);
  const uint32_t kIncr = UINT32_C(0x33123456);
  const int sub_lane = threadIdx.x & 3;

  // Work through references so every step still reads/writes the caller's
  // storage directly, exactly as the pointer dereferences would.
  uint32_t &a = *u;
  uint32_t &b = *v;

  // Inject the per-thread rotated increment, then fold u into v.
  a = a + multfly_device_rotl_(kIncr, sub_lane);
  b = b + a;
  b = b ^ multfly_device_rotl_(a, 8);
  b = b * kMulV;

  // Fold the updated v back into u.
  a = a ^ multfly_device_rotl_(b, 9);
  a = a + b;
  a = a * kMulU;

  // Final fold of u into v.
  b = b ^ multfly_device_rotl_(a, 16);
  b = b + a;
}
// Produces one 32-bit output word per thread from a key and a 64-bit counter,
// with four adjacent threads cooperating on each output block.
//
// Each thread takes the role `lane = threadIdx.x & 3` within its group:
//   - u is seeded from key->v_[lane]     and the low  32 bits of ctr (+ lane),
//   - v is seeded from key->v_[lane + 4] and the high 32 bits of ctr (+ lane).
// Three mixing rounds are applied; between rounds the v word is exchanged
// with the partner thread at lane^1 and then lane^2, so every thread's output
// depends on all four threads' state.
//
// key:    key material; entries v_[0..7] are read across the four lanes.
// ctr:    64-bit counter value.
// result: each thread stores only result[lane].
//         NOTE(review): presumably the four threads of a group either share
//         one result buffer or each read back only their own slot -- confirm
//         against callers.
//
// Preconditions:
//   - All four threads of an aligned 4-lane group must call this function
//     together (non-divergently); the shuffles exchange data between them.
//   - threadIdx.x & 3 must coincide with (warp lane & 3), i.e. the block is
//     1-D or blockDim.x is a multiple of 4.
//
// Uses __shfl_xor_sync instead of the legacy mask-less __shfl_xor, which is
// deprecated since CUDA 9 and unavailable when targeting compute capability
// 7.0 and newer. Requires CUDA 9.0+.
__device__ static inline void multfly_gen_impl_(const multfly_key *key, uint64_t ctr, uint32_t result[4]) {
  int lane = threadIdx.x & 3;

  // Position of this thread inside its warp, derived from the flattened
  // thread index so the mask is also correct for multi-dimensional blocks.
  unsigned warp_lane =
      (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z)) & 31u;
  // Participation mask naming exactly the four lanes of this thread's aligned
  // group. XOR offsets 1 and 2 never leave that group, so only these lanes
  // need to be converged at the shuffles (not the whole warp).
  unsigned group_mask = 0xFu << (warp_lane & ~3u);

  uint32_t u = key->v_[lane] ^ ((uint32_t)ctr + lane);
  uint32_t v = key->v_[lane + 4] ^ ((uint32_t)(ctr >> 32) + lane);

  multfly_device_gen_round_(&u, &v);
  v = __shfl_xor_sync(group_mask, v, 1);  // swap v with lane ^ 1
  multfly_device_gen_round_(&u, &v);
  v = __shfl_xor_sync(group_mask, v, 2);  // swap v with lane ^ 2
  multfly_device_gen_round_(&u, &v);

  result[lane] = v;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment