Created
August 17, 2015 15:24
-
-
Save mikusp/f6f9bee70dc53dcc7002 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* ----------------------------------------------------------------------------- | |
* | |
* Module : Function | |
* Copyright : [2008..2014] Manuel M T Chakravarty, Gabriele Keller | |
* [2009..2014] Trevor L. McDonell | |
* License : BSD3 | |
* | |
* Maintainer : Trevor L. McDonell <[email protected]> | |
* Stability : experimental | |
* | |
* ---------------------------------------------------------------------------*/ | |
#ifndef __ACCELERATE_CUDA_FUNCTION_H__ | |
#define __ACCELERATE_CUDA_FUNCTION_H__ | |
#ifndef __CUDACC_RTC__ | |
#include <cuda_runtime.h> | |
#endif | |
#include "accelerate_cuda_type.h" | |
#ifdef __cplusplus | |
/* ----------------------------------------------------------------------------- | |
* Device functions required to support generated code | |
* -------------------------------------------------------------------------- */ | |
/* | |
* Type coercion | |
*/ | |
template <typename T> | |
static __inline__ __device__ Word32 reinterpret32(const T x) | |
{ | |
union { T a; Word32 b; } u; | |
u.a = x; | |
return u.b; | |
} | |
template <> | |
__inline__ __device__ Word32 reinterpret32(const Word32 x) | |
{ | |
return x; | |
} | |
template <> | |
__inline__ __device__ Word32 reinterpret32(const float x) | |
{ | |
return __float_as_int(x); | |
} | |
template <typename T> | |
static __inline__ __device__ Word64 reinterpret64(const T x) | |
{ | |
union { T a; Word64 b; } u; | |
u.a = x; | |
return u.b; | |
} | |
template <> | |
__inline__ __device__ Word64 reinterpret64(const Word64 x) | |
{ | |
return x; | |
} | |
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130 | |
template <> | |
__inline__ __device__ Word64 reinterpret64(const double x) | |
{ | |
return __double_as_longlong(x); | |
} | |
#endif | |
/* | |
* Atomic compare-and-swap, with type coercion | |
*/ | |
template <typename T> | |
static __inline__ __device__ T atomicCAS32(T* address, T compare, T val) | |
{ | |
union { T a; Word32 b; } u; | |
u.b = atomicCAS((Word32*) address, reinterpret32<T>(compare), reinterpret32<T>(val)); | |
return u.a; | |
} | |
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 110 | |
template <> | |
__inline__ __device__ Int32 atomicCAS32(Int32* address, Int32 compare, Int32 val) | |
{ | |
return atomicCAS(address, compare, val); | |
} | |
template <> | |
__inline__ __device__ Word32 atomicCAS32(Word32* address, Word32 compare, Word32 val) | |
{ | |
return atomicCAS(address, compare, val); | |
} | |
#endif | |
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 120 | |
template <typename T> | |
static __inline__ __device__ T atomicCAS64(T* address, T compare, T val) | |
{ | |
union { T a; Word64 b; } u; | |
u.b = atomicCAS((Word64*) address, reinterpret64<T>(compare), reinterpret64<T>(val)); | |
return u.a; | |
} | |
template <> | |
__inline__ __device__ Word64 atomicCAS64(Word64* address, Word64 compare, Word64 val) | |
{ | |
return atomicCAS(address, compare, val); | |
} | |
#endif | |
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 | |
/* | |
* Warp shuffle functions | |
*/ | |
template <typename T> | |
static __inline__ __device__ T shfl_up32(T var, unsigned int delta, int width=warpSize) | |
{ | |
union { T a; Int32 b; } u, v; | |
v.a = var; | |
u.b = __shfl_up(v.b, delta, warpSize); | |
return u.a; | |
} | |
template <> | |
__inline__ __device__ int shfl_up32(int var, unsigned int delta, int width) | |
{ | |
return __shfl_up(var, delta, width); | |
} | |
template <> | |
__inline__ __device__ float shfl_up32(float var, unsigned int delta, int width) | |
{ | |
return __shfl_up(var, delta, width); | |
} | |
template <typename T> | |
static __inline__ __device__ T shfl_up64(T var, unsigned int delta, int width=warpSize) | |
{ | |
union { T a; struct { Int32 lo; Int32 hi; }; } u, v; | |
v.a = var; | |
u.lo = __shfl_up(v.lo, delta, warpSize); | |
u.hi = __shfl_up(v.hi, delta, warpSize); | |
return u.a; | |
} | |
template <typename T> | |
static __inline__ __device__ T shfl_xor32(T var, int laneMask, int width=warpSize) | |
{ | |
union { T a; Int32 b; } u, v; | |
v.a = var; | |
u.b = __shfl_xor(v.b, laneMask, warpSize); | |
return u.a; | |
} | |
template <> | |
__inline__ __device__ int shfl_xor32(int var, int laneMask, int width) | |
{ | |
return __shfl_xor(var, laneMask, width); | |
} | |
template <> | |
__inline__ __device__ float shfl_xor32(float var, int laneMask, int width) | |
{ | |
return __shfl_xor(var, laneMask, width); | |
} | |
template <typename T> | |
static __inline__ __device__ T shfl_xor64(T var, int laneMask, int width=warpSize) | |
{ | |
union { T a; struct { Int32 lo; Int32 hi; }; } u, v; | |
v.a = var; | |
u.lo = __shfl_xor(v.lo, laneMask, warpSize); | |
u.hi = __shfl_xor(v.hi, laneMask, warpSize); | |
return u.a; | |
} | |
#endif | |
#if 0 | |
/* ----------------------------------------------------------------------------- | |
* Additional helper functions | |
* -------------------------------------------------------------------------- */ | |
/* | |
* Determine if the input is a power of two | |
*/ | |
template <typename T> | |
static __inline__ __host__ __device__ T isPow2(const T x) | |
{ | |
return ((x&(x-1)) == 0); | |
} | |
/* | |
* Compute the next highest power of two | |
*/ | |
template <typename T> | |
static __inline__ __host__ T ceilPow2(const T x) | |
{ | |
#if 0 | |
--x; | |
x |= x >> 1; | |
x |= x >> 2; | |
x |= x >> 4; | |
x |= x >> 8; | |
x |= x >> 16; | |
return ++x; | |
#endif | |
return (isPow2(x)) ? x : 1u << (int) ceil(log2((double)x)); | |
} | |
/* | |
* Compute the next lowest power of two | |
*/ | |
template <typename T> | |
static __inline__ __host__ T floorPow2(const T x) | |
{ | |
#if 0 | |
float nf = (float) n; | |
return 1 << (((*(int*)&nf) >> 23) - 127); | |
#endif | |
int exp; | |
frexp(x, &exp); | |
return 1 << (exp - 1); | |
} | |
/* | |
* computes next highest multiple of f from x | |
*/ | |
template <typename T> | |
static __inline__ __host__ T multiple(const T x, const T f) | |
{ | |
return ((x + (f-1)) / f); | |
} | |
/* | |
* MS Excel-style CEIL() function. Rounds x up to nearest multiple of f | |
*/ | |
template <typename T> | |
static __inline__ __host__ T ceiling(const T x, const T f) | |
{ | |
return multiple(x, f) * f; | |
} | |
#endif | |
#endif // __cplusplus | |
#endif // __ACCELERATE_CUDA_FUNCTION_H__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment