Skip to content

Instantly share code, notes, and snippets.

@mikusp
Created August 17, 2015 15:24
Show Gist options
  • Save mikusp/f6f9bee70dc53dcc7002 to your computer and use it in GitHub Desktop.
Save mikusp/f6f9bee70dc53dcc7002 to your computer and use it in GitHub Desktop.
/* -----------------------------------------------------------------------------
*
* Module : Function
* Copyright : [2008..2014] Manuel M T Chakravarty, Gabriele Keller
* [2009..2014] Trevor L. McDonell
* License : BSD3
*
* Maintainer : Trevor L. McDonell <[email protected]>
* Stability : experimental
*
* ---------------------------------------------------------------------------*/
#ifndef __ACCELERATE_CUDA_FUNCTION_H__
#define __ACCELERATE_CUDA_FUNCTION_H__
#ifndef __CUDACC_RTC__
#include <cuda_runtime.h>
#endif
#include "accelerate_cuda_type.h"
#ifdef __cplusplus
/* -----------------------------------------------------------------------------
* Device functions required to support generated code
* -------------------------------------------------------------------------- */
/*
* Type coercion
*/
template <typename T>
static __inline__ __device__ Word32 reinterpret32(const T x)
{
union { T a; Word32 b; } u;
u.a = x;
return u.b;
}
template <>
__inline__ __device__ Word32 reinterpret32(const Word32 x)
{
return x;
}
template <>
__inline__ __device__ Word32 reinterpret32(const float x)
{
return __float_as_int(x);
}
template <typename T>
static __inline__ __device__ Word64 reinterpret64(const T x)
{
union { T a; Word64 b; } u;
u.a = x;
return u.b;
}
template <>
__inline__ __device__ Word64 reinterpret64(const Word64 x)
{
return x;
}
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130
template <>
__inline__ __device__ Word64 reinterpret64(const double x)
{
return __double_as_longlong(x);
}
#endif
/*
* Atomic compare-and-swap, with type coercion
*/
template <typename T>
static __inline__ __device__ T atomicCAS32(T* address, T compare, T val)
{
union { T a; Word32 b; } u;
u.b = atomicCAS((Word32*) address, reinterpret32<T>(compare), reinterpret32<T>(val));
return u.a;
}
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 110
template <>
__inline__ __device__ Int32 atomicCAS32(Int32* address, Int32 compare, Int32 val)
{
return atomicCAS(address, compare, val);
}
template <>
__inline__ __device__ Word32 atomicCAS32(Word32* address, Word32 compare, Word32 val)
{
return atomicCAS(address, compare, val);
}
#endif
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 120
template <typename T>
static __inline__ __device__ T atomicCAS64(T* address, T compare, T val)
{
union { T a; Word64 b; } u;
u.b = atomicCAS((Word64*) address, reinterpret64<T>(compare), reinterpret64<T>(val));
return u.a;
}
template <>
__inline__ __device__ Word64 atomicCAS64(Word64* address, Word64 compare, Word64 val)
{
return atomicCAS(address, compare, val);
}
#endif
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
/*
* Warp shuffle functions
*/
template <typename T>
static __inline__ __device__ T shfl_up32(T var, unsigned int delta, int width=warpSize)
{
union { T a; Int32 b; } u, v;
v.a = var;
u.b = __shfl_up(v.b, delta, warpSize);
return u.a;
}
template <>
__inline__ __device__ int shfl_up32(int var, unsigned int delta, int width)
{
return __shfl_up(var, delta, width);
}
template <>
__inline__ __device__ float shfl_up32(float var, unsigned int delta, int width)
{
return __shfl_up(var, delta, width);
}
template <typename T>
static __inline__ __device__ T shfl_up64(T var, unsigned int delta, int width=warpSize)
{
union { T a; struct { Int32 lo; Int32 hi; }; } u, v;
v.a = var;
u.lo = __shfl_up(v.lo, delta, warpSize);
u.hi = __shfl_up(v.hi, delta, warpSize);
return u.a;
}
template <typename T>
static __inline__ __device__ T shfl_xor32(T var, int laneMask, int width=warpSize)
{
union { T a; Int32 b; } u, v;
v.a = var;
u.b = __shfl_xor(v.b, laneMask, warpSize);
return u.a;
}
template <>
__inline__ __device__ int shfl_xor32(int var, int laneMask, int width)
{
return __shfl_xor(var, laneMask, width);
}
template <>
__inline__ __device__ float shfl_xor32(float var, int laneMask, int width)
{
return __shfl_xor(var, laneMask, width);
}
template <typename T>
static __inline__ __device__ T shfl_xor64(T var, int laneMask, int width=warpSize)
{
union { T a; struct { Int32 lo; Int32 hi; }; } u, v;
v.a = var;
u.lo = __shfl_xor(v.lo, laneMask, warpSize);
u.hi = __shfl_xor(v.hi, laneMask, warpSize);
return u.a;
}
#endif
#if 0
/* -----------------------------------------------------------------------------
* Additional helper functions
* -------------------------------------------------------------------------- */
/*
* Determine if the input is a power of two
*/
template <typename T>
static __inline__ __host__ __device__ T isPow2(const T x)
{
return ((x&(x-1)) == 0);
}
/*
* Compute the next highest power of two
*/
template <typename T>
static __inline__ __host__ T ceilPow2(const T x)
{
#if 0
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
#endif
return (isPow2(x)) ? x : 1u << (int) ceil(log2((double)x));
}
/*
* Compute the next lowest power of two
*/
template <typename T>
static __inline__ __host__ T floorPow2(const T x)
{
#if 0
float nf = (float) n;
return 1 << (((*(int*)&nf) >> 23) - 127);
#endif
int exp;
frexp(x, &exp);
return 1 << (exp - 1);
}
/*
* computes next highest multiple of f from x
*/
template <typename T>
static __inline__ __host__ T multiple(const T x, const T f)
{
return ((x + (f-1)) / f);
}
/*
* MS Excel-style CEIL() function. Rounds x up to nearest multiple of f
*/
template <typename T>
static __inline__ __host__ T ceiling(const T x, const T f)
{
return multiple(x, f) * f;
}
#endif
#endif // __cplusplus
#endif // __ACCELERATE_CUDA_FUNCTION_H__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment