Skip to content

Instantly share code, notes, and snippets.

@cloneofsimo
Created January 23, 2022 19:04
Show Gist options
  • Save cloneofsimo/313ca4fc9ad48ac1d650be0107cfbf16 to your computer and use it in GitHub Desktop.
Save cloneofsimo/313ca4fc9ad48ac1d650be0107cfbf16 to your computer and use it in GitHub Desktop.
// Emits the explicit specialization FastGeluGradientCUDAKernel<T>.
//
// Macro parameters:
//   T        - element type of dY, X and dX.
//   FMAFunc  - callable invoked as FMAFunc(a, b, c); presumably a fused
//              multiply-add returning a * b + c (confirm at the
//              instantiation site).
//   TanhFunc - callable invoked as TanhFunc(v); presumably the hyperbolic
//              tangent for T (confirm at the instantiation site).
//
// Kernel arguments:
//   N  - number of elements; threads whose index is >= N do nothing.
//   dY - read at [index]; multiplies the computed factor.
//   X  - read at [index].
//   dX - written at [index].
//
// Each thread handles at most one element. Its index is built from
// blockIdx.x, the constant CAFFE_CUDA_NUM_THREADS and threadIdx.x, so this
// presumably expects a 1-D launch with CAFFE_CUDA_NUM_THREADS threads per
// block (TODO confirm against the launch site).
//
// With c = gelu_utils::kFastCoeff, and assuming FMAFunc(a, b, c) = a * b + c
// and math::utils::Cube<T>(x) = x^3, the value stored is
//   t     = tanh(kAlpha * (x + c * x^3))
//   dX[i] = 0.5 * dY[i] * ((1 + t) + x * (1 - t^2) * (kAlpha + kBeta * x^2))
// which is dY[i] times the derivative of 0.5 * x * (1 + t) with respect to x.
#define DELEGATE_FAST_GELU_GRADIENT_CUDA_KERNEL(T, FMAFunc, TanhFunc)        \
  template <>                                                                \
  __global__ void FastGeluGradientCUDAKernel<T>(                             \
      const int N, const T* dY, const T* X, T* dX) {                         \
    /* (2 / sqrt(pi)) * (1 / sqrt(2)), i.e. sqrt(2 / pi). */                 \
    constexpr T kAlpha = M_2_SQRTPI * M_SQRT1_2;                             \
    /* Coefficient of x^2 in d/dx of kAlpha * (x + c * x^3). */              \
    constexpr T kBeta = kAlpha * gelu_utils::kFastCoeff * T(3);              \
    const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;         \
    /* Tail guard: the grid may cover more threads than elements. */         \
    if (i >= N) {                                                            \
      return;                                                                \
    }                                                                        \
    /* Read the input once; every use below refers to this value. */         \
    const T x = X[i];                                                        \
    const T tanh_u = TanhFunc(                                               \
        kAlpha *                                                             \
        FMAFunc(gelu_utils::kFastCoeff, math::utils::Cube<T>(x), x));        \
    /* x - x * tanh_u^2; auto keeps FMAFunc's own return type. */            \
    const auto x_one_minus_t2 = FMAFunc(-x, tanh_u * tanh_u, x);             \
    /* kBeta * x^2 + kAlpha. */                                              \
    const auto du_dx = FMAFunc(kBeta, x * x, kAlpha);                        \
    dX[i] = FMAFunc(x_one_minus_t2, du_dx, T(1) + tanh_u) *                  \
        dY[i] * static_cast<T>(0.5);                                         \
  }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment