-
-
Save RAMitchell/79d84d7ce1b4a4919683c4fa5ef57312 to your computer and use it in GitHub Desktop.
AVX sigmoid test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(Rcpp) | |
# set the compiler flags in ~/Documents/.R/Makevars or ~/Documents/.R/Makevars | |
# by adding the following line to it | |
# CXXFLAGS=-O3 -Wall -mtune=native -funroll-loops -mavx -mfma | |
sourceCpp("avx_test.cc") | |
curve(approximate_sigmoid(x) - 1/(1 + exp(-x)), -12, 12, n = 1000); grid() | |
# the exp4096 mostly underestimates the exp | |
curve(log(exp4096(x)) - x, -9, 9, n = 1000); grid() | |
curve(log(expAgner(x)) - x, -9, 9, n = 1000); grid() | |
curve(exp4096(x) - exp(x), -9, 5, n = 1000); grid() | |
# verify the monotonicity | |
x <- seq(from=-12, to=12, length.out = 8*100) | |
min(diff(approximate_sigmoid(x))) | |
## 0 | |
min(diff(exp4096(x))) | |
## 1.84265e-07 | |
library(microbenchmark) | |
x <- runif(1000000) | |
microbenchmark(exp4096(x), exp(x)) | |
## Unit: milliseconds | |
## expr min lq mean median uq max neval cld | |
## exp4096(x) 5.636702 5.842971 8.801399 6.394068 9.333585 133.17423 100 a | |
## exp(x) 50.410999 51.238929 53.008278 51.859162 54.684212 75.79283 100 b | |
microbenchmark(expAgner(x), exp(x)) | |
microbenchmark(approximate_sigmoid(x), 1/(1+exp(x))) | |
## Unit: milliseconds | |
## expr min lq mean median uq max neval cld | |
## approximate_sigmoid(x) 5.719951 5.929213 7.354263 6.180529 9.400727 14.57115 100 a | |
## 1/(1 + exp(x)) 54.512440 55.593252 59.545110 56.124107 58.769688 187.05631 100 b |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// An Rcpp wrapper for easy testing of the AVX sigmoid and exp implementation in | |
// https://github.com/dmlc/xgboost/pull/2878 | |
#include <Rcpp.h> | |
#include <intrin.h> | |
#include <algorithm> | |
using namespace Rcpp; | |
struct Float8 { | |
__m256 x; | |
explicit Float8(const __m256& x) : x(x) {} | |
explicit Float8(const float& val) : x(_mm256_broadcast_ss(&val)) {} | |
explicit Float8(const float* vec) : x(_mm256_loadu_ps(vec)) {} | |
Float8() : x() {} | |
Float8& operator+=(const Float8& rhs) { | |
x = _mm256_add_ps(x, rhs.x); | |
return *this; | |
} | |
Float8& operator-=(const Float8& rhs) { | |
x = _mm256_sub_ps(x, rhs.x); | |
return *this; | |
} | |
Float8& operator*=(const Float8& rhs) { | |
x = _mm256_mul_ps(x, rhs.x); | |
return *this; | |
} | |
Float8& operator/=(const Float8& rhs) { | |
x = _mm256_div_ps(x, rhs.x); | |
return *this; | |
} | |
void Print() { | |
float* f = reinterpret_cast<float*>(&x); | |
printf("%f %f %f %f %f %f %f %f\n", f[0], f[1], f[2], f[3], f[4], f[5], | |
f[6], f[7]); | |
} | |
}; | |
inline Float8 operator+(Float8 lhs, const Float8& rhs) { | |
lhs += rhs; | |
return lhs; | |
} | |
inline Float8 operator-(Float8 lhs, const Float8& rhs) { | |
lhs -= rhs; | |
return lhs; | |
} | |
inline Float8 operator*(Float8 lhs, const Float8& rhs) { | |
lhs *= rhs; | |
return lhs; | |
} | |
inline Float8 operator/(Float8 lhs, const Float8& rhs) { | |
lhs /= rhs; | |
return lhs; | |
} | |
inline Float8 round(const Float8& x) { | |
return Float8(_mm256_round_ps(x.x, _MM_FROUND_TO_NEAREST_INT)); | |
} | |
inline Float8 float8_max(const Float8& a, const Float8& b) { | |
return Float8(_mm256_max_ps(a.x, b.x)); | |
} | |
inline Float8 float8_min(const Float8& a, const Float8& b) { | |
return Float8(_mm256_min_ps(a.x, b.x)); | |
} | |
// https://codingforspeed.com/using-faster-exponential-approximation/ | |
inline Float8 Exp4096(Float8 x) { | |
Float8 fraction(1.0 / 4096.0); | |
Float8 ones(1.0f); | |
x.x = _mm256_fmadd_ps(x.x, fraction.x, ones.x); // x = 1.0 + x / 4096 | |
// x *= fraction; | |
// x += ones; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
return x; | |
} | |
static inline Float8 pow2n(Float8 const& n) { | |
const float pow2_23 = 8388608.0; // 2^23 | |
const float bias = 127.0; // bias in exponent | |
Float8 a = | |
n + Float8(bias + pow2_23); // put n + bias in least significant bits | |
__m256i b = _mm256_castps_si256(a.x); | |
// Do bit shift in SSE so we don't have to use AVX2 instructions | |
__m128i c1 = _mm256_castsi256_si128(b); | |
b = _mm256_permute2f128_si256(b, b, 1); | |
__m128i c2 = _mm256_castsi256_si128(b); | |
c1 = _mm_slli_epi32(c1, 23); | |
c2 = _mm_slli_epi32(c2, 23); | |
__m256i c = _mm256_insertf128_si256(_mm256_castsi128_si256(c1), (c2), 0x1); | |
return Float8(_mm256_castsi256_ps(c)); | |
} | |
static inline Float8 polynomial_5(Float8 const& x, const float c0, | |
const float c1, const float c2, | |
const float c3, const float c4, | |
const float c5) { | |
// calculates polynomial c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 | |
Float8 x2 = x * x; | |
Float8 x4 = x2 * x2; | |
return (Float8(c2) + Float8(c3) * x) * x2 + | |
((Float8(c4) + Float8(c5) * x) * x4 + (Float8(c0) + Float8(c1) * x)); | |
} | |
inline Float8 ExpAgner(Float8 x) { | |
// Clamp input values | |
float max_x = 87.3f; | |
x = float8_min(x, Float8(max_x)); | |
x = float8_max(x, Float8(-max_x)); | |
// 1/log(2) | |
const float log2e = 1.44269504088896340736f; | |
// Taylor coefficients | |
const float P0expf = 1.f / 2.f; | |
const float P1expf = 1.f / 6.f; | |
const float P2expf = 1.f / 24.f; | |
const float P3expf = 1.f / 120.f; | |
const float P4expf = 1.f / 720.f; | |
const float P5expf = 1.f / 5040.f; | |
const float ln2f_hi = 0.693359375f; | |
const float ln2f_lo = -2.12194440e-4f; | |
Float8 r = round(x * Float8(log2e)); | |
x -= r * Float8(ln2f_hi); | |
x -= r * Float8(ln2f_lo); | |
Float8 x2 = x * x; | |
Float8 z = polynomial_5(x, P0expf, P1expf, P2expf, P3expf, P4expf, P5expf); | |
z *= x2; | |
z += x; | |
// multiply by power of 2 | |
Float8 n2 = pow2n(r); | |
z = (z + Float8(1.0f)) * n2; | |
return z; | |
} | |
inline Float8 ApproximateSigmoid(Float8 x) { | |
Float8 exp = ExpAgner(x * Float8(-1.0f)); | |
x = Float8(1.0f) + exp; | |
return Float8(1.0f) / x; | |
} | |
inline Float8 avxmax(const Float8& a, const Float8& b) { | |
return Float8(_mm256_max_ps(a.x, b.x)); | |
} | |
// Store 8 gradient pairs given vectors containing gradient and Hessian | |
struct bst_pair { | |
float grad; | |
float hess; | |
}; | |
inline void StoreGpair(bst_pair* dst, const Float8& grad, const Float8& hess) { | |
float* ptr = reinterpret_cast<float*>(dst); | |
__m256 gpair_low = _mm256_unpacklo_ps(grad.x, hess.x); | |
__m256 gpair_high = _mm256_unpackhi_ps(grad.x, hess.x); | |
_mm256_storeu_ps(ptr, _mm256_permute2f128_ps(gpair_low, gpair_high, 0x20)); | |
_mm256_storeu_ps(ptr + 8, | |
_mm256_permute2f128_ps(gpair_low, gpair_high, 0x31)); | |
} | |
// Store 8 float values | |
void StoreVec(float* dst, const Float8& x) { | |
float* ptr = reinterpret_cast<float*>(dst); | |
_mm256_storeu_ps(ptr, x.x); | |
} | |
//// R interface | |
// [[Rcpp::export]] | |
NumericVector approximate_sigmoid(NumericVector x) { | |
// have to convert doubles to float | |
std::vector<float> in = as<std::vector<float> >(x); | |
size_t n = x.size(); | |
std::vector<float> out(n); | |
const size_t remainder = n % 8; | |
for (size_t i = 0; i < n - remainder; i += 8) { | |
Float8 r = ApproximateSigmoid(Float8(&in[i])); | |
StoreVec(&out[i], r); | |
} | |
for (size_t i = n - remainder; i < n; ++i) { | |
out[i] = 1.0f / (1.0f + std::exp(-in[i])); | |
} | |
return wrap(out); | |
} | |
// [[Rcpp::export]] | |
NumericVector exp4096(NumericVector x) { | |
// have to convert doubles to float | |
std::vector<float> in = as<std::vector<float> >(x); | |
size_t n = x.size(); | |
std::vector<float> out(n); | |
const size_t remainder = n % 8; | |
for (size_t i = 0; i < n - remainder; i += 8) { | |
Float8 r = Exp4096(Float8(&in[i])); | |
StoreVec(&out[i], r); | |
} | |
for (size_t i = n - remainder; i < n; ++i) { | |
out[i] = std::exp(-in[i]); | |
} | |
return wrap(out); | |
} | |
// [[Rcpp::export]] | |
NumericVector expAgner(NumericVector x) { | |
// have to convert doubles to float | |
std::vector<float> in = as<std::vector<float> >(x); | |
size_t n = x.size(); | |
std::vector<float> out(n); | |
const size_t remainder = n % 8; | |
for (size_t i = 0; i < n - remainder; i += 8) { | |
Float8 r = ExpAgner(Float8(&in[i])); | |
StoreVec(&out[i], r); | |
} | |
for (size_t i = n - remainder; i < n; ++i) { | |
out[i] = std::exp(-in[i]); | |
} | |
return wrap(out); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment