Last active
November 19, 2017 05:21
-
-
Save khotilov/3dcf8ce42631880bbaf8d03d457d0eb2 to your computer and use it in GitHub Desktop.
AVX sigmoid test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(Rcpp) | |
# set the compiler flags in ~/Documents/.R/Makevars or ~/Documents/.R/Makevars | |
# by adding the following line to it | |
# CXXFLAGS=-O3 -Wall -mtune=native -funroll-loops -mavx -mfma | |
sourceCpp("avx_test.cc") | |
curve(approximate_sigmoid(x) - 1/(1 + exp(-x)), -12, 12, n = 1000); grid() | |
# the exp4096 mostly underestimates the exp | |
curve(log(exp4096(x)) - x, -9, 9, n = 1000); grid() | |
curve(exp4096(x) - exp(x), -9, 5, n = 1000); grid() | |
# verify the monotonicity | |
x <- seq(from=-12, to=12, length.out = 8*100) | |
min(diff(approximate_sigmoid(x))) | |
## 0 | |
min(diff(exp4096(x))) | |
## 1.84265e-07 | |
library(microbenchmark) | |
x <- runif(1000000) | |
microbenchmark(exp4096(x), exp(x)) | |
## Unit: milliseconds | |
## expr min lq mean median uq max neval cld | |
## exp4096(x) 5.636702 5.842971 8.801399 6.394068 9.333585 133.17423 100 a | |
## exp(x) 50.410999 51.238929 53.008278 51.859162 54.684212 75.79283 100 b | |
microbenchmark(approximate_sigmoid(x), 1/(1+exp(x))) | |
## Unit: milliseconds | |
## expr min lq mean median uq max neval cld | |
## approximate_sigmoid(x) 5.719951 5.929213 7.354263 6.180529 9.400727 14.57115 100 a | |
## 1/(1 + exp(x)) 54.512440 55.593252 59.545110 56.124107 58.769688 187.05631 100 b |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// An Rcpp wrapper for easy testing of the AVX sigmoid and exp implementation in | |
// https://github.com/dmlc/xgboost/pull/2878 | |
#include <intrin.h> | |
#include <algorithm> | |
#include <Rcpp.h> | |
using namespace Rcpp; | |
struct Float8 { | |
__m256 x; | |
explicit Float8(const __m256& x) : x(x) {} | |
explicit Float8(const float& val) : x(_mm256_broadcast_ss(&val)) {} | |
explicit Float8(const float* vec) : x(_mm256_loadu_ps(vec)) {} | |
Float8() : x() {} | |
Float8& operator+=(const Float8& rhs) { | |
x = _mm256_add_ps(x, rhs.x); | |
return *this; | |
} | |
Float8& operator-=(const Float8& rhs) { | |
x = _mm256_sub_ps(x, rhs.x); | |
return *this; | |
} | |
Float8& operator*=(const Float8& rhs) { | |
x = _mm256_mul_ps(x, rhs.x); | |
return *this; | |
} | |
Float8& operator/=(const Float8& rhs) { | |
x = _mm256_div_ps(x, rhs.x); | |
return *this; | |
} | |
void Print() { | |
float* f = reinterpret_cast<float*>(&x); | |
printf("%f %f %f %f %f %f %f %f\n", f[0], f[1], f[2], f[3], f[4], f[5], | |
f[6], f[7]); | |
} | |
}; | |
inline Float8 operator+(Float8 lhs, const Float8& rhs) { | |
lhs += rhs; | |
return lhs; | |
} | |
inline Float8 operator-(Float8 lhs, const Float8& rhs) { | |
lhs -= rhs; | |
return lhs; | |
} | |
inline Float8 operator*(Float8 lhs, const Float8& rhs) { | |
lhs *= rhs; | |
return lhs; | |
} | |
inline Float8 operator/(Float8 lhs, const Float8& rhs) { | |
lhs /= rhs; | |
return lhs; | |
} | |
// https://codingforspeed.com/using-faster-exponential-approximation/ | |
inline Float8 Exp4096(Float8 x) { | |
Float8 fraction(1.0 / 4096.0); | |
Float8 ones(1.0f); | |
x.x = _mm256_fmadd_ps(x.x, fraction.x, ones.x); // x = 1.0 + x / 4096 | |
//x *= fraction; | |
//x += ones; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
x *= x; | |
return x; | |
} | |
inline Float8 ApproximateSigmoid(Float8 x) { | |
Float8 exp = Exp4096(x * Float8(-1.0f)); | |
x = Float8(1.0f) + exp; | |
return Float8(_mm256_rcp_ps(x.x)); | |
} | |
inline Float8 avxmax(const Float8& a, const Float8& b) { | |
return Float8(_mm256_max_ps(a.x, b.x)); | |
} | |
// Store 8 gradient pairs given vectors containing gradient and Hessian | |
struct bst_pair {float grad; float hess;}; | |
inline void StoreGpair(bst_pair* dst, const Float8& grad, | |
const Float8& hess) { | |
float* ptr = reinterpret_cast<float*>(dst); | |
__m256 gpair_low = _mm256_unpacklo_ps(grad.x, hess.x); | |
__m256 gpair_high = _mm256_unpackhi_ps(grad.x, hess.x); | |
_mm256_storeu_ps(ptr, _mm256_permute2f128_ps(gpair_low, gpair_high, 0x20)); | |
_mm256_storeu_ps(ptr + 8, _mm256_permute2f128_ps(gpair_low, gpair_high, 0x31)); | |
} | |
// Store 8 float values | |
void StoreVec(float* dst, const Float8& x) { | |
float* ptr = reinterpret_cast<float*>(dst); | |
_mm256_storeu_ps(ptr, x.x); | |
} | |
//// R interface | |
// [[Rcpp::export]] | |
NumericVector approximate_sigmoid(NumericVector x) { | |
// have to convert doubles to float | |
std::vector<float> in = as<std::vector<float> >(x); | |
size_t n = x.size(); | |
std::vector<float> out(n); | |
const size_t remainder = n % 8; | |
for (size_t i = 0; i < n - remainder; i += 8) { | |
Float8 r = ApproximateSigmoid(Float8(&in[i])); | |
StoreVec(&out[i], r); | |
} | |
for (size_t i = n - remainder; i < n; ++i) { | |
out[i] = 1.0f / (1.0f + std::exp(-in[i])); | |
} | |
return wrap(out); | |
} | |
// [[Rcpp::export]] | |
NumericVector exp4096(NumericVector x) { | |
// have to convert doubles to float | |
std::vector<float> in = as<std::vector<float> >(x); | |
size_t n = x.size(); | |
std::vector<float> out(n); | |
const size_t remainder = n % 8; | |
for (size_t i = 0; i < n - remainder; i += 8) { | |
Float8 r = Exp4096(Float8(&in[i])); | |
StoreVec(&out[i], r); | |
} | |
for (size_t i = n - remainder; i < n; ++i) { | |
out[i] = std::exp(-in[i]); | |
} | |
return wrap(out); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment