Skip to content

Instantly share code, notes, and snippets.

@khotilov
Last active November 19, 2017 05:21
Show Gist options
  • Save khotilov/3dcf8ce42631880bbaf8d03d457d0eb2 to your computer and use it in GitHub Desktop.
Save khotilov/3dcf8ce42631880bbaf8d03d457d0eb2 to your computer and use it in GitHub Desktop.
AVX sigmoid test
library(Rcpp)
# set the compiler flags in ~/Documents/.R/Makevars or ~/Documents/.R/Makevars
# by adding the following line to it
# CXXFLAGS=-O3 -Wall -mtune=native -funroll-loops -mavx -mfma
sourceCpp("avx_test.cc")
curve(approximate_sigmoid(x) - 1/(1 + exp(-x)), -12, 12, n = 1000); grid()
# the exp4096 mostly underestimates the exp
curve(log(exp4096(x)) - x, -9, 9, n = 1000); grid()
curve(exp4096(x) - exp(x), -9, 5, n = 1000); grid()
# verify the monotonicity
x <- seq(from=-12, to=12, length.out = 8*100)
min(diff(approximate_sigmoid(x)))
## 0
min(diff(exp4096(x)))
## 1.84265e-07
library(microbenchmark)
x <- runif(1000000)
microbenchmark(exp4096(x), exp(x))
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## exp4096(x) 5.636702 5.842971 8.801399 6.394068 9.333585 133.17423 100 a
## exp(x) 50.410999 51.238929 53.008278 51.859162 54.684212 75.79283 100 b
microbenchmark(approximate_sigmoid(x), 1/(1+exp(x)))
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## approximate_sigmoid(x) 5.719951 5.929213 7.354263 6.180529 9.400727 14.57115 100 a
## 1/(1 + exp(x)) 54.512440 55.593252 59.545110 56.124107 58.769688 187.05631 100 b
// An Rcpp wrapper for easy testing of the AVX sigmoid and exp implementation in
// https://github.com/dmlc/xgboost/pull/2878
#include <intrin.h>
#include <algorithm>
#include <Rcpp.h>
using namespace Rcpp;
struct Float8 {
__m256 x;
explicit Float8(const __m256& x) : x(x) {}
explicit Float8(const float& val) : x(_mm256_broadcast_ss(&val)) {}
explicit Float8(const float* vec) : x(_mm256_loadu_ps(vec)) {}
Float8() : x() {}
Float8& operator+=(const Float8& rhs) {
x = _mm256_add_ps(x, rhs.x);
return *this;
}
Float8& operator-=(const Float8& rhs) {
x = _mm256_sub_ps(x, rhs.x);
return *this;
}
Float8& operator*=(const Float8& rhs) {
x = _mm256_mul_ps(x, rhs.x);
return *this;
}
Float8& operator/=(const Float8& rhs) {
x = _mm256_div_ps(x, rhs.x);
return *this;
}
void Print() {
float* f = reinterpret_cast<float*>(&x);
printf("%f %f %f %f %f %f %f %f\n", f[0], f[1], f[2], f[3], f[4], f[5],
f[6], f[7]);
}
};
inline Float8 operator+(Float8 lhs, const Float8& rhs) {
lhs += rhs;
return lhs;
}
inline Float8 operator-(Float8 lhs, const Float8& rhs) {
lhs -= rhs;
return lhs;
}
inline Float8 operator*(Float8 lhs, const Float8& rhs) {
lhs *= rhs;
return lhs;
}
inline Float8 operator/(Float8 lhs, const Float8& rhs) {
lhs /= rhs;
return lhs;
}
// https://codingforspeed.com/using-faster-exponential-approximation/
inline Float8 Exp4096(Float8 x) {
Float8 fraction(1.0 / 4096.0);
Float8 ones(1.0f);
x.x = _mm256_fmadd_ps(x.x, fraction.x, ones.x); // x = 1.0 + x / 4096
//x *= fraction;
//x += ones;
x *= x;
x *= x;
x *= x;
x *= x;
x *= x;
x *= x;
x *= x;
x *= x;
x *= x;
x *= x;
x *= x;
x *= x;
return x;
}
inline Float8 ApproximateSigmoid(Float8 x) {
Float8 exp = Exp4096(x * Float8(-1.0f));
x = Float8(1.0f) + exp;
return Float8(_mm256_rcp_ps(x.x));
}
inline Float8 avxmax(const Float8& a, const Float8& b) {
return Float8(_mm256_max_ps(a.x, b.x));
}
// Store 8 gradient pairs given vectors containing gradient and Hessian
struct bst_pair {float grad; float hess;};
inline void StoreGpair(bst_pair* dst, const Float8& grad,
const Float8& hess) {
float* ptr = reinterpret_cast<float*>(dst);
__m256 gpair_low = _mm256_unpacklo_ps(grad.x, hess.x);
__m256 gpair_high = _mm256_unpackhi_ps(grad.x, hess.x);
_mm256_storeu_ps(ptr, _mm256_permute2f128_ps(gpair_low, gpair_high, 0x20));
_mm256_storeu_ps(ptr + 8, _mm256_permute2f128_ps(gpair_low, gpair_high, 0x31));
}
// Store 8 float values
void StoreVec(float* dst, const Float8& x) {
float* ptr = reinterpret_cast<float*>(dst);
_mm256_storeu_ps(ptr, x.x);
}
//// R interface
// [[Rcpp::export]]
NumericVector approximate_sigmoid(NumericVector x) {
// have to convert doubles to float
std::vector<float> in = as<std::vector<float> >(x);
size_t n = x.size();
std::vector<float> out(n);
const size_t remainder = n % 8;
for (size_t i = 0; i < n - remainder; i += 8) {
Float8 r = ApproximateSigmoid(Float8(&in[i]));
StoreVec(&out[i], r);
}
for (size_t i = n - remainder; i < n; ++i) {
out[i] = 1.0f / (1.0f + std::exp(-in[i]));
}
return wrap(out);
}
// [[Rcpp::export]]
NumericVector exp4096(NumericVector x) {
// have to convert doubles to float
std::vector<float> in = as<std::vector<float> >(x);
size_t n = x.size();
std::vector<float> out(n);
const size_t remainder = n % 8;
for (size_t i = 0; i < n - remainder; i += 8) {
Float8 r = Exp4096(Float8(&in[i]));
StoreVec(&out[i], r);
}
for (size_t i = n - remainder; i < n; ++i) {
out[i] = std::exp(-in[i]);
}
return wrap(out);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment