khotilov · November 19, 2017 05:21
diff --git a/avx.R b/avx.R
 library(Rcpp)
 # set the compiler flags in ~/Documents/.R/Makevars or ~/Documents/.R/Makevars
 # by adding the following line to it
 # CXXFLAGS=-O3 -Wall -mtune=native -funroll-loops -mavx -mfma
 sourceCpp("avx_test.cc")

 curve(approximate_sigmoid(x) - 1/(1 + exp(-x)), -12, 12, n = 1000); grid()

 # the exp4096 mostly underestimates the exp
 curve(log(exp4096(x)) - x, -9, 9, n = 1000); grid()
 curve(exp4096(x) - exp(x), -9, 5, n = 1000); grid()

 # verify the monotonicity
 x <- seq(from=-12, to=12, length.out = 8*100)
 min(diff(approximate_sigmoid(x)))
 ## 0
 min(diff(exp4096(x)))
 ## 1.84265e-07


 library(microbenchmark)
 x <- runif(1000000)
 microbenchmark(exp4096(x), exp(x))
 ## Unit: milliseconds
 ##        expr       min        lq      mean    median        uq       max neval cld
 ##  exp4096(x)  5.636702  5.842971  8.801399  6.394068  9.333585 133.17423   100  a 
 ##      exp(x) 50.410999 51.238929 53.008278 51.859162 54.684212  75.79283   100   b
 microbenchmark(approximate_sigmoid(x), 1/(1+exp(x)))
 ## Unit: milliseconds
 ##                    expr       min        lq      mean    median        uq       max neval cld
 ##  approximate_sigmoid(x)  5.719951  5.929213  7.354263  6.180529  9.400727  14.57115   100  a 
 ##          1/(1 + exp(x)) 54.512440 55.593252 59.545110 56.124107 58.769688 187.05631   100   b
diff --git a/avx_test.cc b/avx_test.cc
 // An Rcpp wrapper for easy testing of the AVX sigmoid and exp implementation in
 // https://github.com/dmlc/xgboost/pull/2878

 #include <intrin.h>
 #include <algorithm>
 #include <Rcpp.h>

 using namespace Rcpp;

 struct Float8 {
  __m256 x;
  explicit Float8(const __m256& x) : x(x) {}
  explicit Float8(const float& val) : x(_mm256_broadcast_ss(&val)) {}
  explicit Float8(const float* vec) : x(_mm256_loadu_ps(vec)) {}
  Float8() : x() {}
  Float8& operator+=(const Float8& rhs) {
    x = _mm256_add_ps(x, rhs.x);
    return *this;
  }
  Float8& operator-=(const Float8& rhs) {
    x = _mm256_sub_ps(x, rhs.x);
    return *this;
  }
  Float8& operator*=(const Float8& rhs) {
    x = _mm256_mul_ps(x, rhs.x);
    return *this;
  }
  Float8& operator/=(const Float8& rhs) {
    x = _mm256_div_ps(x, rhs.x);
    return *this;
  }
  void Print() {
    float* f = reinterpret_cast<float*>(&x);
    printf("%f %f %f %f %f %f %f %f\n", f[0], f[1], f[2], f[3], f[4], f[5],
           f[6], f[7]);
  }
 };

 inline Float8 operator+(Float8 lhs, const Float8& rhs) {
  lhs += rhs;
  return lhs;
 }
 inline Float8 operator-(Float8 lhs, const Float8& rhs) {
  lhs -= rhs;
  return lhs;
 }
 inline Float8 operator*(Float8 lhs, const Float8& rhs) {
  lhs *= rhs;
  return lhs;
 }
 inline Float8 operator/(Float8 lhs, const Float8& rhs) {
  lhs /= rhs;
  return lhs;
 }

 // https://codingforspeed.com/using-faster-exponential-approximation/
 inline Float8 Exp4096(Float8 x) {
  Float8 fraction(1.0 / 4096.0);
  Float8 ones(1.0f);
  x.x = _mm256_fmadd_ps(x.x, fraction.x, ones.x);  // x = 1.0 + x / 4096
  //x *= fraction;
  //x += ones;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  return x;
 }

 inline Float8 ApproximateSigmoid(Float8 x) {
  Float8 exp = Exp4096(x * Float8(-1.0f));
  x = Float8(1.0f) + exp;
  return Float8(_mm256_rcp_ps(x.x));
 }

 inline Float8 avxmax(const Float8& a, const Float8& b) {
  return Float8(_mm256_max_ps(a.x, b.x));
 }

 // Store 8 gradient pairs given vectors containing gradient and Hessian
 struct bst_pair {float grad; float hess;};
 inline void StoreGpair(bst_pair* dst, const Float8& grad,
                       const Float8& hess) {
  float* ptr = reinterpret_cast<float*>(dst);
  __m256 gpair_low = _mm256_unpacklo_ps(grad.x, hess.x);
  __m256 gpair_high = _mm256_unpackhi_ps(grad.x, hess.x);
  _mm256_storeu_ps(ptr, _mm256_permute2f128_ps(gpair_low, gpair_high, 0x20));
  _mm256_storeu_ps(ptr + 8, _mm256_permute2f128_ps(gpair_low, gpair_high, 0x31));
 }

 // Store 8 float values
 void StoreVec(float* dst, const Float8& x) {
  float* ptr = reinterpret_cast<float*>(dst);
  _mm256_storeu_ps(ptr, x.x);
 }


 //// R interface

 // [[Rcpp::export]]
 NumericVector approximate_sigmoid(NumericVector x) {
  // have to convert doubles to float
  std::vector<float> in = as<std::vector<float> >(x);
  size_t n = x.size();
  std::vector<float> out(n);
  
  const size_t remainder = n % 8;
  for (size_t i = 0; i < n - remainder; i += 8) {
      Float8 r = ApproximateSigmoid(Float8(&in[i]));
      StoreVec(&out[i], r);
  }
  for (size_t i = n - remainder; i < n; ++i) {
    out[i] = 1.0f / (1.0f + std::exp(-in[i]));
  }
  return wrap(out);
 }

 // [[Rcpp::export]]
 NumericVector exp4096(NumericVector x) {
  // have to convert doubles to float
  std::vector<float> in = as<std::vector<float> >(x);
  size_t n = x.size();
  std::vector<float> out(n);
  
  const size_t remainder = n % 8;
  for (size_t i = 0; i < n - remainder; i += 8) {
      Float8 r = Exp4096(Float8(&in[i]));
      StoreVec(&out[i], r);
  }
  for (size_t i = n - remainder; i < n; ++i) {
    out[i] = std::exp(-in[i]);
  }
  return wrap(out);
 }
	library(Rcpp)
	# set the compiler flags in ~/Documents/.R/Makevars or ~/Documents/.R/Makevars
	# by adding the following line to it
	# CXXFLAGS=-O3 -Wall -mtune=native -funroll-loops -mavx -mfma
	sourceCpp("avx_test.cc")

	curve(approximate_sigmoid(x) - 1/(1 + exp(-x)), -12, 12, n = 1000); grid()

	# the exp4096 mostly underestimates the exp
	curve(log(exp4096(x)) - x, -9, 9, n = 1000); grid()
	curve(exp4096(x) - exp(x), -9, 5, n = 1000); grid()

	# verify the monotonicity
	x <- seq(from=-12, to=12, length.out = 8*100)
	min(diff(approximate_sigmoid(x)))
	## 0
	min(diff(exp4096(x)))
	## 1.84265e-07


	library(microbenchmark)
	x <- runif(1000000)
	microbenchmark(exp4096(x), exp(x))
	## Unit: milliseconds
	## expr min lq mean median uq max neval cld
	## exp4096(x) 5.636702 5.842971 8.801399 6.394068 9.333585 133.17423 100 a
	## exp(x) 50.410999 51.238929 53.008278 51.859162 54.684212 75.79283 100 b
	microbenchmark(approximate_sigmoid(x), 1/(1+exp(x)))
	## Unit: milliseconds
	## expr min lq mean median uq max neval cld
	## approximate_sigmoid(x) 5.719951 5.929213 7.354263 6.180529 9.400727 14.57115 100 a
	## 1/(1 + exp(x)) 54.512440 55.593252 59.545110 56.124107 58.769688 187.05631 100 b
	// An Rcpp wrapper for easy testing of the AVX sigmoid and exp implementation in
	// https://github.com/dmlc/xgboost/pull/2878

	#include <intrin.h>
	#include <algorithm>
	#include <Rcpp.h>

	using namespace Rcpp;

	struct Float8 {
	__m256 x;
	explicit Float8(const __m256& x) : x(x) {}
	explicit Float8(const float& val) : x(_mm256_broadcast_ss(&val)) {}
	explicit Float8(const float* vec) : x(_mm256_loadu_ps(vec)) {}
	Float8() : x() {}
	Float8& operator+=(const Float8& rhs) {
	x = _mm256_add_ps(x, rhs.x);
	return *this;
	}
	Float8& operator-=(const Float8& rhs) {
	x = _mm256_sub_ps(x, rhs.x);
	return *this;
	}
	Float8& operator*=(const Float8& rhs) {
	x = _mm256_mul_ps(x, rhs.x);
	return *this;
	}
	Float8& operator/=(const Float8& rhs) {
	x = _mm256_div_ps(x, rhs.x);
	return *this;
	}
	void Print() {
	float* f = reinterpret_cast<float*>(&x);
	printf("%f %f %f %f %f %f %f %f\n", f[0], f[1], f[2], f[3], f[4], f[5],
	f[6], f[7]);
	}
	};

	inline Float8 operator+(Float8 lhs, const Float8& rhs) {
	lhs += rhs;
	return lhs;
	}
	inline Float8 operator-(Float8 lhs, const Float8& rhs) {
	lhs -= rhs;
	return lhs;
	}
	inline Float8 operator*(Float8 lhs, const Float8& rhs) {
	lhs *= rhs;
	return lhs;
	}
	inline Float8 operator/(Float8 lhs, const Float8& rhs) {
	lhs /= rhs;
	return lhs;
	}

	// https://codingforspeed.com/using-faster-exponential-approximation/
	inline Float8 Exp4096(Float8 x) {
	Float8 fraction(1.0 / 4096.0);
	Float8 ones(1.0f);
	x.x = _mm256_fmadd_ps(x.x, fraction.x, ones.x); // x = 1.0 + x / 4096
	//x *= fraction;
	//x += ones;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	return x;
	}

	inline Float8 ApproximateSigmoid(Float8 x) {
	Float8 exp = Exp4096(x * Float8(-1.0f));
	x = Float8(1.0f) + exp;
	return Float8(_mm256_rcp_ps(x.x));
	}

	inline Float8 avxmax(const Float8& a, const Float8& b) {
	return Float8(_mm256_max_ps(a.x, b.x));
	}

	// Store 8 gradient pairs given vectors containing gradient and Hessian
	struct bst_pair {float grad; float hess;};
	inline void StoreGpair(bst_pair* dst, const Float8& grad,
	const Float8& hess) {
	float* ptr = reinterpret_cast<float*>(dst);
	__m256 gpair_low = _mm256_unpacklo_ps(grad.x, hess.x);
	__m256 gpair_high = _mm256_unpackhi_ps(grad.x, hess.x);
	_mm256_storeu_ps(ptr, _mm256_permute2f128_ps(gpair_low, gpair_high, 0x20));
	_mm256_storeu_ps(ptr + 8, _mm256_permute2f128_ps(gpair_low, gpair_high, 0x31));
	}

	// Store 8 float values
	void StoreVec(float* dst, const Float8& x) {
	float* ptr = reinterpret_cast<float*>(dst);
	_mm256_storeu_ps(ptr, x.x);
	}


	//// R interface

	// [[Rcpp::export]]
	NumericVector approximate_sigmoid(NumericVector x) {
	// have to convert doubles to float
	std::vector<float> in = as<std::vector<float> >(x);
	size_t n = x.size();
	std::vector<float> out(n);

	const size_t remainder = n % 8;
	for (size_t i = 0; i < n - remainder; i += 8) {
	Float8 r = ApproximateSigmoid(Float8(&in[i]));
	StoreVec(&out[i], r);
	}
	for (size_t i = n - remainder; i < n; ++i) {
	out[i] = 1.0f / (1.0f + std::exp(-in[i]));
	}
	return wrap(out);
	}

	// [[Rcpp::export]]
	NumericVector exp4096(NumericVector x) {
	// have to convert doubles to float
	std::vector<float> in = as<std::vector<float> >(x);
	size_t n = x.size();
	std::vector<float> out(n);

	const size_t remainder = n % 8;
	for (size_t i = 0; i < n - remainder; i += 8) {
	Float8 r = Exp4096(Float8(&in[i]));
	StoreVec(&out[i], r);
	}
	for (size_t i = n - remainder; i < n; ++i) {
	out[i] = std::exp(-in[i]);
	}
	return wrap(out);
	}