RAMitchell · November 15, 2018 12:02
diff --git a/avx.R b/avx.R
 library(Rcpp)
 # set the compiler flags in ~/Documents/.R/Makevars or ~/Documents/.R/Makevars
 # by adding the following line to it
 # CXXFLAGS=-O3 -Wall -mtune=native -funroll-loops -mavx -mfma
 sourceCpp("avx_test.cc")

 curve(approximate_sigmoid(x) - 1/(1 + exp(-x)), -12, 12, n = 1000); grid()

 # the exp4096 mostly underestimates the exp
 curve(log(exp4096(x)) - x, -9, 9, n = 1000); grid()
 curve(log(expAgner(x)) - x, -9, 9, n = 1000); grid()
 curve(exp4096(x) - exp(x), -9, 5, n = 1000); grid()

 # verify the monotonicity
 x <- seq(from=-12, to=12, length.out = 8*100)
 min(diff(approximate_sigmoid(x)))
 ## 0
 min(diff(exp4096(x)))
 ## 1.84265e-07


 library(microbenchmark)
 x <- runif(1000000)
 microbenchmark(exp4096(x), exp(x))
 ## Unit: milliseconds
 ##        expr       min        lq      mean    median        uq       max neval cld
 ##  exp4096(x)  5.636702  5.842971  8.801399  6.394068  9.333585 133.17423   100  a 
 ##      exp(x) 50.410999 51.238929 53.008278 51.859162 54.684212  75.79283   100   b
 microbenchmark(expAgner(x), exp(x))
 microbenchmark(approximate_sigmoid(x), 1/(1+exp(x)))
 ## Unit: milliseconds
 ##                    expr       min        lq      mean    median        uq       max neval cld
 ##  approximate_sigmoid(x)  5.719951  5.929213  7.354263  6.180529  9.400727  14.57115   100  a 
 ##          1/(1 + exp(x)) 54.512440 55.593252 59.545110 56.124107 58.769688 187.05631   100   b
diff --git a/avx_test.cc b/avx_test.cc
 // An Rcpp wrapper for easy testing of the AVX sigmoid and exp implementation in
 // https://github.com/dmlc/xgboost/pull/2878

 #include <Rcpp.h>
 #include <intrin.h>
 #include <algorithm>

 using namespace Rcpp;

 struct Float8 {
  __m256 x;
  explicit Float8(const __m256& x) : x(x) {}
  explicit Float8(const float& val) : x(_mm256_broadcast_ss(&val)) {}
  explicit Float8(const float* vec) : x(_mm256_loadu_ps(vec)) {}
  Float8() : x() {}
  Float8& operator+=(const Float8& rhs) {
    x = _mm256_add_ps(x, rhs.x);
    return *this;
  }
  Float8& operator-=(const Float8& rhs) {
    x = _mm256_sub_ps(x, rhs.x);
    return *this;
  }
  Float8& operator*=(const Float8& rhs) {
    x = _mm256_mul_ps(x, rhs.x);
    return *this;
  }
  Float8& operator/=(const Float8& rhs) {
    x = _mm256_div_ps(x, rhs.x);
    return *this;
  }
  void Print() {
    float* f = reinterpret_cast<float*>(&x);
    printf("%f %f %f %f %f %f %f %f\n", f[0], f[1], f[2], f[3], f[4], f[5],
           f[6], f[7]);
  }
 };

 inline Float8 operator+(Float8 lhs, const Float8& rhs) {
  lhs += rhs;
  return lhs;
 }
 inline Float8 operator-(Float8 lhs, const Float8& rhs) {
  lhs -= rhs;
  return lhs;
 }
 inline Float8 operator*(Float8 lhs, const Float8& rhs) {
  lhs *= rhs;
  return lhs;
 }
 inline Float8 operator/(Float8 lhs, const Float8& rhs) {
  lhs /= rhs;
  return lhs;
 }

 inline Float8 round(const Float8& x) {
  return Float8(_mm256_round_ps(x.x, _MM_FROUND_TO_NEAREST_INT));
 }

 inline Float8 float8_max(const Float8& a, const Float8& b) {
  return Float8(_mm256_max_ps(a.x, b.x));
 }
 inline Float8 float8_min(const Float8& a, const Float8& b) {
  return Float8(_mm256_min_ps(a.x, b.x));
 }

 // https://codingforspeed.com/using-faster-exponential-approximation/
 inline Float8 Exp4096(Float8 x) {
  Float8 fraction(1.0 / 4096.0);
  Float8 ones(1.0f);
  x.x = _mm256_fmadd_ps(x.x, fraction.x, ones.x);  // x = 1.0 + x / 4096
  // x *= fraction;
  // x += ones;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  x *= x;
  return x;
 }

 static inline Float8 pow2n(Float8 const& n) {
  const float pow2_23 = 8388608.0;  // 2^23
  const float bias = 127.0;         // bias in exponent
  Float8 a =
      n + Float8(bias + pow2_23);  // put n + bias in least significant bits
  __m256i b = _mm256_castps_si256(a.x);

  // Do bit shift in SSE so we don't have to use AVX2 instructions
  __m128i c1 = _mm256_castsi256_si128(b);
  b = _mm256_permute2f128_si256(b, b, 1);
  __m128i c2 = _mm256_castsi256_si128(b);
  c1 = _mm_slli_epi32(c1, 23);
  c2 = _mm_slli_epi32(c2, 23);

  __m256i c = _mm256_insertf128_si256(_mm256_castsi128_si256(c1), (c2), 0x1);
  return Float8(_mm256_castsi256_ps(c));
 }

 static inline Float8 polynomial_5(Float8 const& x, const float c0,
                                  const float c1, const float c2,
                                  const float c3, const float c4,
                                  const float c5) {
  // calculates polynomial c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
  Float8 x2 = x * x;
  Float8 x4 = x2 * x2;
  return (Float8(c2) + Float8(c3) * x) * x2 +
         ((Float8(c4) + Float8(c5) * x) * x4 + (Float8(c0) + Float8(c1) * x));
 }

 inline Float8 ExpAgner(Float8 x) {
  // Clamp input values
  float max_x = 87.3f;
  x = float8_min(x, Float8(max_x));
  x = float8_max(x, Float8(-max_x));

  // 1/log(2)
  const float log2e = 1.44269504088896340736f;

  // Taylor coefficients
  const float P0expf = 1.f / 2.f;
  const float P1expf = 1.f / 6.f;
  const float P2expf = 1.f / 24.f;
  const float P3expf = 1.f / 120.f;
  const float P4expf = 1.f / 720.f;
  const float P5expf = 1.f / 5040.f;

  const float ln2f_hi = 0.693359375f;
  const float ln2f_lo = -2.12194440e-4f;

  Float8 r = round(x * Float8(log2e));
  x -= r * Float8(ln2f_hi);
  x -= r * Float8(ln2f_lo);

  Float8 x2 = x * x;
  Float8 z = polynomial_5(x, P0expf, P1expf, P2expf, P3expf, P4expf, P5expf);
  z *= x2;
  z += x;

  // multiply by power of 2
  Float8 n2 = pow2n(r);

  z = (z + Float8(1.0f)) * n2;
  return z;
 }

 inline Float8 ApproximateSigmoid(Float8 x) {
  Float8 exp = ExpAgner(x * Float8(-1.0f));
  x = Float8(1.0f) + exp;
  return Float8(1.0f) / x;
 }

 inline Float8 avxmax(const Float8& a, const Float8& b) {
  return Float8(_mm256_max_ps(a.x, b.x));
 }

 // Store 8 gradient pairs given vectors containing gradient and Hessian
 struct bst_pair {
  float grad;
  float hess;
 };
 inline void StoreGpair(bst_pair* dst, const Float8& grad, const Float8& hess) {
  float* ptr = reinterpret_cast<float*>(dst);
  __m256 gpair_low = _mm256_unpacklo_ps(grad.x, hess.x);
  __m256 gpair_high = _mm256_unpackhi_ps(grad.x, hess.x);
  _mm256_storeu_ps(ptr, _mm256_permute2f128_ps(gpair_low, gpair_high, 0x20));
  _mm256_storeu_ps(ptr + 8,
                   _mm256_permute2f128_ps(gpair_low, gpair_high, 0x31));
 }

 // Store 8 float values
 void StoreVec(float* dst, const Float8& x) {
  float* ptr = reinterpret_cast<float*>(dst);
  _mm256_storeu_ps(ptr, x.x);
 }

 //// R interface

 // [[Rcpp::export]]
 NumericVector approximate_sigmoid(NumericVector x) {
  // have to convert doubles to float
  std::vector<float> in = as<std::vector<float> >(x);
  size_t n = x.size();
  std::vector<float> out(n);

  const size_t remainder = n % 8;
  for (size_t i = 0; i < n - remainder; i += 8) {
    Float8 r = ApproximateSigmoid(Float8(&in[i]));
    StoreVec(&out[i], r);
  }
  for (size_t i = n - remainder; i < n; ++i) {
    out[i] = 1.0f / (1.0f + std::exp(-in[i]));
  }
  return wrap(out);
 }

 // [[Rcpp::export]]
 NumericVector exp4096(NumericVector x) {
  // have to convert doubles to float
  std::vector<float> in = as<std::vector<float> >(x);
  size_t n = x.size();
  std::vector<float> out(n);

  const size_t remainder = n % 8;
  for (size_t i = 0; i < n - remainder; i += 8) {
    Float8 r = Exp4096(Float8(&in[i]));
    StoreVec(&out[i], r);
  }
  for (size_t i = n - remainder; i < n; ++i) {
    out[i] = std::exp(-in[i]);
  }
  return wrap(out);
 }

 // [[Rcpp::export]]
 NumericVector expAgner(NumericVector x) {
  // have to convert doubles to float
  std::vector<float> in = as<std::vector<float> >(x);
  size_t n = x.size();
  std::vector<float> out(n);

  const size_t remainder = n % 8;
  for (size_t i = 0; i < n - remainder; i += 8) {
    Float8 r = ExpAgner(Float8(&in[i]));
    StoreVec(&out[i], r);
  }
  for (size_t i = n - remainder; i < n; ++i) {
    out[i] = std::exp(-in[i]);
  }
  return wrap(out);
 }
	library(Rcpp)
	# set the compiler flags in ~/Documents/.R/Makevars or ~/Documents/.R/Makevars
	# by adding the following line to it
	# CXXFLAGS=-O3 -Wall -mtune=native -funroll-loops -mavx -mfma
	sourceCpp("avx_test.cc")

	curve(approximate_sigmoid(x) - 1/(1 + exp(-x)), -12, 12, n = 1000); grid()

	# the exp4096 mostly underestimates the exp
	curve(log(exp4096(x)) - x, -9, 9, n = 1000); grid()
	curve(log(expAgner(x)) - x, -9, 9, n = 1000); grid()
	curve(exp4096(x) - exp(x), -9, 5, n = 1000); grid()

	# verify the monotonicity
	x <- seq(from=-12, to=12, length.out = 8*100)
	min(diff(approximate_sigmoid(x)))
	## 0
	min(diff(exp4096(x)))
	## 1.84265e-07


	library(microbenchmark)
	x <- runif(1000000)
	microbenchmark(exp4096(x), exp(x))
	## Unit: milliseconds
	## expr min lq mean median uq max neval cld
	## exp4096(x) 5.636702 5.842971 8.801399 6.394068 9.333585 133.17423 100 a
	## exp(x) 50.410999 51.238929 53.008278 51.859162 54.684212 75.79283 100 b
	microbenchmark(expAgner(x), exp(x))
	microbenchmark(approximate_sigmoid(x), 1/(1+exp(x)))
	## Unit: milliseconds
	## expr min lq mean median uq max neval cld
	## approximate_sigmoid(x) 5.719951 5.929213 7.354263 6.180529 9.400727 14.57115 100 a
	## 1/(1 + exp(x)) 54.512440 55.593252 59.545110 56.124107 58.769688 187.05631 100 b
	// An Rcpp wrapper for easy testing of the AVX sigmoid and exp implementation in
	// https://github.com/dmlc/xgboost/pull/2878

	#include <Rcpp.h>
	#include <intrin.h>
	#include <algorithm>

	using namespace Rcpp;

	struct Float8 {
	__m256 x;
	explicit Float8(const __m256& x) : x(x) {}
	explicit Float8(const float& val) : x(_mm256_broadcast_ss(&val)) {}
	explicit Float8(const float* vec) : x(_mm256_loadu_ps(vec)) {}
	Float8() : x() {}
	Float8& operator+=(const Float8& rhs) {
	x = _mm256_add_ps(x, rhs.x);
	return *this;
	}
	Float8& operator-=(const Float8& rhs) {
	x = _mm256_sub_ps(x, rhs.x);
	return *this;
	}
	Float8& operator*=(const Float8& rhs) {
	x = _mm256_mul_ps(x, rhs.x);
	return *this;
	}
	Float8& operator/=(const Float8& rhs) {
	x = _mm256_div_ps(x, rhs.x);
	return *this;
	}
	void Print() {
	float* f = reinterpret_cast<float*>(&x);
	printf("%f %f %f %f %f %f %f %f\n", f[0], f[1], f[2], f[3], f[4], f[5],
	f[6], f[7]);
	}
	};

	inline Float8 operator+(Float8 lhs, const Float8& rhs) {
	lhs += rhs;
	return lhs;
	}
	inline Float8 operator-(Float8 lhs, const Float8& rhs) {
	lhs -= rhs;
	return lhs;
	}
	inline Float8 operator*(Float8 lhs, const Float8& rhs) {
	lhs *= rhs;
	return lhs;
	}
	inline Float8 operator/(Float8 lhs, const Float8& rhs) {
	lhs /= rhs;
	return lhs;
	}

	inline Float8 round(const Float8& x) {
	return Float8(_mm256_round_ps(x.x, _MM_FROUND_TO_NEAREST_INT));
	}

	inline Float8 float8_max(const Float8& a, const Float8& b) {
	return Float8(_mm256_max_ps(a.x, b.x));
	}
	inline Float8 float8_min(const Float8& a, const Float8& b) {
	return Float8(_mm256_min_ps(a.x, b.x));
	}

	// https://codingforspeed.com/using-faster-exponential-approximation/
	inline Float8 Exp4096(Float8 x) {
	Float8 fraction(1.0 / 4096.0);
	Float8 ones(1.0f);
	x.x = _mm256_fmadd_ps(x.x, fraction.x, ones.x); // x = 1.0 + x / 4096
	// x *= fraction;
	// x += ones;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	x *= x;
	return x;
	}

	static inline Float8 pow2n(Float8 const& n) {
	const float pow2_23 = 8388608.0; // 2^23
	const float bias = 127.0; // bias in exponent
	Float8 a =
	n + Float8(bias + pow2_23); // put n + bias in least significant bits
	__m256i b = _mm256_castps_si256(a.x);

	// Do bit shift in SSE so we don't have to use AVX2 instructions
	__m128i c1 = _mm256_castsi256_si128(b);
	b = _mm256_permute2f128_si256(b, b, 1);
	__m128i c2 = _mm256_castsi256_si128(b);
	c1 = _mm_slli_epi32(c1, 23);
	c2 = _mm_slli_epi32(c2, 23);

	__m256i c = _mm256_insertf128_si256(_mm256_castsi128_si256(c1), (c2), 0x1);
	return Float8(_mm256_castsi256_ps(c));
	}

	static inline Float8 polynomial_5(Float8 const& x, const float c0,
	const float c1, const float c2,
	const float c3, const float c4,
	const float c5) {
	// calculates polynomial c5x^5 + c4x^4 + c3x^3 + c2x^2 + c1*x + c0
	Float8 x2 = x * x;
	Float8 x4 = x2 * x2;
	return (Float8(c2) + Float8(c3) * x) * x2 +
	((Float8(c4) + Float8(c5) * x) * x4 + (Float8(c0) + Float8(c1) * x));
	}

	inline Float8 ExpAgner(Float8 x) {
	// Clamp input values
	float max_x = 87.3f;
	x = float8_min(x, Float8(max_x));
	x = float8_max(x, Float8(-max_x));

	// 1/log(2)
	const float log2e = 1.44269504088896340736f;

	// Taylor coefficients
	const float P0expf = 1.f / 2.f;
	const float P1expf = 1.f / 6.f;
	const float P2expf = 1.f / 24.f;
	const float P3expf = 1.f / 120.f;
	const float P4expf = 1.f / 720.f;
	const float P5expf = 1.f / 5040.f;

	const float ln2f_hi = 0.693359375f;
	const float ln2f_lo = -2.12194440e-4f;

	Float8 r = round(x * Float8(log2e));
	x -= r * Float8(ln2f_hi);
	x -= r * Float8(ln2f_lo);

	Float8 x2 = x * x;
	Float8 z = polynomial_5(x, P0expf, P1expf, P2expf, P3expf, P4expf, P5expf);
	z *= x2;
	z += x;

	// multiply by power of 2
	Float8 n2 = pow2n(r);

	z = (z + Float8(1.0f)) * n2;
	return z;
	}

	inline Float8 ApproximateSigmoid(Float8 x) {
	Float8 exp = ExpAgner(x * Float8(-1.0f));
	x = Float8(1.0f) + exp;
	return Float8(1.0f) / x;
	}

	inline Float8 avxmax(const Float8& a, const Float8& b) {
	return Float8(_mm256_max_ps(a.x, b.x));
	}

	// Store 8 gradient pairs given vectors containing gradient and Hessian
	struct bst_pair {
	float grad;
	float hess;
	};
	inline void StoreGpair(bst_pair* dst, const Float8& grad, const Float8& hess) {
	float* ptr = reinterpret_cast<float*>(dst);
	__m256 gpair_low = _mm256_unpacklo_ps(grad.x, hess.x);
	__m256 gpair_high = _mm256_unpackhi_ps(grad.x, hess.x);
	_mm256_storeu_ps(ptr, _mm256_permute2f128_ps(gpair_low, gpair_high, 0x20));
	_mm256_storeu_ps(ptr + 8,
	_mm256_permute2f128_ps(gpair_low, gpair_high, 0x31));
	}

	// Store 8 float values
	void StoreVec(float* dst, const Float8& x) {
	float* ptr = reinterpret_cast<float*>(dst);
	_mm256_storeu_ps(ptr, x.x);
	}

	//// R interface

	// [[Rcpp::export]]
	NumericVector approximate_sigmoid(NumericVector x) {
	// have to convert doubles to float
	std::vector<float> in = as<std::vector<float> >(x);
	size_t n = x.size();
	std::vector<float> out(n);

	const size_t remainder = n % 8;
	for (size_t i = 0; i < n - remainder; i += 8) {
	Float8 r = ApproximateSigmoid(Float8(&in[i]));
	StoreVec(&out[i], r);
	}
	for (size_t i = n - remainder; i < n; ++i) {
	out[i] = 1.0f / (1.0f + std::exp(-in[i]));
	}
	return wrap(out);
	}

	// [[Rcpp::export]]
	NumericVector exp4096(NumericVector x) {
	// have to convert doubles to float
	std::vector<float> in = as<std::vector<float> >(x);
	size_t n = x.size();
	std::vector<float> out(n);

	const size_t remainder = n % 8;
	for (size_t i = 0; i < n - remainder; i += 8) {
	Float8 r = Exp4096(Float8(&in[i]));
	StoreVec(&out[i], r);
	}
	for (size_t i = n - remainder; i < n; ++i) {
	out[i] = std::exp(-in[i]);
	}
	return wrap(out);
	}

	// [[Rcpp::export]]
	NumericVector expAgner(NumericVector x) {
	// have to convert doubles to float
	std::vector<float> in = as<std::vector<float> >(x);
	size_t n = x.size();
	std::vector<float> out(n);

	const size_t remainder = n % 8;
	for (size_t i = 0; i < n - remainder; i += 8) {
	Float8 r = ExpAgner(Float8(&in[i]));
	StoreVec(&out[i], r);
	}
	for (size_t i = n - remainder; i < n; ++i) {
	out[i] = std::exp(-in[i]);
	}
	return wrap(out);
	}