jeethu · August 27, 2024 19:58
diff --git a/bench.cpp b/bench.cpp
 #include <arm_neon.h>
 #include <benchmark/benchmark.h>

 // Approximates e^x in each of the eight half-precision lanes of `x`.
 //
 // The input is scaled by log2(e) and clamped to [-14, 14], then split into
 // floor(v + 0.5) and the remainder. A polynomial in the remainder is
 // evaluated with fused multiply-adds, and the power of two is assembled by
 // writing the biased integer part into the fp16 exponent bits.
 //
 // NOTE(review): the 1.339887440266574e-3 step runs twice in this function,
 // while neon_fast_exp_fixed runs it once. Presumably this is the unmodified
 // baseline of the comparison, so the repeat is kept exactly as it was.
 inline float16x8_t neon_fast_exp(float16x8_t x) {
  // Broadcasts one scalar into all eight fp16 lanes.
  const auto splat = [](float value) { return vdupq_n_f16(float16_t(value)); };

  // Rescale to a base-2 exponent, then clamp: lower bound first, then upper.
  const float16x8_t scaled = vmulq_f16(x, splat(1.442695f));
  const float16x8_t clamped =
      vminq_f16(vmaxq_f16(scaled, splat(-14.f)), splat(14.f));

  // floor(v + 0.5) is the integer part; what is left feeds the polynomial.
  const float16x8_t whole = vrndmq_f16(vaddq_f16(clamped, splat(0.5f)));
  const float16x8_t frac = vsubq_f16(clamped, whole);

  // Each step computes coefficient + poly * frac.
  float16x8_t poly = splat(1.535336188319500e-4f);
  poly = vfmaq_f16(splat(1.339887440266574e-3f), poly, frac);
  poly = vfmaq_f16(splat(1.339887440266574e-3f), poly, frac); // repeated step
  poly = vfmaq_f16(splat(9.618437357674640e-3f), poly, frac);
  poly = vfmaq_f16(splat(5.550332471162809e-2f), poly, frac);
  poly = vfmaq_f16(splat(2.402264791363012e-1f), poly, frac);
  poly = vfmaq_f16(splat(6.931472028550421e-1f), poly, frac);
  poly = vfmaq_f16(splat(1.000000000000000f), poly, frac);

  // Build 2^whole from integer bits: add 15 to the integer part and shift it
  // left by 10 so it lands in the fp16 exponent field.
  const int16x8_t biased = vaddq_s16(vcvtq_s16_f16(whole), vdupq_n_s16(15));
  const float16x8_t pow2 = vreinterpretq_f16_s16(vshlq_n_s16(biased, 10));

  return vmulq_f16(pow2, poly);
 }

 // Approximates e^x in each of the eight half-precision lanes of `x`.
 //
 // Steps: scale by log2(e), clamp to [-14, 14], split into floor(v + 0.5) and
 // the remainder, evaluate a polynomial in the remainder with fused
 // multiply-adds, and multiply by a power of two assembled directly in the
 // fp16 exponent bits.
 inline float16x8_t neon_fast_exp_fixed(float16x8_t x) {
  const float16x8_t log2e = vdupq_n_f16(float16_t(1.442695f));
  const float16x8_t lower = vdupq_n_f16(float16_t(-14.f));
  const float16x8_t upper = vdupq_n_f16(float16_t(14.f));
  const float16x8_t half = vdupq_n_f16(float16_t(0.5f));

  // Base-2 exponent, clamped from below and then from above.
  const float16x8_t exponent =
      vminq_f16(vmaxq_f16(vmulq_f16(x, log2e), lower), upper);

  // Integer part via floor(v + 0.5); the remainder drives the polynomial.
  const float16x8_t whole = vrndmq_f16(vaddq_f16(exponent, half));
  const float16x8_t frac = vsubq_f16(exponent, whole);

  // One polynomial step: coeff + acc * frac.
  const auto step = [&frac](float16x8_t acc, float coeff) {
    return vfmaq_f16(vdupq_n_f16(float16_t(coeff)), acc, frac);
  };

  float16x8_t poly = vdupq_n_f16(float16_t(1.535336188319500e-4f));
  poly = step(poly, 1.339887440266574e-3f);
  poly = step(poly, 9.618437357674640e-3f);
  poly = step(poly, 5.550332471162809e-2f);
  poly = step(poly, 2.402264791363012e-1f);
  poly = step(poly, 6.931472028550421e-1f);
  poly = step(poly, 1.000000000000000f);

  // Build 2^whole from integer bits: add 15 to the integer part and shift it
  // left by 10 so it lands in the fp16 exponent field.
  int16x8_t exp_bits = vcvtq_s16_f16(whole);
  exp_bits = vshlq_n_s16(vaddq_s16(exp_bits, vdupq_n_s16(15)), 10);

  return vmulq_f16(vreinterpretq_f16_s16(exp_bits), poly);
 }

 // Benchmarks one exp kernel, supplied as the non-type template argument Func.
 //
 // Every lane of the input vector is 1.0. Because that input never changes,
 // Func(input) is loop-invariant: an optimizer that can see through Func may
 // constant-fold the call or hoist it out of the loop, leaving nothing to
 // time. Passing the input through benchmark::DoNotOptimize on every
 // iteration makes the compiler treat its value as unknown, so the kernel is
 // actually evaluated inside the timed loop.
 template <float16x8_t(Func)(float16x8_t)>
 static void BM_NeonFastExp(benchmark::State &state) {
  float16x8_t input = vdupq_n_f16(1.0f);

  for (auto _ : state) {
    // Hide the input value from the optimizer before each call.
    benchmark::DoNotOptimize(input);
    float16x8_t result = Func(input);
    // Keep the result alive so the call itself is not removed.
    benchmark::DoNotOptimize(result);
  }
 }

 // Register a benchmark instance for each of the two kernels:
 // neon_fast_exp and neon_fast_exp_fixed.
 BENCHMARK(BM_NeonFastExp<neon_fast_exp>);
 BENCHMARK(BM_NeonFastExp<neon_fast_exp_fixed>);

 // Provides the program's main function, which runs the registered benchmarks.
 BENCHMARK_MAIN();
	#include <arm_neon.h>
	#include <benchmark/benchmark.h>

	// Approximates e^x in each of the eight half-precision lanes of `x`.
	//
	// The input is scaled by log2(e) and clamped to [-14, 14], then split into
	// floor(v + 0.5) and the remainder. A polynomial in the remainder is
	// evaluated with fused multiply-adds, and the power of two is assembled by
	// writing the biased integer part into the fp16 exponent bits.
	//
	// NOTE(review): the 1.339887440266574e-3 step runs twice in this function,
	// while neon_fast_exp_fixed runs it once. Presumably this is the unmodified
	// baseline of the comparison, so the repeat is kept exactly as it was.
	inline float16x8_t neon_fast_exp(float16x8_t x) {
		// Broadcasts one scalar into all eight fp16 lanes.
		const auto splat = [](float value) { return vdupq_n_f16(float16_t(value)); };

		// Rescale to a base-2 exponent, then clamp: lower bound first, then upper.
		const float16x8_t scaled = vmulq_f16(x, splat(1.442695f));
		const float16x8_t clamped =
			vminq_f16(vmaxq_f16(scaled, splat(-14.f)), splat(14.f));

		// floor(v + 0.5) is the integer part; what is left feeds the polynomial.
		const float16x8_t whole = vrndmq_f16(vaddq_f16(clamped, splat(0.5f)));
		const float16x8_t frac = vsubq_f16(clamped, whole);

		// Each step computes coefficient + poly * frac.
		float16x8_t poly = splat(1.535336188319500e-4f);
		poly = vfmaq_f16(splat(1.339887440266574e-3f), poly, frac);
		poly = vfmaq_f16(splat(1.339887440266574e-3f), poly, frac); // repeated step
		poly = vfmaq_f16(splat(9.618437357674640e-3f), poly, frac);
		poly = vfmaq_f16(splat(5.550332471162809e-2f), poly, frac);
		poly = vfmaq_f16(splat(2.402264791363012e-1f), poly, frac);
		poly = vfmaq_f16(splat(6.931472028550421e-1f), poly, frac);
		poly = vfmaq_f16(splat(1.000000000000000f), poly, frac);

		// Build 2^whole from integer bits: add 15 to the integer part and shift it
		// left by 10 so it lands in the fp16 exponent field.
		const int16x8_t biased = vaddq_s16(vcvtq_s16_f16(whole), vdupq_n_s16(15));
		const float16x8_t pow2 = vreinterpretq_f16_s16(vshlq_n_s16(biased, 10));

		return vmulq_f16(pow2, poly);
	}

	// Approximates e^x in each of the eight half-precision lanes of `x`.
	//
	// Steps: scale by log2(e), clamp to [-14, 14], split into floor(v + 0.5) and
	// the remainder, evaluate a polynomial in the remainder with fused
	// multiply-adds, and multiply by a power of two assembled directly in the
	// fp16 exponent bits.
	inline float16x8_t neon_fast_exp_fixed(float16x8_t x) {
		const float16x8_t log2e = vdupq_n_f16(float16_t(1.442695f));
		const float16x8_t lower = vdupq_n_f16(float16_t(-14.f));
		const float16x8_t upper = vdupq_n_f16(float16_t(14.f));
		const float16x8_t half = vdupq_n_f16(float16_t(0.5f));

		// Base-2 exponent, clamped from below and then from above.
		const float16x8_t exponent =
			vminq_f16(vmaxq_f16(vmulq_f16(x, log2e), lower), upper);

		// Integer part via floor(v + 0.5); the remainder drives the polynomial.
		const float16x8_t whole = vrndmq_f16(vaddq_f16(exponent, half));
		const float16x8_t frac = vsubq_f16(exponent, whole);

		// One polynomial step: coeff + acc * frac.
		const auto step = [&frac](float16x8_t acc, float coeff) {
			return vfmaq_f16(vdupq_n_f16(float16_t(coeff)), acc, frac);
		};

		float16x8_t poly = vdupq_n_f16(float16_t(1.535336188319500e-4f));
		poly = step(poly, 1.339887440266574e-3f);
		poly = step(poly, 9.618437357674640e-3f);
		poly = step(poly, 5.550332471162809e-2f);
		poly = step(poly, 2.402264791363012e-1f);
		poly = step(poly, 6.931472028550421e-1f);
		poly = step(poly, 1.000000000000000f);

		// Build 2^whole from integer bits: add 15 to the integer part and shift it
		// left by 10 so it lands in the fp16 exponent field.
		int16x8_t exp_bits = vcvtq_s16_f16(whole);
		exp_bits = vshlq_n_s16(vaddq_s16(exp_bits, vdupq_n_s16(15)), 10);

		return vmulq_f16(vreinterpretq_f16_s16(exp_bits), poly);
	}

	// Benchmarks one exp kernel, supplied as the non-type template argument Func.
	//
	// Every lane of the input vector is 1.0. Because that input never changes,
	// Func(input) is loop-invariant: an optimizer that can see through Func may
	// constant-fold the call or hoist it out of the loop, leaving nothing to
	// time. Passing the input through benchmark::DoNotOptimize on every
	// iteration makes the compiler treat its value as unknown, so the kernel is
	// actually evaluated inside the timed loop.
	template <float16x8_t(Func)(float16x8_t)>
	static void BM_NeonFastExp(benchmark::State &state) {
		float16x8_t input = vdupq_n_f16(1.0f);

		for (auto _ : state) {
			// Hide the input value from the optimizer before each call.
			benchmark::DoNotOptimize(input);
			float16x8_t result = Func(input);
			// Keep the result alive so the call itself is not removed.
			benchmark::DoNotOptimize(result);
		}
	}

	// Register a benchmark instance for each of the two kernels:
	// neon_fast_exp and neon_fast_exp_fixed.
	BENCHMARK(BM_NeonFastExp<neon_fast_exp>);
	BENCHMARK(BM_NeonFastExp<neon_fast_exp_fixed>);

	// Provides the program's main function, which runs the registered benchmarks.
	BENCHMARK_MAIN();