Created
August 27, 2024 19:58
-
-
Save jeethu/e4b4d988620cb9031afbbcdcbed9c3f8 to your computer and use it in GitHub Desktop.
neon_fast_exp benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <arm_neon.h> | |
#include <benchmark/benchmark.h> | |
/// Approximates exp(x) on eight half-precision lanes at once.
///
/// The argument is rescaled to base 2 and clamped, then split into a rounded
/// integer part and a remainder. The remainder is fed through a polynomial
/// evaluated with fused multiply-adds (Horner form), and the integer part is
/// turned into a power of two by writing it into the float16 exponent bits.
///
/// @param x  Eight float16 lanes.
/// @return   Per lane: polynomial(remainder) * 2**(integer part).
inline float16x8_t neon_fast_exp(float16x8_t x) {
  // Broadcasts one scalar constant to all eight lanes.
  const auto splat = [](float value) { return vdupq_n_f16(float16_t(value)); };

  // exp(x) == 2**(x * log2(e)); the base-2 exponent is limited to [-14, 14].
  const float16x8_t scaled = vmulq_f16(x, splat(1.442695f));
  const float16x8_t lower_bounded = vmaxq_f16(scaled, splat(-14.f));
  const float16x8_t exponent = vminq_f16(lower_bounded, splat(14.f));

  // floor(v + 0.5) rounds to the nearest integer, so the remainder lies in
  // [-0.5, 0.5).
  const float16x8_t whole = vrndmq_f16(vaddq_f16(exponent, splat(0.5f)));
  const float16x8_t frac = vsubq_f16(exponent, whole);

  // Coefficients in the order they enter the Horner recurrence.
  // NOTE(review): 1.339887440266574e-3 is applied twice in this function,
  // whereas neon_fast_exp_fixed applies it once -- presumably this version is
  // kept as the baseline for the comparison; confirm before changing it.
  constexpr float kCoefficients[] = {
      1.339887440266574e-3f, 1.339887440266574e-3f, 9.618437357674640e-3f,
      5.550332471162809e-2f, 2.402264791363012e-1f, 6.931472028550421e-1f,
      1.000000000000000f,
  };
  float16x8_t poly = splat(1.535336188319500e-4f);
  for (const float coefficient : kCoefficients) {
    poly = vfmaq_f16(splat(coefficient), poly, frac);  // coefficient + poly * frac
  }

  // Build 2**whole directly in the float16 bit pattern: add 15 to the integer
  // exponent and shift it left by 10 bits, then reinterpret the bits as float16.
  const int16x8_t biased = vaddq_s16(vcvtq_s16_f16(whole), vdupq_n_s16(15));
  const float16x8_t power_of_two = vreinterpretq_f16_s16(vshlq_n_s16(biased, 10));
  return vmulq_f16(power_of_two, poly);
}
/// Approximates exp(x) on eight half-precision lanes at once.
///
/// Steps: rescale the argument to base 2 and clamp it, split it into a
/// rounded integer part and a remainder, evaluate a degree-6 polynomial of
/// the remainder with fused multiply-adds (Horner form), and scale the result
/// by 2**(integer part), which is assembled in the float16 exponent bits.
///
/// @param x  Eight float16 lanes.
/// @return   Per lane: polynomial(remainder) * 2**(integer part).
inline float16x8_t neon_fast_exp_fixed(float16x8_t x) {
  // Broadcasts one scalar constant to all eight lanes.
  const auto broadcast = [](float value) {
    return vdupq_n_f16(float16_t(value));
  };

  // Multiply by log2(e), then keep the base-2 exponent inside [-14, 14].
  const float16x8_t base2 = vmulq_f16(x, broadcast(1.442695f));
  const float16x8_t bounded =
      vminq_f16(vmaxq_f16(base2, broadcast(-14.f)), broadcast(14.f));

  // Nearest integer via floor(v + 0.5); what is left over lies in [-0.5, 0.5).
  const float16x8_t int_part = vrndmq_f16(vaddq_f16(bounded, broadcast(0.5f)));
  const float16x8_t remainder = vsubq_f16(bounded, int_part);

  // Horner evaluation: start from the highest-order coefficient and fold in
  // the remaining ones from high order to low order.
  constexpr float kLowerCoefficients[] = {
      1.339887440266574e-3f, 9.618437357674640e-3f, 5.550332471162809e-2f,
      2.402264791363012e-1f, 6.931472028550421e-1f, 1.000000000000000f,
  };
  float16x8_t acc = broadcast(1.535336188319500e-4f);
  for (const float c : kLowerCoefficients) {
    acc = vfmaq_f16(broadcast(c), acc, remainder);  // c + acc * remainder
  }

  // Generate 2**int_part in the floating-point representation with integer
  // operations: add 15, shift left by 10 bits, reinterpret as float16.
  int16x8_t exponent_bits = vcvtq_s16_f16(int_part);
  exponent_bits = vaddq_s16(exponent_bits, vdupq_n_s16(15));
  exponent_bits = vshlq_n_s16(exponent_bits, 10);
  const float16x8_t scale = vreinterpretq_f16_s16(exponent_bits);
  return vmulq_f16(scale, acc);
}
// Benchmark harness: performs one call of Func per timed iteration.
//
// Func   Function under test, supplied as a non-type template argument; it
//        takes and returns a float16x8_t.
// state  Google Benchmark state object that drives the timing loop.
template <float16x8_t(Func)(float16x8_t)>
static void BM_NeonFastExp(benchmark::State &state) {
  float16x8_t input =
      vdupq_n_f16(1.0f); // Example input vector with all elements as 1.0f
  for (auto _ : state) {
    // The input never changes, so without this barrier the optimizer is free
    // to evaluate Func(input) once (or at compile time) and hoist it out of
    // the loop, leaving nothing to measure. Making the value opaque on every
    // iteration forces the call to be recomputed inside the timed region.
    benchmark::DoNotOptimize(input);
    float16x8_t result = Func(input);
    benchmark::DoNotOptimize(
        result); // Prevent the compiler from discarding the computed value
  }
}
// Register the function as a benchmark | |
BENCHMARK(BM_NeonFastExp<neon_fast_exp>); | |
BENCHMARK(BM_NeonFastExp<neon_fast_exp_fixed>); | |
// Main function to run the benchmarks | |
BENCHMARK_MAIN(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment