Last active
September 24, 2021 16:20
-
-
Save x42/842fd5638d784f626caff5d62ad9863a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -- Linux / Intel -- | |
// g++ -o peak_calc peak_calc.cc -Wall -mavx -lm -O3 -fopt-info && ./peak_calc | |
// g++ -o peak_calc peak_calc.cc -Wall -msse2 -lm -O3 -fopt-info && ./peak_calc | |
// | |
// -- Linux / ARM -- | |
// g++ -o peak_calc peak_calc.cc -Wall -lm -O3 && ./peak_calc | |
// g++ -o peak_calc peak_calc.cc -Wall -mfpu=neon-vfpv4 -lm -O3 && ./peak_calc | |
// | |
// -- macOS -- | |
// g++ -o peak_calc peak_calc.cc -Wall -lm -O3 -framework Accelerate | |
#include <math.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <sys/time.h> | |
#include <time.h> | |
float | |
fallback_compute_peak (const float* const buf, uint32_t n_samples, float current) | |
{ | |
for (uint32_t i = 0; i < n_samples; ++i) { | |
const float x = fabsf (buf[i]); | |
if (x > current) { | |
current = x; | |
} | |
} | |
return current; | |
} | |
#ifdef __APPLE__ | |
#include <Accelerate/Accelerate.h> | |
#define _HAVE_DSP_COMPUTE_PEAK | |
static float | |
dsp_compute_peak (const float* buf, uint32_t n_samples, float current) | |
{ | |
float tmp = 0.0f; | |
vDSP_maxmgv (buf, (vDSP_Stride)1, &tmp, n_samples); | |
return fmaxf (current, tmp); | |
} | |
#elif (defined __aarch64__) || (defined __arm__) | |
#include <arm_acle.h> | |
#include <arm_neon.h> | |
#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0) | |
#define _HAVE_DSP_COMPUTE_PEAK | |
float | |
dsp_compute_peak (const float* src, uint32_t nframes, float current) | |
{ | |
float32x4_t vc0; | |
// Broadcast single value to all elements of the register | |
vc0 = vdupq_n_f32 (current); | |
// While pointer is not aligned, process one sample at a time | |
while (!IS_ALIGNED_TO (src, sizeof (float32x4_t)) && (nframes > 0)) { | |
float32x4_t x0; | |
x0 = vld1q_dup_f32 (src); | |
x0 = vabsq_f32 (x0); | |
vc0 = vmaxq_f32 (vc0, x0); | |
++src; | |
--nframes; | |
} | |
// SIMD portion with aligned buffers | |
do { | |
while (nframes >= 8) { | |
float32x4_t x0, x1; | |
x0 = vld1q_f32 (src + 0); | |
x1 = vld1q_f32 (src + 4); | |
x0 = vabsq_f32 (x0); | |
x1 = vabsq_f32 (x1); | |
vc0 = vmaxq_f32 (vc0, x0); | |
vc0 = vmaxq_f32 (vc0, x1); | |
src += 8; | |
nframes -= 8; | |
} | |
while (nframes >= 4) { | |
float32x4_t x0; | |
x0 = vld1q_f32 (src); | |
x0 = vabsq_f32 (x0); | |
vc0 = vmaxq_f32 (vc0, x0); | |
src += 4; | |
nframes -= 4; | |
} | |
while (nframes >= 2) { | |
float32x2_t x0; | |
float32x4_t y0; | |
x0 = vld1_f32 (src); // Load two elements | |
x0 = vabs_f32 (x0); // Compute ABS value | |
y0 = vcombine_f32 (x0, x0); // Combine two vectors | |
vc0 = vmaxq_f32 (vc0, y0); | |
src += 2; | |
nframes -= 2; | |
} | |
} while (0); | |
// Do remaining samples one frame at a time | |
while (nframes > 0) { | |
float32x4_t x0; | |
x0 = vld1q_dup_f32 (src); | |
x0 = vabsq_f32 (x0); | |
vc0 = vmaxq_f32 (vc0, x0); | |
++src; | |
--nframes; | |
} | |
// Compute the max in register | |
do { | |
float32x2_t vlo = vget_low_f32 (vc0); | |
float32x2_t vhi = vget_high_f32 (vc0); | |
float32x2_t max0 = vpmax_f32 (vlo, vhi); | |
float32x2_t max1 = vpmax_f32 (max0, max0); // Max is now at max1[0] | |
current = vget_lane_f32 (max1, 0); | |
} while (0); | |
return current; | |
} | |
#undef IS_ALIGNED_TO | |
#elif (defined __x86_64__) || (defined __i386__) || (defined _M_X64) || (defined _M_IX86) // ARCH_X86 | |
#include <immintrin.h> | |
#include <xmmintrin.h> | |
#ifdef __AVX__ // TODO runtime detect AVX | |
#define _HAVE_DSP_COMPUTE_PEAK | |
#warning using AVX, this limits available architectures | |
static float | |
dsp_compute_peak (const float* src, uint32_t n_samples, float current) | |
{ | |
const __m256 ABS_MASK = _mm256_set1_ps (-0.0F); | |
// Broadcast the current max value to all elements of the YMM register | |
__m256 vmax = _mm256_broadcast_ss (¤t); | |
// Compute single min/max of unaligned portion until alignment is reached | |
while ((((intptr_t)src) % 32 != 0) && n_samples > 0) { | |
__m256 vsrc; | |
vsrc = _mm256_setzero_ps (); | |
vsrc = _mm256_castps128_ps256 (_mm_load_ss (src)); | |
vsrc = _mm256_andnot_ps (ABS_MASK, vsrc); | |
vmax = _mm256_max_ps (vmax, vsrc); | |
++src; | |
--n_samples; | |
} | |
// Process the aligned portion 16 samples at a time | |
while (n_samples >= 16) { | |
#ifdef _WIN32 | |
_mm_prefetch (((char*)src + (16 * sizeof (float))), _mm_hint (0)); | |
#else | |
__builtin_prefetch (src + (16 * sizeof (float)), 0, 0); | |
#endif | |
__m256 vsrc1, vsrc2; | |
vsrc1 = _mm256_load_ps (src + 0); | |
vsrc2 = _mm256_load_ps (src + 8); | |
vsrc1 = _mm256_andnot_ps (ABS_MASK, vsrc1); | |
vsrc2 = _mm256_andnot_ps (ABS_MASK, vsrc2); | |
vmax = _mm256_max_ps (vmax, vsrc1); | |
vmax = _mm256_max_ps (vmax, vsrc2); | |
src += 16; | |
n_samples -= 16; | |
} | |
// Process the remaining samples 8 at a time | |
while (n_samples >= 8) { | |
__m256 vsrc; | |
vsrc = _mm256_load_ps (src); | |
vsrc = _mm256_andnot_ps (ABS_MASK, vsrc); | |
vmax = _mm256_max_ps (vmax, vsrc); | |
src += 8; | |
n_samples -= 8; | |
} | |
// If there are still some left 4 to 8 samples, process them below | |
while (n_samples > 0) { | |
__m256 vsrc; | |
vsrc = _mm256_setzero_ps (); | |
vsrc = _mm256_castps128_ps256 (_mm_load_ss (src)); | |
vsrc = _mm256_andnot_ps (ABS_MASK, vsrc); | |
vmax = _mm256_max_ps (vmax, vsrc); | |
++src; | |
--n_samples; | |
} | |
__m256 tmp; | |
tmp = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1)); | |
vmax = _mm256_max_ps (tmp, vmax); | |
tmp = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2)); | |
vmax = _mm256_max_ps (tmp, vmax); | |
tmp = _mm256_permute2f128_ps (vmax, vmax, 1); | |
vmax = _mm256_max_ps (tmp, vmax); | |
// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions | |
_mm256_zeroupper (); | |
#if defined(__GNUC__) && (__GNUC__ < 5) | |
return *((float*)&vmax); | |
#elif defined(__GNUC__) && (__GNUC__ < 8) | |
return vmax[0]; | |
#else | |
return _mm256_cvtss_f32 (vmax); | |
#endif | |
} | |
#elif defined __SSE2__ | |
#define _HAVE_DSP_COMPUTE_PEAK | |
static float | |
dsp_compute_peak (const float* src, uint32_t n_samples, float current) | |
{ | |
const __m128 ABS_MASK = _mm_set1_ps (-0.0F); | |
__m128 vmax; | |
__m128 temp; | |
vmax = _mm_set1_ps (current); | |
// Compute single max of unaligned portion until alignment is reached | |
while (((intptr_t)src) % 16 != 0 && n_samples > 0) { | |
temp = _mm_set1_ps (*src); | |
temp = _mm_andnot_ps (ABS_MASK, temp); | |
vmax = _mm_max_ps (vmax, temp); | |
++src; | |
--n_samples; | |
} | |
// use 64 byte prefetch for quadruple quads | |
while (n_samples >= 16) { | |
#ifdef _WIN32 | |
_mm_prefetch (((char*)src + (16 * sizeof (float))), _mm_hint (0)); | |
#else | |
__builtin_prefetch (src + (16 * sizeof (float)), 0, 0); | |
#endif | |
temp = _mm_load_ps (src); | |
temp = _mm_andnot_ps (ABS_MASK, temp); | |
vmax = _mm_max_ps (vmax, temp); | |
src += 4; | |
temp = _mm_load_ps (src); | |
temp = _mm_andnot_ps (ABS_MASK, temp); | |
vmax = _mm_max_ps (vmax, temp); | |
src += 4; | |
temp = _mm_load_ps (src); | |
temp = _mm_andnot_ps (ABS_MASK, temp); | |
vmax = _mm_max_ps (vmax, temp); | |
src += 4; | |
temp = _mm_load_ps (src); | |
temp = _mm_andnot_ps (ABS_MASK, temp); | |
vmax = _mm_max_ps (vmax, temp); | |
src += 4; | |
n_samples -= 16; | |
} | |
// temp through aligned buffers | |
while (n_samples >= 4) { | |
temp = _mm_load_ps (src); | |
temp = _mm_andnot_ps (ABS_MASK, temp); | |
vmax = _mm_max_ps (vmax, temp); | |
src += 4; | |
n_samples -= 4; | |
} | |
// temp through the rest < 4 samples | |
while (n_samples > 0) { | |
temp = _mm_set1_ps (*src); | |
temp = _mm_andnot_ps (ABS_MASK, temp); | |
vmax = _mm_max_ps (vmax, temp); | |
++src; | |
--n_samples; | |
} | |
temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1)); | |
vmax = _mm_max_ps (temp, vmax); | |
temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2)); | |
vmax = _mm_max_ps (temp, vmax); | |
#if defined(__GNUC__) && (__GNUC__ < 5) | |
return *((float*)&vmax); | |
#else | |
return vmax[0]; | |
#endif | |
} | |
#endif // SSE | |
#endif | |
#ifndef _HAVE_DSP_COMPUTE_PEAK | |
#warning dsp_compute_peak is not accelerated on this architecture | |
static float | |
dsp_compute_peak (const float* const buf, uint32_t n_samples, float current) | |
{ | |
for (uint32_t i = 0; i < n_samples; ++i) { | |
const float x = fabsf (buf[i]); | |
if (x > current) { | |
current = x; | |
} | |
} | |
return current; | |
} | |
#endif | |
#if (defined(__x86_64__) || defined(_M_X64)) | |
static const int CPU_CACHE_ALIGN = 64; | |
#elif (defined __aarch64__) || (defined __arm__) | |
static const int CPU_CACHE_ALIGN = 128; // sizeof(float32x4_t) | |
#else | |
static const int CPU_CACHE_ALIGN = 16; | |
#endif | |
int | |
main () | |
{ | |
float pk; | |
float* buf = NULL; | |
int n_samples = 1024; | |
posix_memalign ((void**)&buf, CPU_CACHE_ALIGN, n_samples * sizeof (float)); | |
#if 1 | |
srand (time (NULL)); | |
srand (5); | |
for (int i = 0; i < n_samples; ++i) { | |
buf[i] = (2.f * rand () / (float)RAND_MAX) - 1.f; | |
} | |
#else | |
memset (buf, 0, sizeof (float) * n_samples); | |
buf[10] = .5; | |
#endif | |
//buf = &buf[1]; --n_samples; | |
struct timeval tv0, tv1; | |
#if 1 | |
pk = 0; | |
gettimeofday (&tv0, NULL); | |
for (int i = 0; i < 1000000; ++i) { | |
pk = fallback_compute_peak (buf, n_samples, pk); | |
} | |
gettimeofday (&tv1, NULL); | |
printf ("SIM %8.3f ms peak: %f\n", ((tv1.tv_sec * 1000000 + tv1.tv_usec) - (tv0.tv_sec * 1000000 + tv0.tv_usec)) / 1000.f, pk); | |
#endif | |
#if 1 | |
pk = 0; | |
gettimeofday (&tv0, NULL); | |
for (int i = 0; i < 1000000; ++i) { | |
pk = dsp_compute_peak (buf, n_samples, pk); | |
} | |
gettimeofday (&tv1, NULL); | |
printf ("OPT %8.3f ms peak: %f\n", ((tv1.tv_sec * 1000000 + tv1.tv_usec) - (tv0.tv_sec * 1000000 + tv0.tv_usec)) / 1000.f, pk); | |
#endif | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment