x42 · September 24, 2021 16:20
diff --git a/peak_calc.cc b/peak_calc.cc
 // -- Linux / Intel --
 // g++ -o peak_calc peak_calc.cc -Wall -mavx -lm -O3 -fopt-info && ./peak_calc
 // g++ -o peak_calc peak_calc.cc -Wall -msse2 -lm -O3 -fopt-info && ./peak_calc
 //
 // -- Linux / ARM --
 // g++ -o peak_calc peak_calc.cc -Wall -lm -O3 && ./peak_calc
 // g++ -o peak_calc peak_calc.cc -Wall -mfpu=neon-vfpv4 -lm -O3 && ./peak_calc
 //
 // -- macOS --
 // g++ -o peak_calc peak_calc.cc -Wall -lm -O3 -framework Accelerate

 #include <math.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
 #include <time.h>

 /** Scalar reference implementation of the running-peak search.
  *
  * @param buf       samples to scan
  * @param n_samples number of floats to read from buf
  * @param current   peak found so far
  * @return current, raised to the largest |buf[i]| that exceeds it
  */
 float
 fallback_compute_peak (const float* const buf, uint32_t n_samples, float current)
 {
 	const float* cursor    = buf;
 	uint32_t     remaining = n_samples;

 	while (remaining > 0) {
 		const float magnitude = fabsf (*cursor);

 		// Strictly-greater comparison: ties (and NaN samples) leave the peak untouched
 		current = (magnitude > current) ? magnitude : current;

 		++cursor;
 		--remaining;
 	}
 	return current;
 }

 #ifdef __APPLE__
 #include <Accelerate/Accelerate.h>
 #define _HAVE_DSP_COMPUTE_PEAK

 /** Running-peak search delegated to Accelerate's vDSP_maxmgv ().
  *
  * @param buf       samples to scan (unit stride)
  * @param n_samples number of floats to read from buf
  * @param current   peak found so far
  * @return fmaxf () of current and the value vDSP_maxmgv () reports for buf
  */
 static float
 dsp_compute_peak (const float* buf, uint32_t n_samples, float current)
 {
 	// Seeded with 0 in case vDSP leaves the output untouched
 	float block_peak = 0.0f;

 	vDSP_maxmgv (buf, (vDSP_Stride)1, &block_peak, n_samples);

 	const float merged = fmaxf (current, block_peak);
 	return merged;
 }

 #elif (defined __aarch64__) || (defined __arm__)

 #include <arm_acle.h>
 #include <arm_neon.h>

 // Non-zero when the address `ptr` is a multiple of `bytes`.
 // Both arguments are parenthesized so that a compound expression (e.g. `a ? p : q`)
 // is cast as a whole instead of only its first operand.
 #define IS_ALIGNED_TO(ptr, bytes) ((((uintptr_t)(ptr)) % (bytes)) == 0)
 #define _HAVE_DSP_COMPUTE_PEAK

 float
 dsp_compute_peak (const float* src, uint32_t nframes, float current)
 {
 	float32x4_t vc0;

 	// Broadcast single value to all elements of the register
 	vc0 = vdupq_n_f32 (current);

 	// While pointer is not aligned, process one sample at a time
 	while (!IS_ALIGNED_TO (src, sizeof (float32x4_t)) && (nframes > 0)) {
 		float32x4_t x0;

 		x0  = vld1q_dup_f32 (src);
 		x0  = vabsq_f32 (x0);
 		vc0 = vmaxq_f32 (vc0, x0);

 		++src;
 		--nframes;
 	}

 	// SIMD portion with aligned buffers
 	do {
 		while (nframes >= 8) {
 			float32x4_t x0, x1;

 			x0 = vld1q_f32 (src + 0);
 			x1 = vld1q_f32 (src + 4);

 			x0 = vabsq_f32 (x0);
 			x1 = vabsq_f32 (x1);

 			vc0 = vmaxq_f32 (vc0, x0);
 			vc0 = vmaxq_f32 (vc0, x1);

 			src += 8;
 			nframes -= 8;
 		}

 		while (nframes >= 4) {
 			float32x4_t x0;

 			x0 = vld1q_f32 (src);

 			x0  = vabsq_f32 (x0);
 			vc0 = vmaxq_f32 (vc0, x0);

 			src += 4;
 			nframes -= 4;
 		}

 		while (nframes >= 2) {
 			float32x2_t x0;
 			float32x4_t y0;

 			x0 = vld1_f32 (src);        // Load two elements
 			x0 = vabs_f32 (x0);         // Compute ABS value
 			y0 = vcombine_f32 (x0, x0); // Combine two vectors

 			vc0 = vmaxq_f32 (vc0, y0);

 			src += 2;
 			nframes -= 2;
 		}
 	} while (0);

 	// Do remaining samples one frame at a time
 	while (nframes > 0) {
 		float32x4_t x0;

 		x0  = vld1q_dup_f32 (src);
 		x0  = vabsq_f32 (x0);
 		vc0 = vmaxq_f32 (vc0, x0);

 		++src;
 		--nframes;
 	}

 	// Compute the max in register
 	do {
 		float32x2_t vlo  = vget_low_f32 (vc0);
 		float32x2_t vhi  = vget_high_f32 (vc0);
 		float32x2_t max0 = vpmax_f32 (vlo, vhi);
 		float32x2_t max1 = vpmax_f32 (max0, max0); // Max is now at max1[0]
 		current          = vget_lane_f32 (max1, 0);
 	} while (0);

 	return current;
 }

 #undef IS_ALIGNED_TO

 #elif (defined __x86_64__) || (defined __i386__) || (defined _M_X64) || (defined _M_IX86) // ARCH_X86

 #include <immintrin.h>
 #include <xmmintrin.h>

 #ifdef __AVX__ // TODO runtime detect AVX
 #define _HAVE_DSP_COMPUTE_PEAK

 #warning using AVX, this limits available architectures

 static float
 dsp_compute_peak (const float* src, uint32_t n_samples, float current)
 {
 	const __m256 ABS_MASK = _mm256_set1_ps (-0.0F);

 	// Broadcast the current max value to all elements of the YMM register
 	__m256 vmax = _mm256_broadcast_ss (&current);

 	// Compute single min/max of unaligned portion until alignment is reached
 	while ((((intptr_t)src) % 32 != 0) && n_samples > 0) {
 		__m256 vsrc;

 		vsrc = _mm256_setzero_ps ();
 		vsrc = _mm256_castps128_ps256 (_mm_load_ss (src));
 		vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
 		vmax = _mm256_max_ps (vmax, vsrc);

 		++src;
 		--n_samples;
 	}

 	// Process the aligned portion 16 samples at a time
 	while (n_samples >= 16) {
 #ifdef _WIN32
 		_mm_prefetch (((char*)src + (16 * sizeof (float))), _mm_hint (0));
 #else
 		__builtin_prefetch (src + (16 * sizeof (float)), 0, 0);
 #endif
 		__m256 vsrc1, vsrc2;
 		vsrc1 = _mm256_load_ps (src + 0);
 		vsrc2 = _mm256_load_ps (src + 8);

 		vsrc1 = _mm256_andnot_ps (ABS_MASK, vsrc1);
 		vsrc2 = _mm256_andnot_ps (ABS_MASK, vsrc2);

 		vmax = _mm256_max_ps (vmax, vsrc1);
 		vmax = _mm256_max_ps (vmax, vsrc2);

 		src += 16;
 		n_samples -= 16;
 	}

 	// Process the remaining samples 8 at a time
 	while (n_samples >= 8) {
 		__m256 vsrc;

 		vsrc = _mm256_load_ps (src);
 		vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
 		vmax = _mm256_max_ps (vmax, vsrc);

 		src += 8;
 		n_samples -= 8;
 	}

 	// If there are still some left 4 to 8 samples, process them below
 	while (n_samples > 0) {
 		__m256 vsrc;

 		vsrc = _mm256_setzero_ps ();
 		vsrc = _mm256_castps128_ps256 (_mm_load_ss (src));
 		vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
 		vmax = _mm256_max_ps (vmax, vsrc);

 		++src;
 		--n_samples;
 	}

 	__m256 tmp;
 	tmp  = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1));
 	vmax = _mm256_max_ps (tmp, vmax);
 	tmp  = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2));
 	vmax = _mm256_max_ps (tmp, vmax);
 	tmp  = _mm256_permute2f128_ps (vmax, vmax, 1);
 	vmax = _mm256_max_ps (tmp, vmax);

 	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions
 	_mm256_zeroupper ();

 #if defined(__GNUC__) && (__GNUC__ < 5)
 	return *((float*)&vmax);
 #elif defined(__GNUC__) && (__GNUC__ < 8)
 	return vmax[0];
 #else
 	return _mm256_cvtss_f32 (vmax);
 #endif
 }

 #elif defined __SSE2__
 #define _HAVE_DSP_COMPUTE_PEAK

 /** Running-peak search (largest absolute sample value) using SSE.
  *
  * @param src       samples to scan; any alignment is accepted
  * @param n_samples number of floats to read from src
  * @param current   peak found so far
  * @return the maximum of current and all |src[i]|
  */
 static float
 dsp_compute_peak (const float* src, uint32_t n_samples, float current)
 {
 	// -0.0f is the sign bit alone, so ANDNOT with it clears the sign (absolute value)
 	const __m128 ABS_MASK = _mm_set1_ps (-0.0F);

 	__m128 vmax = _mm_set1_ps (current);
 	__m128 temp;

 	// Head: single samples (copied to all lanes) until src reaches a 16-byte boundary
 	while (((uintptr_t)src) % 16 != 0 && n_samples > 0) {
 		temp = _mm_set1_ps (*src);
 		temp = _mm_andnot_ps (ABS_MASK, temp);
 		vmax = _mm_max_ps (vmax, temp);
 		++src;
 		--n_samples;
 	}

 	// Four aligned quads (16 samples, 64 bytes) per iteration
 	while (n_samples >= 16) {
 		// Hint the 64 bytes that follow the current position. The offset is applied to a
 		// byte pointer: added to a float pointer it would be scaled by sizeof (float) again.
 		const char* const ahead = (const char*)src + (16 * sizeof (float));
 #ifdef _WIN32
 		_mm_prefetch (ahead, _MM_HINT_NTA);
 #else
 		__builtin_prefetch (ahead, 0, 0);
 #endif
 		temp = _mm_andnot_ps (ABS_MASK, _mm_load_ps (src + 0));
 		vmax = _mm_max_ps (vmax, temp);
 		temp = _mm_andnot_ps (ABS_MASK, _mm_load_ps (src + 4));
 		vmax = _mm_max_ps (vmax, temp);
 		temp = _mm_andnot_ps (ABS_MASK, _mm_load_ps (src + 8));
 		vmax = _mm_max_ps (vmax, temp);
 		temp = _mm_andnot_ps (ABS_MASK, _mm_load_ps (src + 12));
 		vmax = _mm_max_ps (vmax, temp);

 		src += 16;
 		n_samples -= 16;
 	}

 	// One aligned quad at a time
 	while (n_samples >= 4) {
 		temp = _mm_load_ps (src);
 		temp = _mm_andnot_ps (ABS_MASK, temp);
 		vmax = _mm_max_ps (vmax, temp);
 		src += 4;
 		n_samples -= 4;
 	}

 	// Tail: fewer than 4 samples are left
 	while (n_samples > 0) {
 		temp = _mm_set1_ps (*src);
 		temp = _mm_andnot_ps (ABS_MASK, temp);
 		vmax = _mm_max_ps (vmax, temp);
 		++src;
 		--n_samples;
 	}

 	// Horizontal reduction: swap neighbours, then swap pairs; every lane ends up with the maximum
 	temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1));
 	vmax = _mm_max_ps (temp, vmax);
 	temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2));
 	vmax = _mm_max_ps (temp, vmax);

 	// Intrinsic lane-0 read: no vector subscripting or pointer cast required
 	return _mm_cvtss_f32 (vmax);
 }
 #endif // SSE

 #endif

 #ifndef _HAVE_DSP_COMPUTE_PEAK
 #warning dsp_compute_peak is not accelerated on this architecture

 /** Portable scalar running-peak search, used when no SIMD variant was selected.
  *
  * @param buf       samples to scan
  * @param n_samples number of floats to read from buf
  * @param current   peak found so far
  * @return current, raised to the largest |buf[i]| that exceeds it
  */
 static float
 dsp_compute_peak (const float* const buf, uint32_t n_samples, float current)
 {
 	uint32_t idx = 0;

 	while (idx < n_samples) {
 		const float magnitude = fabsf (buf[idx]);
 		++idx;

 		// Only a strictly larger magnitude replaces the peak
 		if (!(magnitude > current)) {
 			continue;
 		}
 		current = magnitude;
 	}
 	return current;
 }
 #endif


 // Alignment, in bytes, that main () requests from posix_memalign () for the sample buffer.
 #if (defined(__x86_64__) || defined(_M_X64))
 static const int CPU_CACHE_ALIGN = 64;
 #elif (defined __aarch64__) || (defined __arm__)
 static const int CPU_CACHE_ALIGN = 128; // NOTE(review): sizeof(float32x4_t) is 16 bytes (128 bits); 128 bytes over-aligns -- confirm intent
 #else
 static const int CPU_CACHE_ALIGN = 16;
 #endif

 /** Milliseconds elapsed between two gettimeofday () stamps.
  *
  * The fields are subtracted before scaling, so no intermediate value
  * depends on the absolute epoch time (tv_sec * 1000000 overflows a 32-bit time_t).
  */
 static float
 elapsed_ms (const struct timeval* start, const struct timeval* stop)
 {
 	const double sec  = (double)(stop->tv_sec - start->tv_sec);
 	const double usec = (double)(stop->tv_usec - start->tv_usec);

 	return (float)(sec * 1000.0 + usec / 1000.0);
 }

 /** Benchmark: run the scalar reference and the selected dsp_compute_peak ()
  * one million times each over the same 1024-sample buffer and print the
  * elapsed time and resulting peak of both.
  *
  * @return 0 on success, 1 if the sample buffer cannot be allocated
  */
 int
 main ()
 {
 	float* buf       = NULL;
 	int    n_samples = 1024;

 	// posix_memalign () reports failure through its return value; buf must not be used then
 	if (posix_memalign ((void**)&buf, CPU_CACHE_ALIGN, n_samples * sizeof (float)) != 0) {
 		fprintf (stderr, "peak_calc: cannot allocate aligned sample buffer\n");
 		return 1;
 	}

 	// Remember what the allocator returned: buf itself may be advanced by the toggle below
 	float* const allocation = buf;

 #if 1
 	// Fixed seed: the buffer contents are identical from run to run
 	srand (5);

 	for (int i = 0; i < n_samples; ++i) {
 		buf[i] = (2.f * rand () / (float)RAND_MAX) - 1.f;
 	}
 #else
 	memset (buf, 0, sizeof (float) * n_samples);
 	buf[10] = .5;
 #endif

 	// Enable to start one sample past the aligned address (exercises the unaligned head loops)
 	//buf = &buf[1]; --n_samples;

 	struct timeval tv0, tv1;
 	float          pk;

 #if 1
 	pk = 0;
 	gettimeofday (&tv0, NULL);
 	for (int i = 0; i < 1000000; ++i) {
 		pk = fallback_compute_peak (buf, n_samples, pk);
 	}
 	gettimeofday (&tv1, NULL);
 	printf ("SIM %8.3f ms peak: %f\n", elapsed_ms (&tv0, &tv1), pk);
 #endif

 #if 1
 	pk = 0;
 	gettimeofday (&tv0, NULL);
 	for (int i = 0; i < 1000000; ++i) {
 		pk = dsp_compute_peak (buf, n_samples, pk);
 	}
 	gettimeofday (&tv1, NULL);
 	printf ("OPT %8.3f ms peak: %f\n", elapsed_ms (&tv0, &tv1), pk);
 #endif

 	free (allocation);
 	return 0;
 }
	// -- Linux / Intel --
	// g++ -o peak_calc peak_calc.cc -Wall -mavx -lm -O3 -fopt-info && ./peak_calc
	// g++ -o peak_calc peak_calc.cc -Wall -msse2 -lm -O3 -fopt-info && ./peak_calc
	//
	// -- Linux / ARM --
	// g++ -o peak_calc peak_calc.cc -Wall -lm -O3 && ./peak_calc
	// g++ -o peak_calc peak_calc.cc -Wall -mfpu=neon-vfpv4 -lm -O3 && ./peak_calc
	//
	// -- macOS --
	// g++ -o peak_calc peak_calc.cc -Wall -lm -O3 -framework Accelerate

	#include <math.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/time.h>
	#include <time.h>

	/** Scalar reference implementation of the running-peak search.
	 *
	 * @param buf       samples to scan
	 * @param n_samples number of floats to read from buf
	 * @param current   peak found so far
	 * @return current, raised to the largest |buf[i]| that exceeds it
	 */
	float
	fallback_compute_peak (const float* const buf, uint32_t n_samples, float current)
	{
		const float* cursor    = buf;
		uint32_t     remaining = n_samples;

		while (remaining > 0) {
			const float magnitude = fabsf (*cursor);

			// Strictly-greater comparison: ties (and NaN samples) leave the peak untouched
			current = (magnitude > current) ? magnitude : current;

			++cursor;
			--remaining;
		}
		return current;
	}

	#ifdef __APPLE__
	#include <Accelerate/Accelerate.h>
	#define _HAVE_DSP_COMPUTE_PEAK

	/** Running-peak search delegated to Accelerate's vDSP_maxmgv ().
	 *
	 * @param buf       samples to scan (unit stride)
	 * @param n_samples number of floats to read from buf
	 * @param current   peak found so far
	 * @return fmaxf () of current and the value vDSP_maxmgv () reports for buf
	 */
	static float
	dsp_compute_peak (const float* buf, uint32_t n_samples, float current)
	{
		// Seeded with 0 in case vDSP leaves the output untouched
		float block_peak = 0.0f;

		vDSP_maxmgv (buf, (vDSP_Stride)1, &block_peak, n_samples);

		const float merged = fmaxf (current, block_peak);
		return merged;
	}

	#elif (defined __aarch64__) || (defined __arm__)

	#include <arm_acle.h>
	#include <arm_neon.h>

	// Non-zero when the address `ptr` is a multiple of `bytes`.
	// Both arguments are parenthesized so that a compound expression (e.g. `a ? p : q`)
	// is cast as a whole instead of only its first operand.
	#define IS_ALIGNED_TO(ptr, bytes) ((((uintptr_t)(ptr)) % (bytes)) == 0)
	#define _HAVE_DSP_COMPUTE_PEAK

	float
	dsp_compute_peak (const float* src, uint32_t nframes, float current)
	{
	float32x4_t vc0;

	// Broadcast single value to all elements of the register
	vc0 = vdupq_n_f32 (current);

	// While pointer is not aligned, process one sample at a time
	while (!IS_ALIGNED_TO (src, sizeof (float32x4_t)) && (nframes > 0)) {
	float32x4_t x0;

	x0 = vld1q_dup_f32 (src);
	x0 = vabsq_f32 (x0);
	vc0 = vmaxq_f32 (vc0, x0);

	++src;
	--nframes;
	}

	// SIMD portion with aligned buffers
	do {
	while (nframes >= 8) {
	float32x4_t x0, x1;

	x0 = vld1q_f32 (src + 0);
	x1 = vld1q_f32 (src + 4);

	x0 = vabsq_f32 (x0);
	x1 = vabsq_f32 (x1);

	vc0 = vmaxq_f32 (vc0, x0);
	vc0 = vmaxq_f32 (vc0, x1);

	src += 8;
	nframes -= 8;
	}

	while (nframes >= 4) {
	float32x4_t x0;

	x0 = vld1q_f32 (src);

	x0 = vabsq_f32 (x0);
	vc0 = vmaxq_f32 (vc0, x0);

	src += 4;
	nframes -= 4;
	}

	while (nframes >= 2) {
	float32x2_t x0;
	float32x4_t y0;

	x0 = vld1_f32 (src); // Load two elements
	x0 = vabs_f32 (x0); // Compute ABS value
	y0 = vcombine_f32 (x0, x0); // Combine two vectors

	vc0 = vmaxq_f32 (vc0, y0);

	src += 2;
	nframes -= 2;
	}
	} while (0);

	// Do remaining samples one frame at a time
	while (nframes > 0) {
	float32x4_t x0;

	x0 = vld1q_dup_f32 (src);
	x0 = vabsq_f32 (x0);
	vc0 = vmaxq_f32 (vc0, x0);

	++src;
	--nframes;
	}

	// Compute the max in register
	do {
	float32x2_t vlo = vget_low_f32 (vc0);
	float32x2_t vhi = vget_high_f32 (vc0);
	float32x2_t max0 = vpmax_f32 (vlo, vhi);
	float32x2_t max1 = vpmax_f32 (max0, max0); // Max is now at max1[0]
	current = vget_lane_f32 (max1, 0);
	} while (0);

	return current;
	}

	#undef IS_ALIGNED_TO

	#elif (defined __x86_64__) || (defined __i386__) || (defined _M_X64) || (defined _M_IX86) // ARCH_X86

	#include <immintrin.h>
	#include <xmmintrin.h>

	#ifdef __AVX__ // TODO runtime detect AVX
	#define _HAVE_DSP_COMPUTE_PEAK

	#warning using AVX, this limits available architectures

	static float
	dsp_compute_peak (const float* src, uint32_t n_samples, float current)
	{
	const __m256 ABS_MASK = _mm256_set1_ps (-0.0F);

	// Broadcast the current max value to all elements of the YMM register
	__m256 vmax = _mm256_broadcast_ss (&current);

	// Compute single min/max of unaligned portion until alignment is reached
	while ((((intptr_t)src) % 32 != 0) && n_samples > 0) {
	__m256 vsrc;

	vsrc = _mm256_setzero_ps ();
	vsrc = _mm256_castps128_ps256 (_mm_load_ss (src));
	vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
	vmax = _mm256_max_ps (vmax, vsrc);

	++src;
	--n_samples;
	}

	// Process the aligned portion 16 samples at a time
	while (n_samples >= 16) {
	#ifdef _WIN32
	_mm_prefetch (((char)src + (16 sizeof (float))), _mm_hint (0));
	#else
	__builtin_prefetch (src + (16 * sizeof (float)), 0, 0);
	#endif
	__m256 vsrc1, vsrc2;
	vsrc1 = _mm256_load_ps (src + 0);
	vsrc2 = _mm256_load_ps (src + 8);

	vsrc1 = _mm256_andnot_ps (ABS_MASK, vsrc1);
	vsrc2 = _mm256_andnot_ps (ABS_MASK, vsrc2);

	vmax = _mm256_max_ps (vmax, vsrc1);
	vmax = _mm256_max_ps (vmax, vsrc2);

	src += 16;
	n_samples -= 16;
	}

	// Process the remaining samples 8 at a time
	while (n_samples >= 8) {
	__m256 vsrc;

	vsrc = _mm256_load_ps (src);
	vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
	vmax = _mm256_max_ps (vmax, vsrc);

	src += 8;
	n_samples -= 8;
	}

	// If there are still some left 4 to 8 samples, process them below
	while (n_samples > 0) {
	__m256 vsrc;

	vsrc = _mm256_setzero_ps ();
	vsrc = _mm256_castps128_ps256 (_mm_load_ss (src));
	vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
	vmax = _mm256_max_ps (vmax, vsrc);

	++src;
	--n_samples;
	}

	__m256 tmp;
	tmp = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1));
	vmax = _mm256_max_ps (tmp, vmax);
	tmp = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2));
	vmax = _mm256_max_ps (tmp, vmax);
	tmp = _mm256_permute2f128_ps (vmax, vmax, 1);
	vmax = _mm256_max_ps (tmp, vmax);

	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions
	_mm256_zeroupper ();

	#if defined(__GNUC__) && (__GNUC__ < 5)
	return ((float)&vmax);
	#elif defined(__GNUC__) && (__GNUC__ < 8)
	return vmax[0];
	#else
	return _mm256_cvtss_f32 (vmax);
	#endif
	}

	#elif defined __SSE2__
	#define _HAVE_DSP_COMPUTE_PEAK

	/** Running-peak search (largest absolute sample value) using SSE.
	 *
	 * @param src       samples to scan; any alignment is accepted
	 * @param n_samples number of floats to read from src
	 * @param current   peak found so far
	 * @return the maximum of current and all |src[i]|
	 */
	static float
	dsp_compute_peak (const float* src, uint32_t n_samples, float current)
	{
		// -0.0f is the sign bit alone, so ANDNOT with it clears the sign (absolute value)
		const __m128 ABS_MASK = _mm_set1_ps (-0.0F);

		__m128 vmax = _mm_set1_ps (current);
		__m128 temp;

		// Head: single samples (copied to all lanes) until src reaches a 16-byte boundary
		while (((uintptr_t)src) % 16 != 0 && n_samples > 0) {
			temp = _mm_set1_ps (*src);
			temp = _mm_andnot_ps (ABS_MASK, temp);
			vmax = _mm_max_ps (vmax, temp);
			++src;
			--n_samples;
		}

		// Four aligned quads (16 samples, 64 bytes) per iteration
		while (n_samples >= 16) {
			// Hint the 64 bytes that follow the current position. The offset is applied to a
			// byte pointer: added to a float pointer it would be scaled by sizeof (float) again.
			const char* const ahead = (const char*)src + (16 * sizeof (float));
	#ifdef _WIN32
			_mm_prefetch (ahead, _MM_HINT_NTA);
	#else
			__builtin_prefetch (ahead, 0, 0);
	#endif
			temp = _mm_andnot_ps (ABS_MASK, _mm_load_ps (src + 0));
			vmax = _mm_max_ps (vmax, temp);
			temp = _mm_andnot_ps (ABS_MASK, _mm_load_ps (src + 4));
			vmax = _mm_max_ps (vmax, temp);
			temp = _mm_andnot_ps (ABS_MASK, _mm_load_ps (src + 8));
			vmax = _mm_max_ps (vmax, temp);
			temp = _mm_andnot_ps (ABS_MASK, _mm_load_ps (src + 12));
			vmax = _mm_max_ps (vmax, temp);

			src += 16;
			n_samples -= 16;
		}

		// One aligned quad at a time
		while (n_samples >= 4) {
			temp = _mm_load_ps (src);
			temp = _mm_andnot_ps (ABS_MASK, temp);
			vmax = _mm_max_ps (vmax, temp);
			src += 4;
			n_samples -= 4;
		}

		// Tail: fewer than 4 samples are left
		while (n_samples > 0) {
			temp = _mm_set1_ps (*src);
			temp = _mm_andnot_ps (ABS_MASK, temp);
			vmax = _mm_max_ps (vmax, temp);
			++src;
			--n_samples;
		}

		// Horizontal reduction: swap neighbours, then swap pairs; every lane ends up with the maximum
		temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1));
		vmax = _mm_max_ps (temp, vmax);
		temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2));
		vmax = _mm_max_ps (temp, vmax);

		// Intrinsic lane-0 read: no vector subscripting or pointer cast required
		return _mm_cvtss_f32 (vmax);
	}
	#endif // SSE

	#endif

	#ifndef _HAVE_DSP_COMPUTE_PEAK
	#warning dsp_compute_peak is not accelerated on this architecture

	/** Portable scalar running-peak search, used when no SIMD variant was selected.
	 *
	 * @param buf       samples to scan
	 * @param n_samples number of floats to read from buf
	 * @param current   peak found so far
	 * @return current, raised to the largest |buf[i]| that exceeds it
	 */
	static float
	dsp_compute_peak (const float* const buf, uint32_t n_samples, float current)
	{
		uint32_t idx = 0;

		while (idx < n_samples) {
			const float magnitude = fabsf (buf[idx]);
			++idx;

			// Only a strictly larger magnitude replaces the peak
			if (!(magnitude > current)) {
				continue;
			}
			current = magnitude;
		}
		return current;
	}
	#endif


	// Alignment, in bytes, that main () requests from posix_memalign () for the sample buffer.
	#if (defined(__x86_64__) || defined(_M_X64))
	static const int CPU_CACHE_ALIGN = 64;
	#elif (defined __aarch64__) || (defined __arm__)
	static const int CPU_CACHE_ALIGN = 128; // NOTE(review): sizeof(float32x4_t) is 16 bytes (128 bits); 128 bytes over-aligns -- confirm intent
	#else
	static const int CPU_CACHE_ALIGN = 16;
	#endif

	/** Milliseconds elapsed between two gettimeofday () stamps.
	 *
	 * The fields are subtracted before scaling, so no intermediate value
	 * depends on the absolute epoch time (tv_sec * 1000000 overflows a 32-bit time_t).
	 */
	static float
	elapsed_ms (const struct timeval* start, const struct timeval* stop)
	{
		const double sec  = (double)(stop->tv_sec - start->tv_sec);
		const double usec = (double)(stop->tv_usec - start->tv_usec);

		return (float)(sec * 1000.0 + usec / 1000.0);
	}

	/** Benchmark: run the scalar reference and the selected dsp_compute_peak ()
	 * one million times each over the same 1024-sample buffer and print the
	 * elapsed time and resulting peak of both.
	 *
	 * @return 0 on success, 1 if the sample buffer cannot be allocated
	 */
	int
	main ()
	{
		float* buf       = NULL;
		int    n_samples = 1024;

		// posix_memalign () reports failure through its return value; buf must not be used then
		if (posix_memalign ((void**)&buf, CPU_CACHE_ALIGN, n_samples * sizeof (float)) != 0) {
			fprintf (stderr, "peak_calc: cannot allocate aligned sample buffer\n");
			return 1;
		}

		// Remember what the allocator returned: buf itself may be advanced by the toggle below
		float* const allocation = buf;

	#if 1
		// Fixed seed: the buffer contents are identical from run to run
		srand (5);

		for (int i = 0; i < n_samples; ++i) {
			buf[i] = (2.f * rand () / (float)RAND_MAX) - 1.f;
		}
	#else
		memset (buf, 0, sizeof (float) * n_samples);
		buf[10] = .5;
	#endif

		// Enable to start one sample past the aligned address (exercises the unaligned head loops)
		//buf = &buf[1]; --n_samples;

		struct timeval tv0, tv1;
		float          pk;

	#if 1
		pk = 0;
		gettimeofday (&tv0, NULL);
		for (int i = 0; i < 1000000; ++i) {
			pk = fallback_compute_peak (buf, n_samples, pk);
		}
		gettimeofday (&tv1, NULL);
		printf ("SIM %8.3f ms peak: %f\n", elapsed_ms (&tv0, &tv1), pk);
	#endif

	#if 1
		pk = 0;
		gettimeofday (&tv0, NULL);
		for (int i = 0; i < 1000000; ++i) {
			pk = dsp_compute_peak (buf, n_samples, pk);
		}
		gettimeofday (&tv1, NULL);
		printf ("OPT %8.3f ms peak: %f\n", elapsed_ms (&tv0, &tv1), pk);
	#endif

		free (allocation);
		return 0;
	}