Skip to content

Instantly share code, notes, and snippets.

@gnail737
Created April 22, 2014 12:47
Show Gist options
  • Select an option

  • Save gnail737/8d7f00716fa6e1dbcc87 to your computer and use it in GitHub Desktop.

Select an option

Save gnail737/8d7f00716fa6e1dbcc87 to your computer and use it in GitHub Desktop.
FFT with Neon
/*************************************************************************************************************
Main Entry Point FFT_1D_Neon()
Input Params: 1) float32_t * inOutArray
-- 1D float array with length of 2 to the power of (numOfBits+1) (plus one because each elements contains both real and imaginary component)
2) uint32_t numOfBits
**************************************************************************************************************/
#include <arm_neon.h>
#include <stdlib.h>
#include <math.h>
#define PI 3.141592654f
//given a M (M<32) bits number(0 - 2^M) revert bits in place
void reverseInPlace(uint32_t * input, uint32_t m)
{
uint32_t out = 0, scrape = 0, k;
for (k=m; k>=1; k--) {
//right shift till see the last bit
scrape =(*input)>>(k-1) & 0x00000001;
//out is assembled bit by bit from last to first
out += ((scrape == 0) ? 0 : (1<<(m-k)));
}
*input = out;
}
void FFT_Bit_Reverse(uint32_t *index, uint32_t size, uint32_t bit_length)
{
uint32_t i, t;
for (i=0; i < size; i++) {
t=i;
reverseInPlace(&t, bit_length);
index[i] = t;
}
}
void FFT_Neon_first_stage(float32_t * input_a, uint32_t size, int bits)
{
int col, acol;
float32_t nThRootUnity_x[2], nThRootUnity_y[2];
float32x2_t twiddle0, twiddle1;
float32_t * iter = input_a;
uint32_t * bitReverseIndex = (uint32_t *)malloc(sizeof(uint32_t)*size);
float32x2_t temp0, temp1;
float32x4_t temp;
FFT_Bit_Reverse(bitReverseIndex, size, bits);
//initial stage of algorithm
for (col=0; col<size; col++) {
if (bitReverseIndex[col] != -1) {
if (col != bitReverseIndex[col]) {
acol = bitReverseIndex[col];
temp0 = vld1_f32(iter);
temp1 = vld1_f32(iter + (acol - col)*2);
//swaping elements
vst1_f32(iter, temp1);
vst1_f32(iter + (acol - col)*2, temp0);
bitReverseIndex[acol] = -1;
}
//if col = bitReverseIndex[] then do nothing
}
iter += 2;
}
//now do stage 1 of algorithm
iter = input_a;
for (col=0; col<size; col+=2) {
//take one complex value
float32x2_t two_comps = vld1_f32(iter);
//and its neighbor
float32x2_t two_comps_p = vld1_f32(iter + 2);
two_comps = vadd_f32 (two_comps, two_comps_p);
two_comps_p = vsub_f32(two_comps, two_comps_p);
two_comps_p = vsub_f32(two_comps_p, two_comps_p);
if ((col & 2) != 0) {
float32_t cosCoef = 1;
float32_t sinCoef = 0;
nThRootUnity_x[0] = cosCoef;
nThRootUnity_x[1] = sinCoef;
nThRootUnity_y[0] = -sinCoef;
nThRootUnity_y[1] = cosCoef;
twiddle0 = vld1_f32(nThRootUnity_x);
twiddle1 = vld1_f32(nThRootUnity_y);
float32x2_t mult_temp0 = vmul_f32(twiddle0, two_comps);
float32x2_t mult_temp1 = vmul_f32(twiddle1, two_comps);
two_comps = vadd_f32(mult_temp0, mult_temp1);
nThRootUnity_x[0] = -sinCoef;
nThRootUnity_x[1] = -cosCoef;
nThRootUnity_y[0] = cosCoef;
nThRootUnity_y[1] = -sinCoef;
twiddle0 = vld1_f32(nThRootUnity_x);
twiddle1 = vld1_f32(nThRootUnity_y);
mult_temp0 = vmul_f32(twiddle0, two_comps_p);
mult_temp1 = vmul_f32(twiddle1, two_comps_p);
two_comps_p = vadd_f32(mult_temp0, mult_temp1);
}
//two adjacent vectors can store them in one instruction
temp = vcombine_f32(two_comps, two_comps_p);
vst1q_f32(iter, temp);
iter += 4;
}
free(bitReverseIndex);
}
//for stage >= 2
void FFT_Neon_per_stage(int stage, float32_t * input_a, int size)
{
int col;
float32_t nThRootUnity_x[4], nThRootUnity_y[4];
float32x4_t twiddle0, twiddle1;
float32_t * iter = input_a;
stage = 1<<(stage-1);
//we take two pairs of complex numbers at each iteration
for (col=0; col<size; col+=2) {
if ((col & stage) != 0) { //already processed this
iter += 4;
continue;
}
//take two complex values
float32x4_t two_comps = vld1q_f32(iter);
//and their counter parts
float32x4_t two_comps_p = vld1q_f32(iter + (stage)*2);
//in-place addition/substraction (a0, b0) = (a0+a1, b0+b1)
// (a1, b1) = (a0-a1, b0-b1)
// (a1, b1) = (a1-a1, b1-b1)
two_comps = vaddq_f32 (two_comps, two_comps_p);
two_comps_p = vsubq_f32(two_comps, two_comps_p);
two_comps_p = vsubq_f32(two_comps_p, two_comps_p);
//mult twiddle factor before storing
if (col & (stage<<1)) {
//this part can be optimized, ideally shouldn't use cos sin at all
float32_t cosCoef = cos(-2.0f*PI*(col&((stage<<1)-1))/(float)(stage<<2));
float32_t sinCoef = sin(-2.0f*PI*(col&((stage<<1)-1))/(float)(stage<<2));
nThRootUnity_x[0] = nThRootUnity_x[2] = cosCoef;
nThRootUnity_x[1] = nThRootUnity_x[3] = sinCoef;
nThRootUnity_y[0] = nThRootUnity_y[2] = -sinCoef;
nThRootUnity_y[1] = nThRootUnity_y[3] = cosCoef;
twiddle0 = vld1q_f32(nThRootUnity_x);
twiddle1 = vld1q_f32(nThRootUnity_y);
float32x4_t mult_temp0 = vmulq_f32(twiddle0, two_comps);
float32x4_t mult_temp1 = vmulq_f32(twiddle1, two_comps);
two_comps = vaddq_f32(mult_temp0, mult_temp1);
//float32_t cosCoef_p = -sinCoef;
//float32_t sinCoef_p = -cosCoef;
nThRootUnity_x[0] = nThRootUnity_x[2] = -sinCoef;
nThRootUnity_x[1] = nThRootUnity_x[3] = -cosCoef;
nThRootUnity_y[0] = nThRootUnity_y[2] = cosCoef;
nThRootUnity_y[1] = nThRootUnity_y[3] = -sinCoef;
twiddle0 = vld1q_f32(nThRootUnity_x);
twiddle1 = vld1q_f32(nThRootUnity_y);
mult_temp0 = vmulq_f32(twiddle0, two_comps_p);
mult_temp1 = vmulq_f32(twiddle1, two_comps_p);
two_comps_p = vaddq_f32(mult_temp0, mult_temp1);
}
//store result
vst1q_f32(iter, two_comps);
vst1q_f32((iter + stage*2), two_comps_p);
iter += 4;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment