Created
April 22, 2014 12:47
-
-
Save gnail737/8d7f00716fa6e1dbcc87 to your computer and use it in GitHub Desktop.
FFT with Neon
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /************************************************************************************************************* | |
| Main Entry Point FFT_1D_Neon() | |
| Input Params: 1) float32_t * inOutArray | |
| -- 1D float array with length of 2 to the power of (numOfBits+1) (plus one because each elements contains both real and imaginary component) | |
| 2) uint32_t numOfBits | |
| **************************************************************************************************************/ | |
| #include <arm_neon.h> | |
| #include <stdlib.h> | |
| #include <math.h> | |
| #define PI 3.141592654f | |
| //given a M (M<32) bits number(0 - 2^M) revert bits in place | |
| void reverseInPlace(uint32_t * input, uint32_t m) | |
| { | |
| uint32_t out = 0, scrape = 0, k; | |
| for (k=m; k>=1; k--) { | |
| //right shift till see the last bit | |
| scrape =(*input)>>(k-1) & 0x00000001; | |
| //out is assembled bit by bit from last to first | |
| out += ((scrape == 0) ? 0 : (1<<(m-k))); | |
| } | |
| *input = out; | |
| } | |
| void FFT_Bit_Reverse(uint32_t *index, uint32_t size, uint32_t bit_length) | |
| { | |
| uint32_t i, t; | |
| for (i=0; i < size; i++) { | |
| t=i; | |
| reverseInPlace(&t, bit_length); | |
| index[i] = t; | |
| } | |
| } | |
| void FFT_Neon_first_stage(float32_t * input_a, uint32_t size, int bits) | |
| { | |
| int col, acol; | |
| float32_t nThRootUnity_x[2], nThRootUnity_y[2]; | |
| float32x2_t twiddle0, twiddle1; | |
| float32_t * iter = input_a; | |
| uint32_t * bitReverseIndex = (uint32_t *)malloc(sizeof(uint32_t)*size); | |
| float32x2_t temp0, temp1; | |
| float32x4_t temp; | |
| FFT_Bit_Reverse(bitReverseIndex, size, bits); | |
| //initial stage of algorithm | |
| for (col=0; col<size; col++) { | |
| if (bitReverseIndex[col] != -1) { | |
| if (col != bitReverseIndex[col]) { | |
| acol = bitReverseIndex[col]; | |
| temp0 = vld1_f32(iter); | |
| temp1 = vld1_f32(iter + (acol - col)*2); | |
| //swaping elements | |
| vst1_f32(iter, temp1); | |
| vst1_f32(iter + (acol - col)*2, temp0); | |
| bitReverseIndex[acol] = -1; | |
| } | |
| //if col = bitReverseIndex[] then do nothing | |
| } | |
| iter += 2; | |
| } | |
| //now do stage 1 of algorithm | |
| iter = input_a; | |
| for (col=0; col<size; col+=2) { | |
| //take one complex value | |
| float32x2_t two_comps = vld1_f32(iter); | |
| //and its neighbor | |
| float32x2_t two_comps_p = vld1_f32(iter + 2); | |
| two_comps = vadd_f32 (two_comps, two_comps_p); | |
| two_comps_p = vsub_f32(two_comps, two_comps_p); | |
| two_comps_p = vsub_f32(two_comps_p, two_comps_p); | |
| if ((col & 2) != 0) { | |
| float32_t cosCoef = 1; | |
| float32_t sinCoef = 0; | |
| nThRootUnity_x[0] = cosCoef; | |
| nThRootUnity_x[1] = sinCoef; | |
| nThRootUnity_y[0] = -sinCoef; | |
| nThRootUnity_y[1] = cosCoef; | |
| twiddle0 = vld1_f32(nThRootUnity_x); | |
| twiddle1 = vld1_f32(nThRootUnity_y); | |
| float32x2_t mult_temp0 = vmul_f32(twiddle0, two_comps); | |
| float32x2_t mult_temp1 = vmul_f32(twiddle1, two_comps); | |
| two_comps = vadd_f32(mult_temp0, mult_temp1); | |
| nThRootUnity_x[0] = -sinCoef; | |
| nThRootUnity_x[1] = -cosCoef; | |
| nThRootUnity_y[0] = cosCoef; | |
| nThRootUnity_y[1] = -sinCoef; | |
| twiddle0 = vld1_f32(nThRootUnity_x); | |
| twiddle1 = vld1_f32(nThRootUnity_y); | |
| mult_temp0 = vmul_f32(twiddle0, two_comps_p); | |
| mult_temp1 = vmul_f32(twiddle1, two_comps_p); | |
| two_comps_p = vadd_f32(mult_temp0, mult_temp1); | |
| } | |
| //two adjacent vectors can store them in one instruction | |
| temp = vcombine_f32(two_comps, two_comps_p); | |
| vst1q_f32(iter, temp); | |
| iter += 4; | |
| } | |
| free(bitReverseIndex); | |
| } | |
| //for stage >= 2 | |
| void FFT_Neon_per_stage(int stage, float32_t * input_a, int size) | |
| { | |
| int col; | |
| float32_t nThRootUnity_x[4], nThRootUnity_y[4]; | |
| float32x4_t twiddle0, twiddle1; | |
| float32_t * iter = input_a; | |
| stage = 1<<(stage-1); | |
| //we take two pairs of complex numbers at each iteration | |
| for (col=0; col<size; col+=2) { | |
| if ((col & stage) != 0) { //already processed this | |
| iter += 4; | |
| continue; | |
| } | |
| //take two complex values | |
| float32x4_t two_comps = vld1q_f32(iter); | |
| //and their counter parts | |
| float32x4_t two_comps_p = vld1q_f32(iter + (stage)*2); | |
| //in-place addition/substraction (a0, b0) = (a0+a1, b0+b1) | |
| // (a1, b1) = (a0-a1, b0-b1) | |
| // (a1, b1) = (a1-a1, b1-b1) | |
| two_comps = vaddq_f32 (two_comps, two_comps_p); | |
| two_comps_p = vsubq_f32(two_comps, two_comps_p); | |
| two_comps_p = vsubq_f32(two_comps_p, two_comps_p); | |
| //mult twiddle factor before storing | |
| if (col & (stage<<1)) { | |
| //this part can be optimized, ideally shouldn't use cos sin at all | |
| float32_t cosCoef = cos(-2.0f*PI*(col&((stage<<1)-1))/(float)(stage<<2)); | |
| float32_t sinCoef = sin(-2.0f*PI*(col&((stage<<1)-1))/(float)(stage<<2)); | |
| nThRootUnity_x[0] = nThRootUnity_x[2] = cosCoef; | |
| nThRootUnity_x[1] = nThRootUnity_x[3] = sinCoef; | |
| nThRootUnity_y[0] = nThRootUnity_y[2] = -sinCoef; | |
| nThRootUnity_y[1] = nThRootUnity_y[3] = cosCoef; | |
| twiddle0 = vld1q_f32(nThRootUnity_x); | |
| twiddle1 = vld1q_f32(nThRootUnity_y); | |
| float32x4_t mult_temp0 = vmulq_f32(twiddle0, two_comps); | |
| float32x4_t mult_temp1 = vmulq_f32(twiddle1, two_comps); | |
| two_comps = vaddq_f32(mult_temp0, mult_temp1); | |
| //float32_t cosCoef_p = -sinCoef; | |
| //float32_t sinCoef_p = -cosCoef; | |
| nThRootUnity_x[0] = nThRootUnity_x[2] = -sinCoef; | |
| nThRootUnity_x[1] = nThRootUnity_x[3] = -cosCoef; | |
| nThRootUnity_y[0] = nThRootUnity_y[2] = cosCoef; | |
| nThRootUnity_y[1] = nThRootUnity_y[3] = -sinCoef; | |
| twiddle0 = vld1q_f32(nThRootUnity_x); | |
| twiddle1 = vld1q_f32(nThRootUnity_y); | |
| mult_temp0 = vmulq_f32(twiddle0, two_comps_p); | |
| mult_temp1 = vmulq_f32(twiddle1, two_comps_p); | |
| two_comps_p = vaddq_f32(mult_temp0, mult_temp1); | |
| } | |
| //store result | |
| vst1q_f32(iter, two_comps); | |
| vst1q_f32((iter + stage*2), two_comps_p); | |
| iter += 4; | |
| } | |
| } | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment