gnail737 · April 22, 2014 12:47
diff --git a/gistfile1.txt b/gistfile1.txt
 /*************************************************************************************************************
 Main Entry Point FFT_1D_Neon()
 Input Params: 1) float32_t * inOutArray 
                -- 1D float array with length of 2 to the power of (numOfBits+1) (plus one because each elements contains both real and imaginary component) 
               2) uint32_t numOfBits


 **************************************************************************************************************/

 #include <arm_neon.h>
 #include <stdlib.h>
 #include <math.h>

 #define  PI 3.141592654f
 //given a M (M<32) bits number(0 - 2^M) revert bits in place
 void reverseInPlace(uint32_t * input, uint32_t m)
 {
 	uint32_t out = 0, scrape = 0, k;
 	for (k=m; k>=1; k--) {
 		//right shift till see the last bit
 		scrape =(*input)>>(k-1) & 0x00000001;
 		//out is assembled bit by bit from last to first
 	    out += ((scrape == 0) ?  0 : (1<<(m-k)));
 	}

 	*input = out;
 }

 void FFT_Bit_Reverse(uint32_t *index, uint32_t size, uint32_t bit_length)
 {
 	uint32_t i, t;
 	for (i=0; i < size; i++) {
 		t=i;
 		reverseInPlace(&t, bit_length);
 		index[i] = t;
 	}
 }

 void FFT_Neon_first_stage(float32_t * input_a, uint32_t size, int bits)
 {
 	int col, acol;
 	float32_t nThRootUnity_x[2], nThRootUnity_y[2];
 	float32x2_t twiddle0, twiddle1;
 	float32_t * iter = input_a;
 	uint32_t * bitReverseIndex = (uint32_t *)malloc(sizeof(uint32_t)*size);
 	float32x2_t temp0, temp1;
 	float32x4_t temp;
 	FFT_Bit_Reverse(bitReverseIndex, size, bits);

 	//initial stage of algorithm
 	for (col=0; col<size; col++) {
 		if (bitReverseIndex[col] != -1) {
 			if (col != bitReverseIndex[col]) {
 				acol = bitReverseIndex[col];
 				temp0 = vld1_f32(iter);
 				temp1 = vld1_f32(iter + (acol - col)*2);
 				//swaping elements
 				vst1_f32(iter, temp1);
 				vst1_f32(iter + (acol - col)*2, temp0);
 				bitReverseIndex[acol] = -1;
 			}
            //if col = bitReverseIndex[] then do nothing
 		}

 		iter += 2;
 	}

 	//now do stage 1 of algorithm
 	iter = input_a;
 	for (col=0; col<size; col+=2) {
 		//take one complex value
 		float32x2_t two_comps = vld1_f32(iter);
 		//and its neighbor
 		float32x2_t two_comps_p = vld1_f32(iter + 2);

 		two_comps = vadd_f32 (two_comps, two_comps_p);
 		two_comps_p = vsub_f32(two_comps, two_comps_p);
 		two_comps_p = vsub_f32(two_comps_p, two_comps_p);

 		if ((col & 2) != 0) {
 			float32_t cosCoef = 1;
 			float32_t sinCoef = 0;

 			nThRootUnity_x[0] = cosCoef;
 			nThRootUnity_x[1] = sinCoef;
 			nThRootUnity_y[0] = -sinCoef;
 		    nThRootUnity_y[1] = cosCoef;

 		    twiddle0 = vld1_f32(nThRootUnity_x);
 		    twiddle1 = vld1_f32(nThRootUnity_y);

 		    float32x2_t mult_temp0 = vmul_f32(twiddle0, two_comps);
 		    float32x2_t mult_temp1 = vmul_f32(twiddle1, two_comps);
 		    two_comps = vadd_f32(mult_temp0, mult_temp1);

 		    nThRootUnity_x[0] = -sinCoef;
 		    nThRootUnity_x[1] = -cosCoef;
 		    nThRootUnity_y[0] = cosCoef;
 		    nThRootUnity_y[1] = -sinCoef;

 		    twiddle0 = vld1_f32(nThRootUnity_x);
 		    twiddle1 = vld1_f32(nThRootUnity_y);

 		    mult_temp0 = vmul_f32(twiddle0, two_comps_p);
 		    mult_temp1 = vmul_f32(twiddle1, two_comps_p);
 		    two_comps_p = vadd_f32(mult_temp0, mult_temp1);
 		}
 		//two adjacent vectors can store them in one instruction
 		temp = vcombine_f32(two_comps, two_comps_p);
 		vst1q_f32(iter, temp);

 		iter += 4;
 	}

 	free(bitReverseIndex);
 }
 //for stage >= 2
 void FFT_Neon_per_stage(int stage, float32_t * input_a, int size)
 {
 	int col;
 	float32_t nThRootUnity_x[4], nThRootUnity_y[4];
 	float32x4_t twiddle0, twiddle1;
 	float32_t * iter = input_a;
 	stage = 1<<(stage-1);
 	//we take two pairs of complex numbers at each iteration
 	for (col=0; col<size; col+=2) {
 		if ((col & stage) != 0) { //already processed this
 			iter += 4;
 			continue;
 		}
 		//take two complex values
 		float32x4_t two_comps = vld1q_f32(iter);
 		//and their counter parts
 		float32x4_t two_comps_p = vld1q_f32(iter + (stage)*2);

 		//in-place addition/substraction (a0, b0) = (a0+a1, b0+b1)
 		//                               (a1, b1) = (a0-a1, b0-b1)
 		//                               (a1, b1) = (a1-a1, b1-b1)
 		two_comps = vaddq_f32 (two_comps, two_comps_p);
 		two_comps_p = vsubq_f32(two_comps, two_comps_p);
 		two_comps_p = vsubq_f32(two_comps_p, two_comps_p);

 		//mult twiddle factor before storing
 		if (col & (stage<<1)) {
 			//this part can be optimized, ideally shouldn't use cos sin at all
 			float32_t cosCoef = cos(-2.0f*PI*(col&((stage<<1)-1))/(float)(stage<<2));
 			float32_t sinCoef = sin(-2.0f*PI*(col&((stage<<1)-1))/(float)(stage<<2));

 			nThRootUnity_x[0] = nThRootUnity_x[2] = cosCoef;
 			nThRootUnity_x[1] = nThRootUnity_x[3] = sinCoef;
 			nThRootUnity_y[0] = nThRootUnity_y[2] = -sinCoef;
 			nThRootUnity_y[1] = nThRootUnity_y[3] = cosCoef;

 			twiddle0 = vld1q_f32(nThRootUnity_x);
 			twiddle1 = vld1q_f32(nThRootUnity_y);

 			float32x4_t mult_temp0 = vmulq_f32(twiddle0, two_comps);
 			float32x4_t mult_temp1 = vmulq_f32(twiddle1, two_comps);
 			two_comps = vaddq_f32(mult_temp0, mult_temp1);

 			//float32_t cosCoef_p = -sinCoef;
 			//float32_t sinCoef_p = -cosCoef;
 			nThRootUnity_x[0] = nThRootUnity_x[2] = -sinCoef;
 			nThRootUnity_x[1] = nThRootUnity_x[3] = -cosCoef;
 			nThRootUnity_y[0] = nThRootUnity_y[2] = cosCoef;
 			nThRootUnity_y[1] = nThRootUnity_y[3] = -sinCoef;

 			twiddle0 = vld1q_f32(nThRootUnity_x);
 			twiddle1 = vld1q_f32(nThRootUnity_y);

 			mult_temp0 = vmulq_f32(twiddle0, two_comps_p);
 			mult_temp1 = vmulq_f32(twiddle1, two_comps_p);
 			two_comps_p = vaddq_f32(mult_temp0, mult_temp1);
 		}

 		//store result
 		vst1q_f32(iter, two_comps);
 	    vst1q_f32((iter + stage*2), two_comps_p);
 	    iter += 4;
 	}
 }
	/*************************************************************************************************************
	Main Entry Point FFT_1D_Neon()
	Input Params: 1) float32_t * inOutArray
	-- 1D float array with length of 2 to the power of (numOfBits+1) (plus one because each elements contains both real and imaginary component)
	2) uint32_t numOfBits


	**************************************************************************************************************/

	#include <arm_neon.h>
	#include <stdlib.h>
	#include <math.h>

	#define PI 3.141592654f
	//given a M (M<32) bits number(0 - 2^M) revert bits in place
	void reverseInPlace(uint32_t * input, uint32_t m)
	{
	uint32_t out = 0, scrape = 0, k;
	for (k=m; k>=1; k--) {
	//right shift till see the last bit
	scrape =(*input)>>(k-1) & 0x00000001;
	//out is assembled bit by bit from last to first
	out += ((scrape == 0) ? 0 : (1<<(m-k)));
	}

	*input = out;
	}

	void FFT_Bit_Reverse(uint32_t *index, uint32_t size, uint32_t bit_length)
	{
	uint32_t i, t;
	for (i=0; i < size; i++) {
	t=i;
	reverseInPlace(&t, bit_length);
	index[i] = t;
	}
	}

	void FFT_Neon_first_stage(float32_t * input_a, uint32_t size, int bits)
	{
	int col, acol;
	float32_t nThRootUnity_x[2], nThRootUnity_y[2];
	float32x2_t twiddle0, twiddle1;
	float32_t * iter = input_a;
	uint32_t * bitReverseIndex = (uint32_t )malloc(sizeof(uint32_t)size);
	float32x2_t temp0, temp1;
	float32x4_t temp;
	FFT_Bit_Reverse(bitReverseIndex, size, bits);

	//initial stage of algorithm
	for (col=0; col<size; col++) {
	if (bitReverseIndex[col] != -1) {
	if (col != bitReverseIndex[col]) {
	acol = bitReverseIndex[col];
	temp0 = vld1_f32(iter);
	temp1 = vld1_f32(iter + (acol - col)*2);
	//swaping elements
	vst1_f32(iter, temp1);
	vst1_f32(iter + (acol - col)*2, temp0);
	bitReverseIndex[acol] = -1;
	}
	//if col = bitReverseIndex[] then do nothing
	}

	iter += 2;
	}

	//now do stage 1 of algorithm
	iter = input_a;
	for (col=0; col<size; col+=2) {
	//take one complex value
	float32x2_t two_comps = vld1_f32(iter);
	//and its neighbor
	float32x2_t two_comps_p = vld1_f32(iter + 2);

	two_comps = vadd_f32 (two_comps, two_comps_p);
	two_comps_p = vsub_f32(two_comps, two_comps_p);
	two_comps_p = vsub_f32(two_comps_p, two_comps_p);

	if ((col & 2) != 0) {
	float32_t cosCoef = 1;
	float32_t sinCoef = 0;

	nThRootUnity_x[0] = cosCoef;
	nThRootUnity_x[1] = sinCoef;
	nThRootUnity_y[0] = -sinCoef;
	nThRootUnity_y[1] = cosCoef;

	twiddle0 = vld1_f32(nThRootUnity_x);
	twiddle1 = vld1_f32(nThRootUnity_y);

	float32x2_t mult_temp0 = vmul_f32(twiddle0, two_comps);
	float32x2_t mult_temp1 = vmul_f32(twiddle1, two_comps);
	two_comps = vadd_f32(mult_temp0, mult_temp1);

	nThRootUnity_x[0] = -sinCoef;
	nThRootUnity_x[1] = -cosCoef;
	nThRootUnity_y[0] = cosCoef;
	nThRootUnity_y[1] = -sinCoef;

	twiddle0 = vld1_f32(nThRootUnity_x);
	twiddle1 = vld1_f32(nThRootUnity_y);

	mult_temp0 = vmul_f32(twiddle0, two_comps_p);
	mult_temp1 = vmul_f32(twiddle1, two_comps_p);
	two_comps_p = vadd_f32(mult_temp0, mult_temp1);
	}
	//two adjacent vectors can store them in one instruction
	temp = vcombine_f32(two_comps, two_comps_p);
	vst1q_f32(iter, temp);

	iter += 4;
	}

	free(bitReverseIndex);
	}
	//for stage >= 2
	void FFT_Neon_per_stage(int stage, float32_t * input_a, int size)
	{
	int col;
	float32_t nThRootUnity_x[4], nThRootUnity_y[4];
	float32x4_t twiddle0, twiddle1;
	float32_t * iter = input_a;
	stage = 1<<(stage-1);
	//we take two pairs of complex numbers at each iteration
	for (col=0; col<size; col+=2) {
	if ((col & stage) != 0) { //already processed this
	iter += 4;
	continue;
	}
	//take two complex values
	float32x4_t two_comps = vld1q_f32(iter);
	//and their counter parts
	float32x4_t two_comps_p = vld1q_f32(iter + (stage)*2);

	//in-place addition/substraction (a0, b0) = (a0+a1, b0+b1)
	// (a1, b1) = (a0-a1, b0-b1)
	// (a1, b1) = (a1-a1, b1-b1)
	two_comps = vaddq_f32 (two_comps, two_comps_p);
	two_comps_p = vsubq_f32(two_comps, two_comps_p);
	two_comps_p = vsubq_f32(two_comps_p, two_comps_p);

	//mult twiddle factor before storing
	if (col & (stage<<1)) {
	//this part can be optimized, ideally shouldn't use cos sin at all
	float32_t cosCoef = cos(-2.0fPI(col&((stage<<1)-1))/(float)(stage<<2));
	float32_t sinCoef = sin(-2.0fPI(col&((stage<<1)-1))/(float)(stage<<2));

	nThRootUnity_x[0] = nThRootUnity_x[2] = cosCoef;
	nThRootUnity_x[1] = nThRootUnity_x[3] = sinCoef;
	nThRootUnity_y[0] = nThRootUnity_y[2] = -sinCoef;
	nThRootUnity_y[1] = nThRootUnity_y[3] = cosCoef;

	twiddle0 = vld1q_f32(nThRootUnity_x);
	twiddle1 = vld1q_f32(nThRootUnity_y);

	float32x4_t mult_temp0 = vmulq_f32(twiddle0, two_comps);
	float32x4_t mult_temp1 = vmulq_f32(twiddle1, two_comps);
	two_comps = vaddq_f32(mult_temp0, mult_temp1);

	//float32_t cosCoef_p = -sinCoef;
	//float32_t sinCoef_p = -cosCoef;
	nThRootUnity_x[0] = nThRootUnity_x[2] = -sinCoef;
	nThRootUnity_x[1] = nThRootUnity_x[3] = -cosCoef;
	nThRootUnity_y[0] = nThRootUnity_y[2] = cosCoef;
	nThRootUnity_y[1] = nThRootUnity_y[3] = -sinCoef;

	twiddle0 = vld1q_f32(nThRootUnity_x);
	twiddle1 = vld1q_f32(nThRootUnity_y);

	mult_temp0 = vmulq_f32(twiddle0, two_comps_p);
	mult_temp1 = vmulq_f32(twiddle1, two_comps_p);
	two_comps_p = vaddq_f32(mult_temp0, mult_temp1);
	}

	//store result
	vst1q_f32(iter, two_comps);
	vst1q_f32((iter + stage*2), two_comps_p);
	iter += 4;
	}
	}
No results found