ashafq · February 20, 2025 00:22
diff --git a/x1.c b/x1.c
 /*
 * Copyright (C) 2025 Ayan Shafqat <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

 #include <assert.h>
 #include <stdalign.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <string.h>
 #include <xmmintrin.h>

 void process_crossover_kernel(const float *coeff, float *state,
                              const float *input, float *output, size_t frames);

 void process_crossover(const float *coeff, float *state, const float *input,
                       float *output, float *scratch, size_t num_frames,
                       size_t radix) {
  /*
   * `radix` is defined as log2(output_bands). Since, there are four fitlers
   * utilizing 4-SIMD lanes, number of stages is half of radix.
   */

  assert(radix >= 2);
  size_t num_stages = radix / 2;

  /*
   * Preprocess the input buffer:
   * Since the crossover kernel processes a stereo pair (2 samples) per frame,
   * duplicate each mono sample into two lanes.
   */
  for (size_t frame = 0; frame < num_frames; ++frame) {
    scratch[frame * 2 + 0] = input[frame];
    scratch[frame * 2 + 1] = input[frame];
  }

  /*
   * Each biquad filter section is defined by 5 groups of coefficients,
   * with 4 coefficients per SIMD lane (thus 5 * 4 coefficients per set).
   */
  const size_t coeffs_per_set = 5 * 4;

  /*
   * Similarly, each biquad filter uses 2 sets for its state,
   * with 4 state values per set.
   */
  const size_t states_per_set = 2 * 4;

  /*
   * The kernel outputs 4 channels (SIMD lanes) per filter stage, while
   * the input to the kernel has 2 channels (from the duplicated mono input).
   *
   * Hence, the "block" sizes for each set are defined as:
   *   - block_per_set_in: 2 channels * num_frames
   *   - block_per_set_out: 4 channels * num_frames
   */
  const size_t block_per_set_out = 4 * num_frames;
  const size_t block_per_set_in = 2 * num_frames;

  /* Some offset counters to keep track of pointers */
  size_t coeff_offset = 0;
  size_t state_offset = 0;
  size_t scratch_offset = 0;

  /* Num sets start at 1, and then doubles every stage */
  size_t num_sets = 1;

  for (size_t stage = 0; stage < num_stages; ++stage) {
    for (size_t set = 0; set < num_sets; ++set) {
      const float *coeff_ptr = coeff + coeff_offset;
      float *state_ptr = state + state_offset;
      float *scratch_ptr = scratch + scratch_offset;

      /*
       * Compute input pointer for this set:
       * Each set uses a contiguous block of 'block_per_set_in' samples in the
       * scratch buffer.
       */
      const float *input_ptr =
          (const float *)(scratch_ptr + (set * block_per_set_in));

      float *output_ptr = NULL;
      if (stage == num_stages - 1) {
        /*
         * For the final stage, write the filter output directly into the output
         * buffer.
         */
        output_ptr = output + (set * block_per_set_out);
      } else {
        /*
         * For intermediate stages, write the output back into the scratch
         * buffer. This output region is located after the current input block.
         */
        output_ptr = scratch_ptr + (set * block_per_set_out) +
                     (num_sets * block_per_set_in);
      }

      process_crossover_kernel(coeff_ptr, state_ptr, input_ptr, output_ptr,
                               num_frames);
      /* Advance the coefficient and state offsets to point to the next filter
       * set. */
      coeff_offset += coeffs_per_set;
      state_offset += states_per_set;
    }

    /*
     * After processing all sets for this stage, move the scratch offset
     * to the next available block (i.e. after all input blocks of this stage).
     */
    scratch_offset += num_sets * block_per_set_in;
    num_sets *= 2;
  }
 }

 void process_crossover_kernel(const float *coeff, float *state,
                              const float *input, float *output,
                              size_t frames) {
  // Set up pointers
  coeff = __builtin_assume_aligned(coeff, 16);
  state = __builtin_assume_aligned(state, 16);
  input = __builtin_assume_aligned(input, 16);
  output = __builtin_assume_aligned(output, 16);

  // Load coefficients
  const __m128 vb0 = _mm_load_ps(coeff + 0 * 4);
  const __m128 vb1 = _mm_load_ps(coeff + 1 * 4);
  const __m128 vb2 = _mm_load_ps(coeff + 2 * 4);

  const __m128 va1 = _mm_load_ps(coeff + 3 * 4);
  const __m128 va2 = _mm_load_ps(coeff + 4 * 4);

  // Load states
  __m128 vw1 = _mm_load_ps(state + 0 * 4);
  __m128 vw2 = _mm_load_ps(state + 1 * 4);

  // Process samples
  for (size_t i = 0; i < frames; i++) {
    // Load input
    __m128 vx = _mm_setzero_ps();
    vx = _mm_loadl_pi(vx, (const __m64 *)(input + (2 * i)));
    vx = _mm_shuffle_ps(vx, vx, _MM_SHUFFLE(1, 1, 0, 0));

    // Compute output
    // y = b0 * x + w1
    __m128 vb0_vx = _mm_mul_ps(vb0, vx);
    __m128 vy = _mm_add_ps(vb0_vx, vw1);

    // Update state: w1
    // w1 = b1 * x - a1 * y + w2
    __m128 vb1_vx = _mm_mul_ps(vb1, vx);
    __m128 va1_vy = _mm_mul_ps(va1, vy);
    vw1 = _mm_sub_ps(vb1_vx, va1_vy);
    vw1 = _mm_add_ps(vw1, vw2);

    // Update state: w2
    // w2 = b2 * x - a2 * y
    __m128 vb2_vx = _mm_mul_ps(vb2, vx);
    __m128 va2_vy = _mm_mul_ps(va2, vy);
    vw2 = _mm_sub_ps(vb2_vx, va2_vy);

    // Store output to buffer, and update pointer
    _mm_store_ps(output + (4 * i), vy);
  }

  // Store state in state buffer
  _mm_store_ps(state + 0 * 4, vw1);
  _mm_store_ps(state + 1 * 4, vw2);
 }

 /*
 * TESTS
 */
 #define RADIX (2)
 #define BANDS (1 << RADIX)
 #define TOTAL_SETS ((1 << (RADIX + 2)))
 #define FRAMES (8)

 #define NUM_IN (1)
 #define NUM_OUT (BANDS)

 #define NUM_GROUPS 5    // b0, b1, b2, a1, a2
 #define SIMD_CHANNELS 4 // 4 coefficients per group
 #define GROUP_SIZE (NUM_GROUPS * SIMD_CHANNELS)

 #define IN_SIZE (FRAMES)
 #define OUT_SIZE (FRAMES * BANDS)
 #define SCRATCH_SIZE (TOTAL_SETS * 4 * FRAMES)

 // coeff is assumed to have TOTAL_SETS * GROUP_SIZE floats.
 void fill_coefficients(float *coeff, size_t total_sets) {
  for (size_t set = 0; set < total_sets; ++set) {
    for (size_t group = 0; group < NUM_GROUPS; ++group) {
      for (size_t channel = 0; channel < SIMD_CHANNELS; ++channel) {
        size_t offset = set * GROUP_SIZE + group * SIMD_CHANNELS + channel;
        switch (group) {
        case 0: // b0
          coeff[offset] = 1.0F;
          break;
        case 1: // b1
        case 2: // b2
        case 3: // a1
        case 4: // a2
          coeff[offset] = 0.0;
          break;
        default:
          break;
        }
      }
    }
  }
 }

 void print_float_array(const float *arr, size_t len) {
  for (size_t i = 0; i < len; i++) {
    printf("%f ", arr[i]);
  }
  printf("\n");
 }

 void print_output_buffer(const float *arr, size_t num_frames,
                         size_t num_channels) {
  printf("Frame\t");
  for (size_t ch = 0; ch < num_channels; ch++) {
    printf("Ch%zu\t", ch);
  }
  printf("\n");

  for (size_t frame = 0; frame < num_frames; frame++) {
    printf("%zu:\t", frame);
    for (size_t ch = 0; ch < num_channels; ch++) {
      printf("% f\t", arr[frame * num_channels + ch]);
    }
    printf("\n");
  }
 }

 static alignas(16) float coeff[5 * 4 * TOTAL_SETS];
 static alignas(16) float state[2 * 4 * TOTAL_SETS];

 static alignas(16) float input[IN_SIZE];
 static alignas(16) float output[OUT_SIZE];
 static alignas(16) float scratch[SCRATCH_SIZE];

 int main() {
  fill_coefficients(coeff, TOTAL_SETS);

  memset(state, 0, sizeof(state));
  memset(scratch, 0, sizeof(scratch));
  memset(output, 0, sizeof(output));

  for (size_t i = 0; i < FRAMES; i++) {
    input[i] = 1 + i; // (i % 2 == 0) ? 1.F : -1.F;
  }

  process_crossover(coeff, state, input, output, scratch, FRAMES, RADIX);

  puts("input:");
  print_float_array(input, IN_SIZE);
  puts("output:");
  print_output_buffer(output, FRAMES, NUM_OUT);
 }
	/*
	* Copyright (C) 2025 Ayan Shafqat <[email protected]>
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 3, or (at your option)
	* any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program. If not, see <http://www.gnu.org/licenses/>.
	*/

	#include <assert.h>
	#include <stdalign.h>
	#include <stddef.h>
	#include <stdio.h>
	#include <string.h>
	#include <xmmintrin.h>

	void process_crossover_kernel(const float coeff, float state,
	const float input, float output, size_t frames);

	void process_crossover(const float coeff, float state, const float *input,
	float output, float scratch, size_t num_frames,
	size_t radix) {
	/*
	* `radix` is defined as log2(output_bands). Since, there are four fitlers
	* utilizing 4-SIMD lanes, number of stages is half of radix.
	*/

	assert(radix >= 2);
	size_t num_stages = radix / 2;

	/*
	* Preprocess the input buffer:
	* Since the crossover kernel processes a stereo pair (2 samples) per frame,
	* duplicate each mono sample into two lanes.
	*/
	for (size_t frame = 0; frame < num_frames; ++frame) {
	scratch[frame * 2 + 0] = input[frame];
	scratch[frame * 2 + 1] = input[frame];
	}

	/*
	* Each biquad filter section is defined by 5 groups of coefficients,
	* with 4 coefficients per SIMD lane (thus 5 * 4 coefficients per set).
	*/
	const size_t coeffs_per_set = 5 * 4;

	/*
	* Similarly, each biquad filter uses 2 sets for its state,
	* with 4 state values per set.
	*/
	const size_t states_per_set = 2 * 4;

	/*
	* The kernel outputs 4 channels (SIMD lanes) per filter stage, while
	* the input to the kernel has 2 channels (from the duplicated mono input).
	*
	* Hence, the "block" sizes for each set are defined as:
	* - block_per_set_in: 2 channels * num_frames
	* - block_per_set_out: 4 channels * num_frames
	*/
	const size_t block_per_set_out = 4 * num_frames;
	const size_t block_per_set_in = 2 * num_frames;

	/* Some offset counters to keep track of pointers */
	size_t coeff_offset = 0;
	size_t state_offset = 0;
	size_t scratch_offset = 0;

	/* Num sets start at 1, and then doubles every stage */
	size_t num_sets = 1;

	for (size_t stage = 0; stage < num_stages; ++stage) {
	for (size_t set = 0; set < num_sets; ++set) {
	const float *coeff_ptr = coeff + coeff_offset;
	float *state_ptr = state + state_offset;
	float *scratch_ptr = scratch + scratch_offset;

	/*
	* Compute input pointer for this set:
	* Each set uses a contiguous block of 'block_per_set_in' samples in the
	* scratch buffer.
	*/
	const float *input_ptr =
	(const float )(scratch_ptr + (set block_per_set_in));

	float *output_ptr = NULL;
	if (stage == num_stages - 1) {
	/*
	* For the final stage, write the filter output directly into the output
	* buffer.
	*/
	output_ptr = output + (set * block_per_set_out);
	} else {
	/*
	* For intermediate stages, write the output back into the scratch
	* buffer. This output region is located after the current input block.
	*/
	output_ptr = scratch_ptr + (set * block_per_set_out) +
	(num_sets * block_per_set_in);
	}

	process_crossover_kernel(coeff_ptr, state_ptr, input_ptr, output_ptr,
	num_frames);
	/* Advance the coefficient and state offsets to point to the next filter
	* set. */
	coeff_offset += coeffs_per_set;
	state_offset += states_per_set;
	}

	/*
	* After processing all sets for this stage, move the scratch offset
	* to the next available block (i.e. after all input blocks of this stage).
	*/
	scratch_offset += num_sets * block_per_set_in;
	num_sets *= 2;
	}
	}

	void process_crossover_kernel(const float coeff, float state,
	const float input, float output,
	size_t frames) {
	// Set up pointers
	coeff = __builtin_assume_aligned(coeff, 16);
	state = __builtin_assume_aligned(state, 16);
	input = __builtin_assume_aligned(input, 16);
	output = __builtin_assume_aligned(output, 16);

	// Load coefficients
	const __m128 vb0 = _mm_load_ps(coeff + 0 * 4);
	const __m128 vb1 = _mm_load_ps(coeff + 1 * 4);
	const __m128 vb2 = _mm_load_ps(coeff + 2 * 4);

	const __m128 va1 = _mm_load_ps(coeff + 3 * 4);
	const __m128 va2 = _mm_load_ps(coeff + 4 * 4);

	// Load states
	__m128 vw1 = _mm_load_ps(state + 0 * 4);
	__m128 vw2 = _mm_load_ps(state + 1 * 4);

	// Process samples
	for (size_t i = 0; i < frames; i++) {
	// Load input
	__m128 vx = _mm_setzero_ps();
	vx = _mm_loadl_pi(vx, (const __m64 )(input + (2 i)));
	vx = _mm_shuffle_ps(vx, vx, _MM_SHUFFLE(1, 1, 0, 0));

	// Compute output
	// y = b0 * x + w1
	__m128 vb0_vx = _mm_mul_ps(vb0, vx);
	__m128 vy = _mm_add_ps(vb0_vx, vw1);

	// Update state: w1
	// w1 = b1 * x - a1 * y + w2
	__m128 vb1_vx = _mm_mul_ps(vb1, vx);
	__m128 va1_vy = _mm_mul_ps(va1, vy);
	vw1 = _mm_sub_ps(vb1_vx, va1_vy);
	vw1 = _mm_add_ps(vw1, vw2);

	// Update state: w2
	// w2 = b2 * x - a2 * y
	__m128 vb2_vx = _mm_mul_ps(vb2, vx);
	__m128 va2_vy = _mm_mul_ps(va2, vy);
	vw2 = _mm_sub_ps(vb2_vx, va2_vy);

	// Store output to buffer, and update pointer
	_mm_store_ps(output + (4 * i), vy);
	}

	// Store state in state buffer
	_mm_store_ps(state + 0 * 4, vw1);
	_mm_store_ps(state + 1 * 4, vw2);
	}

	/*
	* TESTS
	*/
	#define RADIX (2)
	#define BANDS (1 << RADIX)
	#define TOTAL_SETS ((1 << (RADIX + 2)))
	#define FRAMES (8)

	#define NUM_IN (1)
	#define NUM_OUT (BANDS)

	#define NUM_GROUPS 5 // b0, b1, b2, a1, a2
	#define SIMD_CHANNELS 4 // 4 coefficients per group
	#define GROUP_SIZE (NUM_GROUPS * SIMD_CHANNELS)

	#define IN_SIZE (FRAMES)
	#define OUT_SIZE (FRAMES * BANDS)
	#define SCRATCH_SIZE (TOTAL_SETS * 4 * FRAMES)

	// coeff is assumed to have TOTAL_SETS * GROUP_SIZE floats.
	void fill_coefficients(float *coeff, size_t total_sets) {
	for (size_t set = 0; set < total_sets; ++set) {
	for (size_t group = 0; group < NUM_GROUPS; ++group) {
	for (size_t channel = 0; channel < SIMD_CHANNELS; ++channel) {
	size_t offset = set * GROUP_SIZE + group * SIMD_CHANNELS + channel;
	switch (group) {
	case 0: // b0
	coeff[offset] = 1.0F;
	break;
	case 1: // b1
	case 2: // b2
	case 3: // a1
	case 4: // a2
	coeff[offset] = 0.0;
	break;
	default:
	break;
	}
	}
	}
	}
	}

	void print_float_array(const float *arr, size_t len) {
	for (size_t i = 0; i < len; i++) {
	printf("%f ", arr[i]);
	}
	printf("\n");
	}

	void print_output_buffer(const float *arr, size_t num_frames,
	size_t num_channels) {
	printf("Frame\t");
	for (size_t ch = 0; ch < num_channels; ch++) {
	printf("Ch%zu\t", ch);
	}
	printf("\n");

	for (size_t frame = 0; frame < num_frames; frame++) {
	printf("%zu:\t", frame);
	for (size_t ch = 0; ch < num_channels; ch++) {
	printf("% f\t", arr[frame * num_channels + ch]);
	}
	printf("\n");
	}
	}

	static alignas(16) float coeff[5 * 4 * TOTAL_SETS];
	static alignas(16) float state[2 * 4 * TOTAL_SETS];

	static alignas(16) float input[IN_SIZE];
	static alignas(16) float output[OUT_SIZE];
	static alignas(16) float scratch[SCRATCH_SIZE];

	int main() {
	fill_coefficients(coeff, TOTAL_SETS);

	memset(state, 0, sizeof(state));
	memset(scratch, 0, sizeof(scratch));
	memset(output, 0, sizeof(output));

	for (size_t i = 0; i < FRAMES; i++) {
	input[i] = 1 + i; // (i % 2 == 0) ? 1.F : -1.F;
	}

	process_crossover(coeff, state, input, output, scratch, FRAMES, RADIX);

	puts("input:");
	print_float_array(input, IN_SIZE);
	puts("output:");
	print_output_buffer(output, FRAMES, NUM_OUT);
	}