Skip to content

Instantly share code, notes, and snippets.

@ashafq
Created February 20, 2025 00:22
Show Gist options
  • Save ashafq/cdd23025719faae97a83db945df77288 to your computer and use it in GitHub Desktop.
Save ashafq/cdd23025719faae97a83db945df77288 to your computer and use it in GitHub Desktop.
Biquad Crossover Filter
/*
* Copyright (C) 2025 Ayan Shafqat <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <assert.h>
#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h>
void process_crossover_kernel(const float *coeff, float *state,
const float *input, float *output, size_t frames);
void process_crossover(const float *coeff, float *state, const float *input,
float *output, float *scratch, size_t num_frames,
size_t radix) {
/*
* `radix` is defined as log2(output_bands). Since, there are four fitlers
* utilizing 4-SIMD lanes, number of stages is half of radix.
*/
assert(radix >= 2);
size_t num_stages = radix / 2;
/*
* Preprocess the input buffer:
* Since the crossover kernel processes a stereo pair (2 samples) per frame,
* duplicate each mono sample into two lanes.
*/
for (size_t frame = 0; frame < num_frames; ++frame) {
scratch[frame * 2 + 0] = input[frame];
scratch[frame * 2 + 1] = input[frame];
}
/*
* Each biquad filter section is defined by 5 groups of coefficients,
* with 4 coefficients per SIMD lane (thus 5 * 4 coefficients per set).
*/
const size_t coeffs_per_set = 5 * 4;
/*
* Similarly, each biquad filter uses 2 sets for its state,
* with 4 state values per set.
*/
const size_t states_per_set = 2 * 4;
/*
* The kernel outputs 4 channels (SIMD lanes) per filter stage, while
* the input to the kernel has 2 channels (from the duplicated mono input).
*
* Hence, the "block" sizes for each set are defined as:
* - block_per_set_in: 2 channels * num_frames
* - block_per_set_out: 4 channels * num_frames
*/
const size_t block_per_set_out = 4 * num_frames;
const size_t block_per_set_in = 2 * num_frames;
/* Some offset counters to keep track of pointers */
size_t coeff_offset = 0;
size_t state_offset = 0;
size_t scratch_offset = 0;
/* Num sets start at 1, and then doubles every stage */
size_t num_sets = 1;
for (size_t stage = 0; stage < num_stages; ++stage) {
for (size_t set = 0; set < num_sets; ++set) {
const float *coeff_ptr = coeff + coeff_offset;
float *state_ptr = state + state_offset;
float *scratch_ptr = scratch + scratch_offset;
/*
* Compute input pointer for this set:
* Each set uses a contiguous block of 'block_per_set_in' samples in the
* scratch buffer.
*/
const float *input_ptr =
(const float *)(scratch_ptr + (set * block_per_set_in));
float *output_ptr = NULL;
if (stage == num_stages - 1) {
/*
* For the final stage, write the filter output directly into the output
* buffer.
*/
output_ptr = output + (set * block_per_set_out);
} else {
/*
* For intermediate stages, write the output back into the scratch
* buffer. This output region is located after the current input block.
*/
output_ptr = scratch_ptr + (set * block_per_set_out) +
(num_sets * block_per_set_in);
}
process_crossover_kernel(coeff_ptr, state_ptr, input_ptr, output_ptr,
num_frames);
/* Advance the coefficient and state offsets to point to the next filter
* set. */
coeff_offset += coeffs_per_set;
state_offset += states_per_set;
}
/*
* After processing all sets for this stage, move the scratch offset
* to the next available block (i.e. after all input blocks of this stage).
*/
scratch_offset += num_sets * block_per_set_in;
num_sets *= 2;
}
}
void process_crossover_kernel(const float *coeff, float *state,
const float *input, float *output,
size_t frames) {
// Set up pointers
coeff = __builtin_assume_aligned(coeff, 16);
state = __builtin_assume_aligned(state, 16);
input = __builtin_assume_aligned(input, 16);
output = __builtin_assume_aligned(output, 16);
// Load coefficients
const __m128 vb0 = _mm_load_ps(coeff + 0 * 4);
const __m128 vb1 = _mm_load_ps(coeff + 1 * 4);
const __m128 vb2 = _mm_load_ps(coeff + 2 * 4);
const __m128 va1 = _mm_load_ps(coeff + 3 * 4);
const __m128 va2 = _mm_load_ps(coeff + 4 * 4);
// Load states
__m128 vw1 = _mm_load_ps(state + 0 * 4);
__m128 vw2 = _mm_load_ps(state + 1 * 4);
// Process samples
for (size_t i = 0; i < frames; i++) {
// Load input
__m128 vx = _mm_setzero_ps();
vx = _mm_loadl_pi(vx, (const __m64 *)(input + (2 * i)));
vx = _mm_shuffle_ps(vx, vx, _MM_SHUFFLE(1, 1, 0, 0));
// Compute output
// y = b0 * x + w1
__m128 vb0_vx = _mm_mul_ps(vb0, vx);
__m128 vy = _mm_add_ps(vb0_vx, vw1);
// Update state: w1
// w1 = b1 * x - a1 * y + w2
__m128 vb1_vx = _mm_mul_ps(vb1, vx);
__m128 va1_vy = _mm_mul_ps(va1, vy);
vw1 = _mm_sub_ps(vb1_vx, va1_vy);
vw1 = _mm_add_ps(vw1, vw2);
// Update state: w2
// w2 = b2 * x - a2 * y
__m128 vb2_vx = _mm_mul_ps(vb2, vx);
__m128 va2_vy = _mm_mul_ps(va2, vy);
vw2 = _mm_sub_ps(vb2_vx, va2_vy);
// Store output to buffer, and update pointer
_mm_store_ps(output + (4 * i), vy);
}
// Store state in state buffer
_mm_store_ps(state + 0 * 4, vw1);
_mm_store_ps(state + 1 * 4, vw2);
}
/*
* TESTS
*/
#define RADIX (2)
#define BANDS (1 << RADIX)
#define TOTAL_SETS ((1 << (RADIX + 2)))
#define FRAMES (8)
#define NUM_IN (1)
#define NUM_OUT (BANDS)
#define NUM_GROUPS 5 // b0, b1, b2, a1, a2
#define SIMD_CHANNELS 4 // 4 coefficients per group
#define GROUP_SIZE (NUM_GROUPS * SIMD_CHANNELS)
#define IN_SIZE (FRAMES)
#define OUT_SIZE (FRAMES * BANDS)
#define SCRATCH_SIZE (TOTAL_SETS * 4 * FRAMES)
// coeff is assumed to have TOTAL_SETS * GROUP_SIZE floats.
void fill_coefficients(float *coeff, size_t total_sets) {
for (size_t set = 0; set < total_sets; ++set) {
for (size_t group = 0; group < NUM_GROUPS; ++group) {
for (size_t channel = 0; channel < SIMD_CHANNELS; ++channel) {
size_t offset = set * GROUP_SIZE + group * SIMD_CHANNELS + channel;
switch (group) {
case 0: // b0
coeff[offset] = 1.0F;
break;
case 1: // b1
case 2: // b2
case 3: // a1
case 4: // a2
coeff[offset] = 0.0;
break;
default:
break;
}
}
}
}
}
void print_float_array(const float *arr, size_t len) {
for (size_t i = 0; i < len; i++) {
printf("%f ", arr[i]);
}
printf("\n");
}
void print_output_buffer(const float *arr, size_t num_frames,
size_t num_channels) {
printf("Frame\t");
for (size_t ch = 0; ch < num_channels; ch++) {
printf("Ch%zu\t", ch);
}
printf("\n");
for (size_t frame = 0; frame < num_frames; frame++) {
printf("%zu:\t", frame);
for (size_t ch = 0; ch < num_channels; ch++) {
printf("% f\t", arr[frame * num_channels + ch]);
}
printf("\n");
}
}
static alignas(16) float coeff[5 * 4 * TOTAL_SETS];
static alignas(16) float state[2 * 4 * TOTAL_SETS];
static alignas(16) float input[IN_SIZE];
static alignas(16) float output[OUT_SIZE];
static alignas(16) float scratch[SCRATCH_SIZE];
int main() {
fill_coefficients(coeff, TOTAL_SETS);
memset(state, 0, sizeof(state));
memset(scratch, 0, sizeof(scratch));
memset(output, 0, sizeof(output));
for (size_t i = 0; i < FRAMES; i++) {
input[i] = 1 + i; // (i % 2 == 0) ? 1.F : -1.F;
}
process_crossover(coeff, state, input, output, scratch, FRAMES, RADIX);
puts("input:");
print_float_array(input, IN_SIZE);
puts("output:");
print_output_buffer(output, FRAMES, NUM_OUT);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment