-
-
Save ashafq/cdd23025719faae97a83db945df77288 to your computer and use it in GitHub Desktop.
Biquad Crossover Filter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright (C) 2025 Ayan Shafqat <[email protected]> | |
* | |
* This program is free software; you can redistribute it and/or modify | |
* it under the terms of the GNU General Public License as published by | |
* the Free Software Foundation; either version 3, or (at your option) | |
* any later version. | |
* | |
* This program is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
* GNU General Public License for more details. | |
* | |
* You should have received a copy of the GNU General Public License | |
* along with this program. If not, see <http://www.gnu.org/licenses/>. | |
*/ | |
#include <assert.h> | |
#include <stdalign.h> | |
#include <stddef.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <xmmintrin.h> | |
void process_crossover_kernel(const float *coeff, float *state, | |
const float *input, float *output, size_t frames); | |
void process_crossover(const float *coeff, float *state, const float *input, | |
float *output, float *scratch, size_t num_frames, | |
size_t radix) { | |
/* | |
* `radix` is defined as log2(output_bands). Since, there are four fitlers | |
* utilizing 4-SIMD lanes, number of stages is half of radix. | |
*/ | |
assert(radix >= 2); | |
size_t num_stages = radix / 2; | |
/* | |
* Preprocess the input buffer: | |
* Since the crossover kernel processes a stereo pair (2 samples) per frame, | |
* duplicate each mono sample into two lanes. | |
*/ | |
for (size_t frame = 0; frame < num_frames; ++frame) { | |
scratch[frame * 2 + 0] = input[frame]; | |
scratch[frame * 2 + 1] = input[frame]; | |
} | |
/* | |
* Each biquad filter section is defined by 5 groups of coefficients, | |
* with 4 coefficients per SIMD lane (thus 5 * 4 coefficients per set). | |
*/ | |
const size_t coeffs_per_set = 5 * 4; | |
/* | |
* Similarly, each biquad filter uses 2 sets for its state, | |
* with 4 state values per set. | |
*/ | |
const size_t states_per_set = 2 * 4; | |
/* | |
* The kernel outputs 4 channels (SIMD lanes) per filter stage, while | |
* the input to the kernel has 2 channels (from the duplicated mono input). | |
* | |
* Hence, the "block" sizes for each set are defined as: | |
* - block_per_set_in: 2 channels * num_frames | |
* - block_per_set_out: 4 channels * num_frames | |
*/ | |
const size_t block_per_set_out = 4 * num_frames; | |
const size_t block_per_set_in = 2 * num_frames; | |
/* Some offset counters to keep track of pointers */ | |
size_t coeff_offset = 0; | |
size_t state_offset = 0; | |
size_t scratch_offset = 0; | |
/* Num sets start at 1, and then doubles every stage */ | |
size_t num_sets = 1; | |
for (size_t stage = 0; stage < num_stages; ++stage) { | |
for (size_t set = 0; set < num_sets; ++set) { | |
const float *coeff_ptr = coeff + coeff_offset; | |
float *state_ptr = state + state_offset; | |
float *scratch_ptr = scratch + scratch_offset; | |
/* | |
* Compute input pointer for this set: | |
* Each set uses a contiguous block of 'block_per_set_in' samples in the | |
* scratch buffer. | |
*/ | |
const float *input_ptr = | |
(const float *)(scratch_ptr + (set * block_per_set_in)); | |
float *output_ptr = NULL; | |
if (stage == num_stages - 1) { | |
/* | |
* For the final stage, write the filter output directly into the output | |
* buffer. | |
*/ | |
output_ptr = output + (set * block_per_set_out); | |
} else { | |
/* | |
* For intermediate stages, write the output back into the scratch | |
* buffer. This output region is located after the current input block. | |
*/ | |
output_ptr = scratch_ptr + (set * block_per_set_out) + | |
(num_sets * block_per_set_in); | |
} | |
process_crossover_kernel(coeff_ptr, state_ptr, input_ptr, output_ptr, | |
num_frames); | |
/* Advance the coefficient and state offsets to point to the next filter | |
* set. */ | |
coeff_offset += coeffs_per_set; | |
state_offset += states_per_set; | |
} | |
/* | |
* After processing all sets for this stage, move the scratch offset | |
* to the next available block (i.e. after all input blocks of this stage). | |
*/ | |
scratch_offset += num_sets * block_per_set_in; | |
num_sets *= 2; | |
} | |
} | |
void process_crossover_kernel(const float *coeff, float *state, | |
const float *input, float *output, | |
size_t frames) { | |
// Set up pointers | |
coeff = __builtin_assume_aligned(coeff, 16); | |
state = __builtin_assume_aligned(state, 16); | |
input = __builtin_assume_aligned(input, 16); | |
output = __builtin_assume_aligned(output, 16); | |
// Load coefficients | |
const __m128 vb0 = _mm_load_ps(coeff + 0 * 4); | |
const __m128 vb1 = _mm_load_ps(coeff + 1 * 4); | |
const __m128 vb2 = _mm_load_ps(coeff + 2 * 4); | |
const __m128 va1 = _mm_load_ps(coeff + 3 * 4); | |
const __m128 va2 = _mm_load_ps(coeff + 4 * 4); | |
// Load states | |
__m128 vw1 = _mm_load_ps(state + 0 * 4); | |
__m128 vw2 = _mm_load_ps(state + 1 * 4); | |
// Process samples | |
for (size_t i = 0; i < frames; i++) { | |
// Load input | |
__m128 vx = _mm_setzero_ps(); | |
vx = _mm_loadl_pi(vx, (const __m64 *)(input + (2 * i))); | |
vx = _mm_shuffle_ps(vx, vx, _MM_SHUFFLE(1, 1, 0, 0)); | |
// Compute output | |
// y = b0 * x + w1 | |
__m128 vb0_vx = _mm_mul_ps(vb0, vx); | |
__m128 vy = _mm_add_ps(vb0_vx, vw1); | |
// Update state: w1 | |
// w1 = b1 * x - a1 * y + w2 | |
__m128 vb1_vx = _mm_mul_ps(vb1, vx); | |
__m128 va1_vy = _mm_mul_ps(va1, vy); | |
vw1 = _mm_sub_ps(vb1_vx, va1_vy); | |
vw1 = _mm_add_ps(vw1, vw2); | |
// Update state: w2 | |
// w2 = b2 * x - a2 * y | |
__m128 vb2_vx = _mm_mul_ps(vb2, vx); | |
__m128 va2_vy = _mm_mul_ps(va2, vy); | |
vw2 = _mm_sub_ps(vb2_vx, va2_vy); | |
// Store output to buffer, and update pointer | |
_mm_store_ps(output + (4 * i), vy); | |
} | |
// Store state in state buffer | |
_mm_store_ps(state + 0 * 4, vw1); | |
_mm_store_ps(state + 1 * 4, vw2); | |
} | |
/* | |
* TESTS | |
*/ | |
#define RADIX (2) | |
#define BANDS (1 << RADIX) | |
#define TOTAL_SETS ((1 << (RADIX + 2))) | |
#define FRAMES (8) | |
#define NUM_IN (1) | |
#define NUM_OUT (BANDS) | |
#define NUM_GROUPS 5 // b0, b1, b2, a1, a2 | |
#define SIMD_CHANNELS 4 // 4 coefficients per group | |
#define GROUP_SIZE (NUM_GROUPS * SIMD_CHANNELS) | |
#define IN_SIZE (FRAMES) | |
#define OUT_SIZE (FRAMES * BANDS) | |
#define SCRATCH_SIZE (TOTAL_SETS * 4 * FRAMES) | |
// coeff is assumed to have TOTAL_SETS * GROUP_SIZE floats. | |
void fill_coefficients(float *coeff, size_t total_sets) { | |
for (size_t set = 0; set < total_sets; ++set) { | |
for (size_t group = 0; group < NUM_GROUPS; ++group) { | |
for (size_t channel = 0; channel < SIMD_CHANNELS; ++channel) { | |
size_t offset = set * GROUP_SIZE + group * SIMD_CHANNELS + channel; | |
switch (group) { | |
case 0: // b0 | |
coeff[offset] = 1.0F; | |
break; | |
case 1: // b1 | |
case 2: // b2 | |
case 3: // a1 | |
case 4: // a2 | |
coeff[offset] = 0.0; | |
break; | |
default: | |
break; | |
} | |
} | |
} | |
} | |
} | |
void print_float_array(const float *arr, size_t len) { | |
for (size_t i = 0; i < len; i++) { | |
printf("%f ", arr[i]); | |
} | |
printf("\n"); | |
} | |
void print_output_buffer(const float *arr, size_t num_frames, | |
size_t num_channels) { | |
printf("Frame\t"); | |
for (size_t ch = 0; ch < num_channels; ch++) { | |
printf("Ch%zu\t", ch); | |
} | |
printf("\n"); | |
for (size_t frame = 0; frame < num_frames; frame++) { | |
printf("%zu:\t", frame); | |
for (size_t ch = 0; ch < num_channels; ch++) { | |
printf("% f\t", arr[frame * num_channels + ch]); | |
} | |
printf("\n"); | |
} | |
} | |
static alignas(16) float coeff[5 * 4 * TOTAL_SETS]; | |
static alignas(16) float state[2 * 4 * TOTAL_SETS]; | |
static alignas(16) float input[IN_SIZE]; | |
static alignas(16) float output[OUT_SIZE]; | |
static alignas(16) float scratch[SCRATCH_SIZE]; | |
int main() { | |
fill_coefficients(coeff, TOTAL_SETS); | |
memset(state, 0, sizeof(state)); | |
memset(scratch, 0, sizeof(scratch)); | |
memset(output, 0, sizeof(output)); | |
for (size_t i = 0; i < FRAMES; i++) { | |
input[i] = 1 + i; // (i % 2 == 0) ? 1.F : -1.F; | |
} | |
process_crossover(coeff, state, input, output, scratch, FRAMES, RADIX); | |
puts("input:"); | |
print_float_array(input, IN_SIZE); | |
puts("output:"); | |
print_output_buffer(output, FRAMES, NUM_OUT); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment