Skip to content

Instantly share code, notes, and snippets.

@vermorel
Last active April 10, 2024 12:31
Show Gist options
  • Save vermorel/7ad35212df44f3a79bca8ab5fe8e7622 to your computer and use it in GitHub Desktop.
Save vermorel/7ad35212df44f3a79bca8ab5fe8e7622 to your computer and use it in GitHub Desktop.
Fast 1D convolution with AVX
// Fast 1D convolution with AXV
// By Joannes Vermorel, Lokad, January 2018
// Released under MIT license
#include <string.h>
#include <stdio.h>
#include <malloc.h>
/* A simple implementation of a 1D convolution that just iterates over
* scalar values of the input array.
*
* Returns the same as numpy.convolve(in, kernel, mode='full')
* */
extern "C" _declspec(dllexport) int convolve_naive(float* in, int input_length,
float* kernel, int kernel_length, float* out)
{
for (int i = 0; i < input_length + kernel_length - 1; i++) {
out[i] = 0.0;
int startk = i >= input_length ? i - input_length + 1 : 0;
int endk = i < kernel_length ? i : kernel_length - 1;
for (int k = startk; k <= endk; k++) {
out[i] += in[i - k] * kernel[k];
}
}
return 0;
}
/* Vectorize the algorithm to compute 4 output samples in parallel.
*
* Each kernel value is repeated 4 times, which can then be used on
* 4 input samples in parallel. Stepping over these as in naive
* means that we get 4 output samples for each inner kernel loop.
*
*/
extern "C" _declspec(dllexport) int convolve_sse(float* in, int input_length,
float* kernel, int kernel_length, float* out)
{
float* in_padded = (float*)(_alloca(sizeof(float) * (input_length + 8)));
__m128* kernel_many = (__m128*)(_alloca(16 * kernel_length));
__m128 block;
__m128 prod;
__m128 acc;
// surrounding zeroes, before and after
_mm_storeu_ps(in_padded, _mm_set1_ps(0));
memcpy(&in_padded[4], in, sizeof(float) * input_length);
_mm_storeu_ps(in_padded + input_length + 4, _mm_set1_ps(0));
// Repeat each kernal value across a 4-vector
int i;
for (i = 0; i < kernel_length; i++) {
kernel_many[i] = _mm_set1_ps(kernel[i]); // broadcast
}
for (i = 0; i < input_length + kernel_length - 4; i += 4) {
// Zero the accumulator
acc = _mm_setzero_ps();
int startk = i > (input_length - 1) ? i - (input_length - 1) : 0;
int endk = (i + 3) < kernel_length ? (i + 3) : kernel_length - 1;
/* After this loop, we have computed 4 output samples
* for the price of one.
* */
for (int k = startk; k <= endk; k++) {
// Load 4-float data block. These needs to be an unaliged
// load (_mm_loadu_ps) as we step one sample at a time.
block = _mm_loadu_ps(in_padded + 4 + i - k);
prod = _mm_mul_ps(block, kernel_many[k]);
// Accumulate the 4 parallel values
acc = _mm_add_ps(acc, prod);
}
_mm_storeu_ps(out + i, acc);
}
// Left-overs
for (; i < input_length + kernel_length - 1; i++) {
out[i] = 0.0;
int startk = i >= input_length ? i - input_length + 1 : 0;
int endk = i < kernel_length ? i : kernel_length - 1;
for (int k = startk; k <= endk; k++) {
out[i] += in[i - k] * kernel[k];
}
}
return 0;
}
/* Vectorize the algorithm to compute 8 output samples in parallel.
*
* Each kernel value is repeated 8 times, which can then be used on
* 8 input samples in parallel. Stepping over these as in naive
* means that we get 8 output samples for each inner kernel loop.
*
*/
extern "C" _declspec(dllexport) int convolve_avx(float* in, int input_length,
float* kernel, int kernel_length, float* out)
{
float* in_padded = (float*)(_alloca(sizeof(float) * (input_length + 16)));
__m256* kernel_many = (__m256*)(_alloca(sizeof(__m256) * kernel_length));
__m256 block;
__m256 prod;
__m256 acc;
// Repeat the kernel across the vector
int i;
for (i = 0; i < kernel_length; i++) {
kernel_many[i] = _mm256_broadcast_ss(&kernel[i]); // broadcast
}
/* Create a set of 4 aligned arrays
* Each array is offset by one sample from the one before
*/
block = _mm256_setzero_ps();
_mm256_storeu_ps(in_padded, block);
memcpy(&(in_padded[8]), in, input_length * sizeof(float));
_mm256_storeu_ps(in_padded + input_length + 8, block);
for (i = 0; i < input_length + kernel_length - 8; i += 8) {
// Zero the accumulator
acc = _mm256_setzero_ps();
int startk = i >(input_length - 1) ? i - (input_length - 1) : 0;
int endk = (i + 7) < kernel_length ? (i + 7) : kernel_length - 1;
int k = startk;
// Manual unrolling of the loop to trigger pipelining speed-up (x2 perf)
for (; k + 3 <= endk; k += 4)
{
block = _mm256_loadu_ps(in_padded + 8 + i - k);
prod = _mm256_mul_ps(block, kernel_many[k]);
acc = _mm256_add_ps(acc, prod);
block = _mm256_loadu_ps(in_padded + 7 + i - k);
prod = _mm256_mul_ps(block, kernel_many[k + 1]);
acc = _mm256_add_ps(acc, prod);
block = _mm256_loadu_ps(in_padded + 6 + i - k);
prod = _mm256_mul_ps(block, kernel_many[k + 2]);
acc = _mm256_add_ps(acc, prod);
block = _mm256_loadu_ps(in_padded + 5 + i - k);
prod = _mm256_mul_ps(block, kernel_many[k + 3]);
acc = _mm256_add_ps(acc, prod);
}
for (; k <= endk; k++) {
block = _mm256_loadu_ps(in_padded + 8 + i - k);
prod = _mm256_mul_ps(block, kernel_many[k]);
acc = _mm256_add_ps(acc, prod);
}
_mm256_storeu_ps(out + i, acc);
}
// Left-overs
for (; i < input_length + kernel_length - 1; i++) {
out[i] = 0.0;
int startk = i >= input_length ? i - input_length + 1 : 0;
int endk = i < kernel_length ? i : kernel_length - 1;
for (int k = startk; k <= endk; k++) {
out[i] += in[i - k] * kernel[k];
}
}
return 0;
}
@not-availabIe
Copy link

awesome!

@githublqs
Copy link

hehello sir, how to implement convolve_avx(uchar* in, int input_length,
uchar* kernel, int kernel_length, float* out)?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment