-
-
Save githublqs/2a4c136752a4c2c60eb4a18e0db6e56b to your computer and use it in GitHub Desktop.
Fast 1D convolution with AVX
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Fast 1D convolution with AXV | |
// By Joannes Vermorel, Lokad, January 2018 | |
// Released under MIT license | |
#include <string.h> | |
#include <stdio.h> | |
#include <malloc.h> | |
/* A simple implementation of a 1D convolution that just iterates over | |
* scalar values of the input array. | |
* | |
* Returns the same as numpy.convolve(in, kernel, mode='full') | |
* */ | |
extern "C" _declspec(dllexport) int convolve_naive(float* in, int input_length, | |
float* kernel, int kernel_length, float* out) | |
{ | |
for (int i = 0; i < input_length + kernel_length - 1; i++) { | |
out[i] = 0.0; | |
int startk = i >= input_length ? i - input_length + 1 : 0; | |
int endk = i < kernel_length ? i : kernel_length - 1; | |
for (int k = startk; k <= endk; k++) { | |
out[i] += in[i - k] * kernel[k]; | |
} | |
} | |
return 0; | |
} | |
/* Vectorize the algorithm to compute 4 output samples in parallel. | |
* | |
* Each kernel value is repeated 4 times, which can then be used on | |
* 4 input samples in parallel. Stepping over these as in naive | |
* means that we get 4 output samples for each inner kernel loop. | |
* | |
*/ | |
extern "C" _declspec(dllexport) int convolve_sse(float* in, int input_length, | |
float* kernel, int kernel_length, float* out) | |
{ | |
float* in_padded = (float*)(_alloca(sizeof(float) * (input_length + 8))); | |
__m128* kernel_many = (__m128*)(_alloca(16 * kernel_length)); | |
__m128 block; | |
__m128 prod; | |
__m128 acc; | |
// surrounding zeroes, before and after | |
_mm_storeu_ps(in_padded, _mm_set1_ps(0)); | |
memcpy(&in_padded[4], in, sizeof(float) * input_length); | |
_mm_storeu_ps(in_padded + input_length + 4, _mm_set1_ps(0)); | |
// Repeat each kernal value across a 4-vector | |
int i; | |
for (i = 0; i < kernel_length; i++) { | |
kernel_many[i] = _mm_set1_ps(kernel[i]); // broadcast | |
} | |
for (i = 0; i < input_length + kernel_length - 4; i += 4) { | |
// Zero the accumulator | |
acc = _mm_setzero_ps(); | |
int startk = i > (input_length - 1) ? i - (input_length - 1) : 0; | |
int endk = (i + 3) < kernel_length ? (i + 3) : kernel_length - 1; | |
/* After this loop, we have computed 4 output samples | |
* for the price of one. | |
* */ | |
for (int k = startk; k <= endk; k++) { | |
// Load 4-float data block. These needs to be an unaliged | |
// load (_mm_loadu_ps) as we step one sample at a time. | |
block = _mm_loadu_ps(in_padded + 4 + i - k); | |
prod = _mm_mul_ps(block, kernel_many[k]); | |
// Accumulate the 4 parallel values | |
acc = _mm_add_ps(acc, prod); | |
} | |
_mm_storeu_ps(out + i, acc); | |
} | |
// Left-overs | |
for (; i < input_length + kernel_length - 1; i++) { | |
out[i] = 0.0; | |
int startk = i >= input_length ? i - input_length + 1 : 0; | |
int endk = i < kernel_length ? i : kernel_length - 1; | |
for (int k = startk; k <= endk; k++) { | |
out[i] += in[i - k] * kernel[k]; | |
} | |
} | |
return 0; | |
} | |
/* Vectorize the algorithm to compute 8 output samples in parallel. | |
* | |
* Each kernel value is repeated 8 times, which can then be used on | |
* 8 input samples in parallel. Stepping over these as in naive | |
* means that we get 8 output samples for each inner kernel loop. | |
* | |
*/ | |
extern "C" _declspec(dllexport) int convolve_avx(float* in, int input_length, | |
float* kernel, int kernel_length, float* out) | |
{ | |
float* in_padded = (float*)(_alloca(sizeof(float) * (input_length + 16))); | |
__m256* kernel_many = (__m256*)(_alloca(sizeof(__m256) * kernel_length)); | |
__m256 block; | |
__m256 prod; | |
__m256 acc; | |
// Repeat the kernel across the vector | |
int i; | |
for (i = 0; i < kernel_length; i++) { | |
kernel_many[i] = _mm256_broadcast_ss(&kernel[i]); // broadcast | |
} | |
/* Create a set of 4 aligned arrays | |
* Each array is offset by one sample from the one before | |
*/ | |
block = _mm256_setzero_ps(); | |
_mm256_storeu_ps(in_padded, block); | |
memcpy(&(in_padded[8]), in, input_length * sizeof(float)); | |
_mm256_storeu_ps(in_padded + input_length + 8, block); | |
for (i = 0; i < input_length + kernel_length - 8; i += 8) { | |
// Zero the accumulator | |
acc = _mm256_setzero_ps(); | |
int startk = i >(input_length - 1) ? i - (input_length - 1) : 0; | |
int endk = (i + 7) < kernel_length ? (i + 7) : kernel_length - 1; | |
int k = startk; | |
// Manual unrolling of the loop to trigger pipelining speed-up (x2 perf) | |
for (; k + 3 <= endk; k += 4) | |
{ | |
block = _mm256_loadu_ps(in_padded + 8 + i - k); | |
prod = _mm256_mul_ps(block, kernel_many[k]); | |
acc = _mm256_add_ps(acc, prod); | |
block = _mm256_loadu_ps(in_padded + 7 + i - k); | |
prod = _mm256_mul_ps(block, kernel_many[k + 1]); | |
acc = _mm256_add_ps(acc, prod); | |
block = _mm256_loadu_ps(in_padded + 6 + i - k); | |
prod = _mm256_mul_ps(block, kernel_many[k + 2]); | |
acc = _mm256_add_ps(acc, prod); | |
block = _mm256_loadu_ps(in_padded + 5 + i - k); | |
prod = _mm256_mul_ps(block, kernel_many[k + 3]); | |
acc = _mm256_add_ps(acc, prod); | |
} | |
for (; k <= endk; k++) { | |
block = _mm256_loadu_ps(in_padded + 8 + i - k); | |
prod = _mm256_mul_ps(block, kernel_many[k]); | |
acc = _mm256_add_ps(acc, prod); | |
} | |
_mm256_storeu_ps(out + i, acc); | |
} | |
// Left-overs | |
for (; i < input_length + kernel_length - 1; i++) { | |
out[i] = 0.0; | |
int startk = i >= input_length ? i - input_length + 1 : 0; | |
int endk = i < kernel_length ? i : kernel_length - 1; | |
for (int k = startk; k <= endk; k++) { | |
out[i] += in[i - k] * kernel[k]; | |
} | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment