Created March 4, 2020 14:55
quick timing test of different ways of writing a chain of serial allpasses
#include "stdafx.h" | |
#include <Windows.h> // for performance counter | |
// quick test of the theory in https://gist.github.com/mmalex/3a538aaba60f0ca21eac868269525452 | |
// we try running a simple impulse train (click every 4096 samples) through 6 allpasses with random lengths | |
// we time how long it takes to process 1 million samples, structuring the loop 3 ways: | |
// - a sample at a time with self contained allpass structures, | |
// - a sample at a time with a single big buffer | |
// - a block at a time using self contained allpass structures, operating in place on a 256 sample buffer. | |
// using this naive code, on a single core of my AMD threadripper, with default release compile settings on visual studio 2015, | |
// I see | |
// 8.1ms sample-at-a-time, 7.1ms bigbuf sample-at-a-time, 17.2 block-at-a-time | |
// on small MCU (cortex m4 without a cache), the difference is even bigger (tho I haven't run this exact code on it) | |
// I'm pleasantly surprised that, unless I f*ed something up (likely!), the simplest solution - sample at a time in a big buffer - goes quickest. | |
// obviously you could make all of these techniques faster with more complex code. I was just wanting to measure the rough ballpark of how they | |
// stack up against each other with similar levels of complexity & time lavished on them (ie not very much). | |
// notably missing from my comparison is big-buffer-block-at-a-time, which as Sean noted, you have to be a little careful not to stomp on | |
// adjacent allpasses state when you do that. you can do it by spacing them out a bit in the big buffer. and once you do that, you can | |
// pull the & out of the loop; however I think it uses more memory (bad on MCU), is more complex (bad on my brain), and I am still not convinced | |
// that buffer at a time is ever better for this kind of simple chain-of-allpasses DSP. | |
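// a self-contained allpass of length N: it owns its delay line and write index.
// transfer function is (0.5 + z^-N) / (1 + 0.5 z^-N), i.e. an allpass with coefficient 0.5.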
template <int N> struct AllPass {
    float buf[N]={};
    int i=0;
    inline float doit(float x) {
        float delayed=buf[i];
        buf[i] = x -= delayed * 0.5f;
        if (++i == N) i=0;
        return x * 0.5f + delayed;
    }
};
// lengths of 6 allpasses
#define AP1 123
#define AP2 272
#define AP3 313
#define AP4 2040
#define AP5 4313
#define AP6 5916
AllPass<AP1> a1;
AllPass<AP2> a2;
AllPass<AP3> a3;
AllPass<AP4> a4;
AllPass<AP5> a5;
AllPass<AP6> a6;
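// one shared buffer for the big-buffer version; MASK must be a power of two minus one so & works as the wrap.
// the six lengths above sum to 12977, which fits in 16384 samples.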
#define MASK 16383
static_assert(AP1+AP2+AP3+AP4+AP5+AP6<=MASK,"the allpasses must fit in the big buffer. double MASK please");
static_assert(AP1+AP2+AP3+AP4+AP5+AP6>MASK/2,"the allpasses are too small; halve MASK please");
float buf[MASK+1];
int delaypos;
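// one allpass step in the shared buffer: read this allpass's delayed state at i+N, write the new state at i,
// then move i up to i+N. the cell just read is immediately reused as the next allpass's write slot, which is
// how all six delay lines pack into sum-of-lengths cells of one buffer, sharing a single decrementing index.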
#define DoAllPass(N) { int j=(i+N)&MASK;float delayed=buf[j];buf[i]=x-=delayed*0.5f;x=x*0.5f+delayed; i=j; }
inline float DoReverb1sampOneBuf(float x) {
    int i=(delaypos--)&MASK; // this is the only index maintenance we need!
    DoAllPass(AP1);
    DoAllPass(AP2);
    DoAllPass(AP3);
    DoAllPass(AP4);
    DoAllPass(AP5);
    DoAllPass(AP6);
    return x;
}
inline float DoReverb1samp(float x) {
    x=a1.doit(x);
    x=a2.doit(x);
    x=a3.doit(x);
    x=a4.doit(x);
    x=a5.doit(x);
    x=a6.doit(x);
    return x;
}
const static int blocksize=256;
inline void DoReverbBlock(float *buf) {
    for (int i=0;i<blocksize;++i) buf[i]=a1.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a2.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a3.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a4.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a5.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a6.doit(buf[i]);
}
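// ------------------------------------------------------------------------------------------------
// the variant that is *not* measured above: big-buffer, block-at-a-time. this is just a sketch of
// the "spacing them out a bit" idea from the comment at the top, so you can see roughly what it
// costs: each allpass gets its own region of (length + blocksize) cells in one shared buffer, so
// within a block no allpass's writes can land on cells another allpass still needs to read,
// whatever order the passes run in. all the names here (buf2, MASK2, delaypos2, DoAllPassBlockOneBuf,
// DoReverbBlockOneBuf) are mine, not from the original gist; it compiles against the defines above
// but is untested and never called from main(), so it doesn't affect the timings printed below.
#define MASK2 16383 // the 6*256 padding cells happen to still fit in 16384 samples here, but the padding is the extra memory cost mentioned at the top
static_assert(AP1+AP2+AP3+AP4+AP5+AP6 + 6*blocksize <= MASK2, "padded allpasses must fit in buf2");
float buf2[MASK2+1];
int delaypos2;
// run one allpass of length N over a whole block, in place on x[0..blocksize-1].
// wpos is the (unmasked) write position for the first sample of the block.
inline void DoAllPassBlockOneBuf(float *x, int wpos, int N) {
    int wi = wpos & MASK2, ri = (wpos + N) & MASK2;
    for (int j = 0; j < blocksize; ++j) {
        float delayed = buf2[ri];
        buf2[wi] = x[j] -= delayed * 0.5f;
        x[j] = x[j] * 0.5f + delayed;
        // the indices walk down one cell per sample and can wrap at most once per block,
        // so this check (the & already pulled out of the loop) could itself be hoisted by
        // splitting the loop at the wrap point; kept inline here for brevity.
        if (--wi < 0) wi = MASK2;
        if (--ri < 0) ri = MASK2;
    }
}
inline void DoReverbBlockOneBuf(float *x) {
    int p = delaypos2; delaypos2 -= blocksize; // one index update per block
    DoAllPassBlockOneBuf(x, p, AP1); p += AP1 + blocksize; // next region starts blocksize cells further up
    DoAllPassBlockOneBuf(x, p, AP2); p += AP2 + blocksize;
    DoAllPassBlockOneBuf(x, p, AP3); p += AP3 + blocksize;
    DoAllPassBlockOneBuf(x, p, AP4); p += AP4 + blocksize;
    DoAllPassBlockOneBuf(x, p, AP5); p += AP5 + blocksize;
    DoAllPassBlockOneBuf(x, p, AP6);
}
// ------------------------------------------------------------------------------------------------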
inline float GetInput(int i) { // a click every 4096 samples
    return (i&4095) ? 0 : 1.f;
}
int main()
{
    while (1) {
        LARGE_INTEGER freq,t0,t1;
        __int64 time_1samp=0;
        __int64 time_block=0;
        __int64 time_1buf=0;
        QueryPerformanceFrequency(&freq);
        double toms=1000.0/(double)freq.QuadPart;
        /////////////////////////////////////// sample at a time, separate allpass structures
        float tot=0.f; // sum up a total so the optimiser doesnt throw away the work
        QueryPerformanceCounter(&t0);
        for (int i=0;i<1024*1024;++i) tot+=DoReverb1samp(GetInput(i));
        QueryPerformanceCounter(&t1);
        time_1samp+=t1.QuadPart-t0.QuadPart;
        /////////////////////////////////////// sample at a time, one big buffer
        float tot2=0.f;
        QueryPerformanceCounter(&t0);
        for (int i=0;i<1024*1024;++i) tot2+=DoReverb1sampOneBuf(GetInput(i));
        QueryPerformanceCounter(&t1);
        time_1buf+=t1.QuadPart-t0.QuadPart;
        /////////////////////////////////////// block at a time, separate allpass structures
        static float buf[blocksize];
        float tot3=0.f;
        QueryPerformanceCounter(&t0);
        for (int i=0;i<1024*1024;i+=blocksize) {
            for (int j=0;j<blocksize;++j) buf[j]=GetInput(i+j);
            DoReverbBlock(buf);
            for (int j=0;j<blocksize;++j) tot3+=buf[j];
        }
        QueryPerformanceCounter(&t1);
        time_block+=t1.QuadPart-t0.QuadPart;
        // print the totals too, so the summing work really can't be thrown away by the optimiser
        printf("%0.1fms sample-at-a-time, %0.1fms bigbuf sample-at-a-time, %0.1fms block-at-a-time (checksums %g %g %g)\n",
            time_1samp*toms, time_1buf*toms, time_block*toms, tot, tot2, tot3);
    }
    return 0;
}