Created March 4, 2020 14:55
quick timing test of different ways of writing a chain of serial allpasses
#include "stdafx.h" | |
#include <Windows.h> // for performance counter | |
// quick test of the theory in https://gist.github.com/mmalex/3a538aaba60f0ca21eac868269525452 | |
// we try running a simple impulse train (click every 4096 samples) through 6 allpasses with random lengths | |
// we time how long it takes to process 1 million samples, structuring the loop 3 ways: | |
// - a sample at a time with self contained allpass structures, | |
// - a sample at a time with a single big buffer | |
// - a block at a time using self contained allpass structures, operating in place on a 256 sample buffer. | |
// using this naive code, on a single core of my AMD threadripper, with default release compile settings on visual studio 2015, | |
// I see | |
// 8.1ms sample-at-a-time, 7.1ms bigbuf sample-at-a-time, 17.2 block-at-a-time | |
// on small MCU (cortex m4 without a cache), the difference is even bigger (tho I haven't run this exact code on it) | |
// I'm pleasantly surprised that, unless I f*ed something up (likely!), the simplest solution - sample at a time in a big buffer - goes quickest. | |
// obviously you could make all of these techniques faster with more complex code. I was just wanting to measure the rough ballpark of how they | |
// stack up against each other with similar levels of complexity & time lavished on them (ie not very much). | |
// notably missing from my comparison is big-buffer-block-at-a-time, which as Sean noted, you have to be a little careful not to stomp on | |
// adjacent allpasses state when you do that. you can do it by spacing them out a bit in the big buffer. and once you do that, you can | |
// pull the & out of the loop; however I think it uses more memory (bad on MCU), is more complex (bad on my brain), and I am still not convinced | |
// that buffer at a time is ever better for this kind of simple chain-of-allpasses DSP. | |
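// a self-contained allpass of length N: it owns its delay line and write index.
// transfer function is (0.5 + z^-N) / (1 + 0.5 z^-N), i.e. an allpass with coefficient 0.5.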
template <int N> struct AllPass {
    float buf[N]={};
    int i=0;
    inline float doit(float x) {
        float delayed=buf[i];
        buf[i] = x -= delayed * 0.5f;
        if (++i == N) i=0;
        return x * 0.5f + delayed;
    }
};
// lengths of 6 allpasses
#define AP1 123
#define AP2 272
#define AP3 313
#define AP4 2040
#define AP5 4313
#define AP6 5916
AllPass<AP1> a1;
AllPass<AP2> a2;
AllPass<AP3> a3;
AllPass<AP4> a4;
AllPass<AP5> a5;
AllPass<AP6> a6;
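// one shared buffer for the big-buffer version; MASK must be a power of two minus one so & works as the wrap.
// the six lengths above sum to 12977, which fits in 16384 samples.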
#define MASK 16383
static_assert(AP1+AP2+AP3+AP4+AP5+AP6<=MASK,"the allpasses must fit in the big buffer. double MASK please");
static_assert(AP1+AP2+AP3+AP4+AP5+AP6>MASK/2,"the allpasses are too small; halve MASK please");
float buf[MASK+1];
int delaypos;
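// one allpass step in the shared buffer: read this allpass's delayed state at i+N, write the new state at i,
// then move i up to i+N. the cell just read is immediately reused as the next allpass's write slot, which is
// how all six delay lines pack into sum-of-lengths cells of one buffer, sharing a single decrementing index.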
#define DoAllPass(N) { int j=(i+N)&MASK;float delayed=buf[j];buf[i]=x-=delayed*0.5f;x=x*0.5f+delayed; i=j; }
inline float DoReverb1sampOneBuf(float x) {
    int i=(delaypos--)&MASK; // this is the only index maintenance we need!
    DoAllPass(AP1);
    DoAllPass(AP2);
    DoAllPass(AP3);
    DoAllPass(AP4);
    DoAllPass(AP5);
    DoAllPass(AP6);
    return x;
}
inline float DoReverb1samp(float x) {
    x=a1.doit(x);
    x=a2.doit(x);
    x=a3.doit(x);
    x=a4.doit(x);
    x=a5.doit(x);
    x=a6.doit(x);
    return x;
}
const static int blocksize=256;
inline void DoReverbBlock(float *buf) {
    for (int i=0;i<blocksize;++i) buf[i]=a1.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a2.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a3.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a4.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a5.doit(buf[i]);
    for (int i=0;i<blocksize;++i) buf[i]=a6.doit(buf[i]);
}
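// ------------------------------------------------------------------------------------------------
// the variant that is *not* measured above: big-buffer, block-at-a-time. this is just a sketch of
// the "spacing them out a bit" idea from the comment at the top, so you can see roughly what it
// costs: each allpass gets its own region of (length + blocksize) cells in one shared buffer, so
// within a block no allpass's writes can land on cells another allpass still needs to read,
// whatever order the passes run in. all the names here (buf2, MASK2, delaypos2, DoAllPassBlockOneBuf,
// DoReverbBlockOneBuf) are mine, not from the original gist; it compiles against the defines above
// but is untested and never called from main(), so it doesn't affect the timings printed below.
#define MASK2 16383 // the 6*256 padding cells happen to still fit in 16384 samples here, but the padding is the extra memory cost mentioned at the top
static_assert(AP1+AP2+AP3+AP4+AP5+AP6 + 6*blocksize <= MASK2, "padded allpasses must fit in buf2");
float buf2[MASK2+1];
int delaypos2;
// run one allpass of length N over a whole block, in place on x[0..blocksize-1].
// wpos is the (unmasked) write position for the first sample of the block.
inline void DoAllPassBlockOneBuf(float *x, int wpos, int N) {
    int wi = wpos & MASK2, ri = (wpos + N) & MASK2;
    for (int j = 0; j < blocksize; ++j) {
        float delayed = buf2[ri];
        buf2[wi] = x[j] -= delayed * 0.5f;
        x[j] = x[j] * 0.5f + delayed;
        // the indices walk down one cell per sample and can wrap at most once per block,
        // so this check (the & already pulled out of the loop) could itself be hoisted by
        // splitting the loop at the wrap point; kept inline here for brevity.
        if (--wi < 0) wi = MASK2;
        if (--ri < 0) ri = MASK2;
    }
}
inline void DoReverbBlockOneBuf(float *x) {
    int p = delaypos2; delaypos2 -= blocksize; // one index update per block
    DoAllPassBlockOneBuf(x, p, AP1); p += AP1 + blocksize; // next region starts blocksize cells further up
    DoAllPassBlockOneBuf(x, p, AP2); p += AP2 + blocksize;
    DoAllPassBlockOneBuf(x, p, AP3); p += AP3 + blocksize;
    DoAllPassBlockOneBuf(x, p, AP4); p += AP4 + blocksize;
    DoAllPassBlockOneBuf(x, p, AP5); p += AP5 + blocksize;
    DoAllPassBlockOneBuf(x, p, AP6);
}
// ------------------------------------------------------------------------------------------------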
inline float GetInput(int i) { // a click every 4096 samples
    return (i&4095) ? 0 : 1.f;
}
int main()
{
    while (1) {
        LARGE_INTEGER freq,t0,t1;
        __int64 time_1samp=0;
        __int64 time_block=0;
        __int64 time_1buf=0;
        QueryPerformanceFrequency(&freq);
        double toms=1000.0/(double)freq.QuadPart;
        /////////////////////////////////////// sample at a time, separate allpass structures
        float tot=0.f; // sum up a total so the optimiser doesnt throw away the work
        QueryPerformanceCounter(&t0);
        for (int i=0;i<1024*1024;++i) tot+=DoReverb1samp(GetInput(i));
        QueryPerformanceCounter(&t1);
        time_1samp+=t1.QuadPart-t0.QuadPart;
        /////////////////////////////////////// sample at a time, one big buffer
        float tot2=0.f;
        QueryPerformanceCounter(&t0);
        for (int i=0;i<1024*1024;++i) tot2+=DoReverb1sampOneBuf(GetInput(i));
        QueryPerformanceCounter(&t1);
        time_1buf+=t1.QuadPart-t0.QuadPart;
        /////////////////////////////////////// block at a time, separate allpass structures
        static float buf[blocksize];
        float tot3=0.f;
        QueryPerformanceCounter(&t0);
        for (int i=0;i<1024*1024;i+=blocksize) {
            for (int j=0;j<blocksize;++j) buf[j]=GetInput(i+j);
            DoReverbBlock(buf);
            for (int j=0;j<blocksize;++j) tot3+=buf[j];
        }
        QueryPerformanceCounter(&t1);
        time_block+=t1.QuadPart-t0.QuadPart;
        // print the totals too, so the summing work really can't be thrown away by the optimiser
        printf("%0.1fms sample-at-a-time, %0.1fms bigbuf sample-at-a-time, %0.1fms block-at-a-time (checksums %g %g %g)\n",
            time_1samp*toms, time_1buf*toms, time_block*toms, tot, tot2, tot3);
    }
    return 0;
}