usrlocalben · October 1, 2012 02:24 · Dr-Emann · Aug 5, 2014 · usrlocalben · Apr 10, 2016
diff --git a/stackblur.cpp b/stackblur.cpp
 /*
 * "stackblur"
 * originally, Mario Klingemann, [email protected]
 * this implementation, Benjamin Yates ([email protected])
 * http://incubator.quasimondo.com/processing/stackblur.pde
 * 
 * this blur routine seems to be quite popular.
 *
 * unfortunately, mario didn't comment anything.
 * but, it's easy to see it's just a flavor of a two-pass
 * sliding box kernel.
 *
 * this version is vectorized for float32 r/g/b/a using sse
 *
 * vec4() is just a class wrapping _mm_zzz_ps() family of SSE intrinsics
 * ( if you need one, start here:
 *  - http://fastcpp.blogspot.com/2011/12/simple-vector3-class-with-sse-support.html )
 *
 * radius is in (whole) pixels
 *
 */

 void stackblur( vec4* colorbuffer, const int w, const int h, const int radius )
 {
 	if ( radius < 1 ) return;	// nothing to do

 	vec4* const pix = reinterpret_cast<vec4*>(image.c);

 	// some values for convenience
 	const int w = image.width;
 	const int h = image.height;
 	const int wm = w-1;
 	const int hm = h-1;
 	const int wh = w*h;
 	const int r1 = radius + 1;

 	// number of divisions in the kernel
 	// D(-r), D(-r+1), ... D(0), ... D(r-1), D(r)
 	const int div = (radius * 2) + 1;

 	// temporary output space for first pass.
 	vec4* tsurface = new vec4[wh];

 	// lookup table for clamping pixel offsets
 	// as the kernel passes the right (or lower) edge
 	// of the input data
 	int* const vmin = new int[ max(w,h) ];

 	// calculate divisor for pulling an output from the kernel
 	//   the kernel is pyramid shaped.
 	//   to understand this divisor, imagine a cross-section
 	//   of the center of the pyramid.
 	//   (or ask mario to document it better)
 	//
 	// I store the reciprocal in the vector to
 	// mul_ps later instead of div_ps
 	int divsumi = ( div + 1 ) >> 1;
 	divsumi *= divsumi;
 	const vec4 divsum( 1.0f / (float)divsumi );

 	// kernel pixel data "stack"
 	// which works more like a ring-buffer,
 	// where the pointer is advanced each iteration, modulo #divisions
 	vec4* stack = new vec4[ div ];
 	int stackpointer;
 	int stackstart;



 	// input/output offsets.
 	// they are discrete because the source can be non-square
 	int yw = 0;	// current read offset in source
 	int yi = 0; // current write offset in destination


 	// blur horizontally, output to temporary buffer
 	for ( int y=0; y<h; y++ ) {

 		vec4 insum(0);
 		vec4 outsum(0);
 		vec4 sum(0);

 		// preload the kernel(stack)
 		// pixels before the left edge of the image are
 		// samples of [0] (max()).  min() handles
 		// images which are smaller than the kernel.
 		for ( int i=-radius; i<=radius; i++ ) {

 			// calcualte address of source pixel
 			const vec4& p = pix[ yi + min(wm,max(i,0)) ];

 			// put pixel in the stack
 			vec4& sir = stack[ i + radius ];
 			sir = p;

 			// rbs is a weight from (1)...(radius+1)...(1)
 			const int rbs = r1 - abs(i);

 			// add pixel to accumulators
 			sum += sir * rbs;
 			if ( i > 0 ) { insum  += sir; }
 			        else { outsum += sir; }
 		}


 		// now that the kernel is preloaded
 		// stackpointer is the index of the center of the kernel
 		stackpointer = radius;


 		// now start outputing pixels
 		for ( int x=0; x<w; x++ ) {

 			// output a pixel
 			tsurface[yi] = sum * divsum;


 			// calculate address of the next pixel
 			// to add to the kernel
 			//
 			// on first pass, make a lut to handle access
 			// past the right edge of the width image.
 			// min() will cause the last pixel to repeat.
 			if ( y == 0 ) vmin[x] = min( x+radius+1, wm );
 			vec4& p = pix[ yw + vmin[x] ];


 			// remove "past" pixels from the sum
 			sum -= outsum;


 			// remove "left" side of stack from outsum
 			stackstart = stackpointer - radius + div;
 			vec4& sir = stack[ stackstart % div ];
 			outsum -= sir;
 	
 			// now this (same) stack entry is the "right" side
 			// add new pixel to the stack, and update accumulators
 			sir = p;
 			insum += sir;
 			sum += insum;


 			// slide kernel one pixel ahead (right),
 			// update accumulators again
 			stackpointer = (stackpointer+1)%div;
 			const vec4& sirnext = stack[ stackpointer ];
 			outsum += sirnext;
 			insum -= sirnext;

 			yi++;	// advance output offset to next pixel
 		}
 		yw += w;	// advance input offset to next line
 	}//y loop



 	// now blur vertically from the temporary
 	// buffer, using the original image buffer
 	// as the output
 	for ( int x=0; x<w; x++ ) {

 		vec4 insum(0);
 		vec4 outsum(0);
 		vec4 sum(0);

 		//preload the stack
 		int yp = -radius * w;
 		for ( int i = -radius; i<=radius; i++ ) {

 			vec4& sir = stack[ i + radius ];

 			yi = max(0,yp)+x;
 			const vec4& p = tsurface[ yi ];
 			
 			sir = p;

 			const int rbs = r1 - abs(i);
 			sum += sir * rbs;
 			if ( i > 0 ) { insum  += sir; }
 			        else { outsum += sir; }

 			// only advance to the next row if there
 			// are still more rows of image left.
 			// otherwise, we keep sampling the same row
 			// as if the bottom line was duplicated
 			if ( i < hm ) yp += w;
 			
 		}//preload

 		stackpointer = radius;

 		// this loop is the same as the y-loop,
 		// except the src/dest offsets are calcuated differently
 		yi = x;	// set starting output offset by column
 		for ( int y=0; y<h; y++ ) {

 			pix[yi] = sum * divsum;

 			stackstart = stackpointer - radius + div;
 			vec4& sir = stack[ stackstart % div ];
 			sum -= outsum;
 			outsum -= sir;
 			
 			if ( x == 0 ) vmin[y] = min(y+r1,hm)*w;
 			sir = tsurface[ x + vmin[y] ];

 			insum += sir;
 			sum += insum;

 			stackpointer = (stackpointer+1)%div;
 			const vec4& sirnext = stack[ stackpointer ];
 			outsum += sirnext;
 			insum -= sirnext;

 			yi += w; // advance output offset to next line
 		}
 	}//x loop

 	delete [] vmin;
 	delete [] stack;
 	delete [] tsurface;

 }
	/*
	* "stackblur"
	* originally, Mario Klingemann, [email protected]
	* this implementation, Benjamin Yates ([email protected])
	* http://incubator.quasimondo.com/processing/stackblur.pde
	*
	* this blur routine seems to be quite popular.
	*
	* unfortunately, mario didn't comment anything.
	* but, it's easy to see it's just a flavor of a two-pass
	* sliding box kernel.
	*
	* this version is vectorized for float32 r/g/b/a using sse
	*
	* vec4() is just a class wrapping _mm_zzz_ps() family of SSE intrinsics
	* ( if you need one, start here:
	* - http://fastcpp.blogspot.com/2011/12/simple-vector3-class-with-sse-support.html )
	*
	* radius is in (whole) pixels
	*
	*/

	void stackblur( vec4* colorbuffer, const int w, const int h, const int radius )
	{
	if ( radius < 1 ) return; // nothing to do

	vec4* const pix = reinterpret_cast<vec4*>(image.c);

	// some values for convenience
	const int w = image.width;
	const int h = image.height;
	const int wm = w-1;
	const int hm = h-1;
	const int wh = w*h;
	const int r1 = radius + 1;

	// number of divisions in the kernel
	// D(-r), D(-r+1), ... D(0), ... D(r-1), D(r)
	const int div = (radius * 2) + 1;

	// temporary output space for first pass.
	vec4* tsurface = new vec4[wh];

	// lookup table for clamping pixel offsets
	// as the kernel passes the right (or lower) edge
	// of the input data
	int* const vmin = new int[ max(w,h) ];

	// calculate divisor for pulling an output from the kernel
	// the kernel is pyramid shaped.
	// to understand this divisor, imagine a cross-section
	// of the center of the pyramid.
	// (or ask mario to document it better)
	//
	// I store the reciprocal in the vector to
	// mul_ps later instead of div_ps
	int divsumi = ( div + 1 ) >> 1;
	divsumi *= divsumi;
	const vec4 divsum( 1.0f / (float)divsumi );

	// kernel pixel data "stack"
	// which works more like a ring-buffer,
	// where the pointer is advanced each iteration, modulo #divisions
	vec4* stack = new vec4[ div ];
	int stackpointer;
	int stackstart;



	// input/output offsets.
	// they are discrete because the source can be non-square
	int yw = 0; // current read offset in source
	int yi = 0; // current write offset in destination


	// blur horizontally, output to temporary buffer
	for ( int y=0; y<h; y++ ) {

	vec4 insum(0);
	vec4 outsum(0);
	vec4 sum(0);

	// preload the kernel(stack)
	// pixels before the left edge of the image are
	// samples of [0] (max()). min() handles
	// images which are smaller than the kernel.
	for ( int i=-radius; i<=radius; i++ ) {

	// calcualte address of source pixel
	const vec4& p = pix[ yi + min(wm,max(i,0)) ];

	// put pixel in the stack
	vec4& sir = stack[ i + radius ];
	sir = p;

	// rbs is a weight from (1)...(radius+1)...(1)
	const int rbs = r1 - abs(i);

	// add pixel to accumulators
	sum += sir * rbs;
	if ( i > 0 ) { insum += sir; }
	else { outsum += sir; }
	}


	// now that the kernel is preloaded
	// stackpointer is the index of the center of the kernel
	stackpointer = radius;


	// now start outputing pixels
	for ( int x=0; x<w; x++ ) {

	// output a pixel
	tsurface[yi] = sum * divsum;


	// calculate address of the next pixel
	// to add to the kernel
	//
	// on first pass, make a lut to handle access
	// past the right edge of the width image.
	// min() will cause the last pixel to repeat.
	if ( y == 0 ) vmin[x] = min( x+radius+1, wm );
	vec4& p = pix[ yw + vmin[x] ];


	// remove "past" pixels from the sum
	sum -= outsum;


	// remove "left" side of stack from outsum
	stackstart = stackpointer - radius + div;
	vec4& sir = stack[ stackstart % div ];
	outsum -= sir;

	// now this (same) stack entry is the "right" side
	// add new pixel to the stack, and update accumulators
	sir = p;
	insum += sir;
	sum += insum;


	// slide kernel one pixel ahead (right),
	// update accumulators again
	stackpointer = (stackpointer+1)%div;
	const vec4& sirnext = stack[ stackpointer ];
	outsum += sirnext;
	insum -= sirnext;

	yi++; // advance output offset to next pixel
	}
	yw += w; // advance input offset to next line
	}//y loop



	// now blur vertically from the temporary
	// buffer, using the original image buffer
	// as the output
	for ( int x=0; x<w; x++ ) {

	vec4 insum(0);
	vec4 outsum(0);
	vec4 sum(0);

	//preload the stack
	int yp = -radius * w;
	for ( int i = -radius; i<=radius; i++ ) {

	vec4& sir = stack[ i + radius ];

	yi = max(0,yp)+x;
	const vec4& p = tsurface[ yi ];

	sir = p;

	const int rbs = r1 - abs(i);
	sum += sir * rbs;
	if ( i > 0 ) { insum += sir; }
	else { outsum += sir; }

	// only advance to the next row if there
	// are still more rows of image left.
	// otherwise, we keep sampling the same row
	// as if the bottom line was duplicated
	if ( i < hm ) yp += w;

	}//preload

	stackpointer = radius;

	// this loop is the same as the y-loop,
	// except the src/dest offsets are calcuated differently
	yi = x; // set starting output offset by column
	for ( int y=0; y<h; y++ ) {

	pix[yi] = sum * divsum;

	stackstart = stackpointer - radius + div;
	vec4& sir = stack[ stackstart % div ];
	sum -= outsum;
	outsum -= sir;

	if ( x == 0 ) vmin[y] = min(y+r1,hm)*w;
	sir = tsurface[ x + vmin[y] ];

	insum += sir;
	sum += insum;

	stackpointer = (stackpointer+1)%div;
	const vec4& sirnext = stack[ stackpointer ];
	outsum += sirnext;
	insum -= sirnext;

	yi += w; // advance output offset to next line
	}
	}//x loop

	delete [] vmin;
	delete [] stack;
	delete [] tsurface;

	}