Created
October 1, 2012 02:24
-
-
Save usrlocalben/3809142 to your computer and use it in GitHub Desktop.
stackblur - c++, floating-point, vectorized
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* "stackblur" | |
* originally, Mario Klingemann, [email protected] | |
* this implementation, Benjamin Yates ([email protected]) | |
* http://incubator.quasimondo.com/processing/stackblur.pde | |
* | |
* this blur routine seems to be quite popular. | |
* | |
* unfortunately, mario didn't comment anything. | |
* but, it's easy to see it's just a flavor of a two-pass | |
* sliding box kernel. | |
* | |
* this version is vectorized for float32 r/g/b/a using sse | |
* | |
* vec4() is just a class wrapping _mm_zzz_ps() family of SSE intrinsics | |
* ( if you need one, start here: | |
* - http://fastcpp.blogspot.com/2011/12/simple-vector3-class-with-sse-support.html ) | |
* | |
* radius is in (whole) pixels | |
* | |
*/ | |
void stackblur( vec4* colorbuffer, const int w, const int h, const int radius ) | |
{ | |
if ( radius < 1 ) return; // nothing to do | |
vec4* const pix = reinterpret_cast<vec4*>(image.c); | |
// some values for convenience | |
const int w = image.width; | |
const int h = image.height; | |
const int wm = w-1; | |
const int hm = h-1; | |
const int wh = w*h; | |
const int r1 = radius + 1; | |
// number of divisions in the kernel | |
// D(-r), D(-r+1), ... D(0), ... D(r-1), D(r) | |
const int div = (radius * 2) + 1; | |
// temporary output space for first pass. | |
vec4* tsurface = new vec4[wh]; | |
// lookup table for clamping pixel offsets | |
// as the kernel passes the right (or lower) edge | |
// of the input data | |
int* const vmin = new int[ max(w,h) ]; | |
// calculate divisor for pulling an output from the kernel | |
// the kernel is pyramid shaped. | |
// to understand this divisor, imagine a cross-section | |
// of the center of the pyramid. | |
// (or ask mario to document it better) | |
// | |
// I store the reciprocal in the vector to | |
// mul_ps later instead of div_ps | |
int divsumi = ( div + 1 ) >> 1; | |
divsumi *= divsumi; | |
const vec4 divsum( 1.0f / (float)divsumi ); | |
// kernel pixel data "stack" | |
// which works more like a ring-buffer, | |
// where the pointer is advanced each iteration, modulo #divisions | |
vec4* stack = new vec4[ div ]; | |
int stackpointer; | |
int stackstart; | |
// input/output offsets. | |
// they are discrete because the source can be non-square | |
int yw = 0; // current read offset in source | |
int yi = 0; // current write offset in destination | |
// blur horizontally, output to temporary buffer | |
for ( int y=0; y<h; y++ ) { | |
vec4 insum(0); | |
vec4 outsum(0); | |
vec4 sum(0); | |
// preload the kernel(stack) | |
// pixels before the left edge of the image are | |
// samples of [0] (max()). min() handles | |
// images which are smaller than the kernel. | |
for ( int i=-radius; i<=radius; i++ ) { | |
// calcualte address of source pixel | |
const vec4& p = pix[ yi + min(wm,max(i,0)) ]; | |
// put pixel in the stack | |
vec4& sir = stack[ i + radius ]; | |
sir = p; | |
// rbs is a weight from (1)...(radius+1)...(1) | |
const int rbs = r1 - abs(i); | |
// add pixel to accumulators | |
sum += sir * rbs; | |
if ( i > 0 ) { insum += sir; } | |
else { outsum += sir; } | |
} | |
// now that the kernel is preloaded | |
// stackpointer is the index of the center of the kernel | |
stackpointer = radius; | |
// now start outputing pixels | |
for ( int x=0; x<w; x++ ) { | |
// output a pixel | |
tsurface[yi] = sum * divsum; | |
// calculate address of the next pixel | |
// to add to the kernel | |
// | |
// on first pass, make a lut to handle access | |
// past the right edge of the width image. | |
// min() will cause the last pixel to repeat. | |
if ( y == 0 ) vmin[x] = min( x+radius+1, wm ); | |
vec4& p = pix[ yw + vmin[x] ]; | |
// remove "past" pixels from the sum | |
sum -= outsum; | |
// remove "left" side of stack from outsum | |
stackstart = stackpointer - radius + div; | |
vec4& sir = stack[ stackstart % div ]; | |
outsum -= sir; | |
// now this (same) stack entry is the "right" side | |
// add new pixel to the stack, and update accumulators | |
sir = p; | |
insum += sir; | |
sum += insum; | |
// slide kernel one pixel ahead (right), | |
// update accumulators again | |
stackpointer = (stackpointer+1)%div; | |
const vec4& sirnext = stack[ stackpointer ]; | |
outsum += sirnext; | |
insum -= sirnext; | |
yi++; // advance output offset to next pixel | |
} | |
yw += w; // advance input offset to next line | |
}//y loop | |
// now blur vertically from the temporary | |
// buffer, using the original image buffer | |
// as the output | |
for ( int x=0; x<w; x++ ) { | |
vec4 insum(0); | |
vec4 outsum(0); | |
vec4 sum(0); | |
//preload the stack | |
int yp = -radius * w; | |
for ( int i = -radius; i<=radius; i++ ) { | |
vec4& sir = stack[ i + radius ]; | |
yi = max(0,yp)+x; | |
const vec4& p = tsurface[ yi ]; | |
sir = p; | |
const int rbs = r1 - abs(i); | |
sum += sir * rbs; | |
if ( i > 0 ) { insum += sir; } | |
else { outsum += sir; } | |
// only advance to the next row if there | |
// are still more rows of image left. | |
// otherwise, we keep sampling the same row | |
// as if the bottom line was duplicated | |
if ( i < hm ) yp += w; | |
}//preload | |
stackpointer = radius; | |
// this loop is the same as the y-loop, | |
// except the src/dest offsets are calcuated differently | |
yi = x; // set starting output offset by column | |
for ( int y=0; y<h; y++ ) { | |
pix[yi] = sum * divsum; | |
stackstart = stackpointer - radius + div; | |
vec4& sir = stack[ stackstart % div ]; | |
sum -= outsum; | |
outsum -= sir; | |
if ( x == 0 ) vmin[y] = min(y+r1,hm)*w; | |
sir = tsurface[ x + vmin[y] ]; | |
insum += sir; | |
sum += insum; | |
stackpointer = (stackpointer+1)%div; | |
const vec4& sirnext = stack[ stackpointer ]; | |
outsum += sirnext; | |
insum -= sirnext; | |
yi += w; // advance output offset to next line | |
} | |
}//x loop | |
delete [] vmin; | |
delete [] stack; | |
delete [] tsurface; | |
} |
Agreed. IIRC it was a fairly direct adaptation, so it may have been that way in the original.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
On line 57, is there any reason to prefer
(div + 1) >> 1
overradius + 1
? Since div is radius * 2 + 1, it should simplify to(2 * (radius + 1)) >> 1
, and because a bitshift is equivalent to division by a power of two it should further simplify toradius + 1