Created
October 2, 2013 08:26
-
-
Save inspirit/6790643 to your computer and use it in GitHub Desktop.
Separable Convolution and Gaussian Blur
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#ifndef CONVOLUTION_H_INCLUDED | |
#define CONVOLUTION_H_INCLUDED | |
/** | |
* Separable Convolution routines with SSE and NEON intrinsics | |
* | |
* this implementation is based on OpenCV Filter Class | |
* with template optimizations and SIMD intrinsic | |
* | |
* @author Eugene Zatepyakin | |
*/ | |
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <limits>
// SIMD | |
#define SIMD_OPT 1 | |
#include <emmintrin.h> | |
#ifdef __GNUC__ | |
# define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x))) | |
#elif defined _MSC_VER | |
# define CV_DECL_ALIGNED(x) __declspec(align(x)) | |
#else | |
#error unknown platform | |
#endif | |
// SEPARABLE CONVOLUTION ONLY | |
namespace convolve { | |
//! type of the kernel | |
enum { KERNEL_GENERAL = 0, // the kernel is generic. No any type of symmetry or other properties. | |
KERNEL_SYMMETRICAL = 1, // kernel[i] == kernel[ksize-i-1] , and the anchor is at the center | |
KERNEL_ASYMMETRICAL = 2, // kernel[i] == -kernel[ksize-i-1] , and the anchor is at the center | |
KERNEL_SMOOTH = 4, // all the kernel elements are non-negative and summed to 1 | |
KERNEL_INTEGER = 8 // all the kernel coefficients are integer numbers | |
}; | |
template<typename KT> | |
static int getKernelType(const KT* kernel, const int sz) | |
{ | |
int i; | |
float sum = 0; | |
int type = KERNEL_SMOOTH + KERNEL_INTEGER; | |
type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL); | |
for( i = 0; i < sz; i++ ) | |
{ | |
KT a = kernel[i], b = kernel[sz - i - 1]; | |
if( a != b ) | |
type &= ~KERNEL_SYMMETRICAL; | |
if( a != -b ) | |
type &= ~KERNEL_ASYMMETRICAL; | |
if( a < 0 ) | |
type &= ~KERNEL_SMOOTH; | |
if( a != static_cast<int>(a) ) | |
type &= ~KERNEL_INTEGER; | |
sum += a; | |
} | |
if( std::fabs(sum - 1) > std::numeric_limits<float>::epsilon()*(std::fabs(sum) + 1) ) | |
type &= ~KERNEL_SMOOTH; | |
return type; | |
} | |
/**
 * Generic row-filter SIMD hook (scalar fallback).
 *
 * The primary template never vectorizes anything: it reports that 0
 * elements were produced, which makes the caller run its plain scalar
 * loop over the whole row. Type-specific specializations below supply
 * the real SSE/NEON implementations.
 */
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct SIMDRow
{
    SIMDRow()
    {
    }
    int operator()(ST* /*src*/, DT* /*dst*/, const KT* /*kernel*/, int /*rowSize*/) const
    {
        return 0;
    }
};
template<typename ST, typename DT, typename KT, int cn, int _ksize> | |
struct SIMDRowSymm : public SIMDRow<ST,DT,KT,cn,_ksize> | |
{ | |
int operator()(ST*, DT*, const KT*, int) const { return 0; } | |
}; | |
template<typename ST, typename DT, typename KT, int cn, int _ksize> | |
struct SIMDRowASymm : public SIMDRow<ST,DT,KT,cn,_ksize> | |
{ | |
int operator()(ST*, DT*, const KT*, int) const { return 0; } | |
}; | |
// | |
/**
 * Generic column-filter SIMD hook (scalar fallback).
 *
 * Takes an array of row pointers instead of a flat buffer (the vertical
 * pass reads one sample from each of _ksize rows). The primary template
 * does nothing and returns 0 so the caller's scalar loop runs instead.
 */
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct SIMDCol
{
    SIMDCol()
    {
    }
    int operator()(ST** /*rows*/, DT* /*dst*/, const KT* /*kernel*/, int /*rowSize*/) const
    {
        return 0;
    }
};
template<typename ST, typename DT, typename KT, int cn, int _ksize> | |
struct SIMDColSymm : public SIMDCol<ST,DT,KT,cn,_ksize> | |
{ | |
int operator()(ST**, DT*, const KT*, int) const { return 0; } | |
}; | |
template<typename ST, typename DT, typename KT, int cn, int _ksize> | |
struct SIMDColASymm : public SIMDCol<ST,DT,KT,cn,_ksize> | |
{ | |
int operator()(ST**, DT*, const KT*, int) const { return 0; } | |
}; | |
// SPECIAL CASE (SIMD) FOR FLOAT DATA TYPE | |
// TODO: NEON version | |
#if SIMD_OPT | |
#ifdef __arm__ | |
#include <arm_neon.h> | |
//#define _mm_set1_ps(a) ( vld1q_dup_f32( &(a) ) ) | |
#define _mm_load_ss(ptr) ( vld1q_dup_f32( (ptr) ) ) | |
// in our case we just dup in above line | |
#define _mm_shuffle_ps(a, b, m) (a) | |
#define _mm_setzero_ps() ( vmovq_n_f32( 0.f ) ) | |
#define _mm_set1_ps(a) ( vmovq_n_f32( (a) ) ) | |
#define _mm_loadu_ps(ptr) ( vld1q_f32( (ptr) ) ) | |
#define _mm_load_ps(ptr) ( vld1q_f32( (ptr) ) ) | |
#define _mm_storeu_ps(ptr, a) ( vst1q_f32( (ptr), (a) ) ) | |
#define _mm_store_ps(ptr, a) ( vst1q_f32( (ptr), (a) ) ) | |
#define _mm_add_ps(a, b) ( vaddq_f32( (a), (b) ) ) | |
#define _mm_sub_ps(a, b) ( vsubq_f32( (a), (b) ) ) | |
#define _mm_mul_ps(a, b) ( vmulq_f32( (a), (b) ) ) | |
#endif | |
/**
 * SSE/NEON row convolution for float data with a GENERAL kernel.
 *
 * Produces 8 outputs per iteration (two 4-lane vectors), accumulating
 * kernel[k] * src[i + k*cn] over all _ksize taps. Returns the number of
 * elements written; the caller finishes the < 8 tail with scalar code.
 *
 * NOTE(review): _mm_store_ps needs `dst` 16-byte aligned — the row
 * buffers built in convolve2() are aligned via alignPtr, but any other
 * caller must guarantee this. Source loads use the unaligned loadu.
 */
template<int cn, int _ksize>
struct SIMDRow<float,float,float,cn,_ksize>
{
#ifdef __arm__
    // the SSE names are #define'd onto NEON above; alias the vector type
    typedef float32x4_t __m128;
#endif
    int operator()(float* buffer, float* dst, const float* _kernel, int rowSize) const
    {
        //rowSize *= cn;
        int i = 0, k;
        const float* kx = _kernel;
        for( ; i <= rowSize - 8; i += 8 )
        {
            const float* src = buffer + i;
            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
            for( k = 0; k < _ksize; k++, src += cn )
            {
                // broadcast tap k into all 4 lanes (on ARM load_ss already dups)
                f = _mm_load_ss(kx+k);
                f = _mm_shuffle_ps(f, f, 0);
                x0 = _mm_loadu_ps(src);
                x1 = _mm_loadu_ps(src + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
            }
            _mm_store_ps(dst + i, s0);
            _mm_store_ps(dst + i + 4, s1);
        }
        return i;
    }
};
/**
 * SSE/NEON row convolution for float data with a SYMMETRIC kernel
 * (kernel[i] == kernel[ksize-1-i]), so only the ksize/2+1 upper taps
 * are read, centered at kx[0].
 *
 * Hand-unrolled fast paths for _ksize 3 and 5, with multiply-free
 * branches for the common [1 2 1], [1 -2 1] and [1 0 -2 0 1] patterns.
 * Other sizes fall back to the general SIMDRow loop. Returns the number
 * of outputs produced; the caller's scalar loop finishes the tail.
 */
template<int cn, int _ksize>
struct SIMDRowSymm<float,float,float,cn,_ksize>
{
#ifdef __arm__
    typedef float32x4_t __m128;  // NEON stand-in for the SSE vector type
#endif
    int operator()(float* buffer, float* dst, const float* _kernel, int rowSize) const
    {
        //rowSize *= cn;
        int i = 0;
        const int ksize2 = _ksize/2;
        // center the source pointer and the kernel on the middle tap
        const float* src = buffer + ksize2*cn;
        const float* kx = _kernel + ksize2;
        if( _ksize == 3 )
        {
            if( kx[0] == 2 && kx[1] == 1 )
                // [1 2 1] smoothing: pure adds, no multiplies
                for( ; i <= rowSize - 8; i += 8, src += 8 )
                {
                    __m128 x0, x1, x2, y0, y1, y2;
                    x0 = _mm_loadu_ps(src - cn);
                    x1 = _mm_loadu_ps(src);
                    x2 = _mm_loadu_ps(src + cn);
                    y0 = _mm_loadu_ps(src - cn + 4);
                    y1 = _mm_loadu_ps(src + 4);
                    y2 = _mm_loadu_ps(src + cn + 4);
                    x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
                    y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
                    _mm_store_ps(dst + i, x0);
                    _mm_store_ps(dst + i + 4, y0);
                }
            else if( kx[0] == -2 && kx[1] == 1 )
                // [1 -2 1] second derivative: adds/subs only
                for( ; i <= rowSize - 8; i += 8, src += 8 )
                {
                    __m128 x0, x1, x2, y0, y1, y2;
                    x0 = _mm_loadu_ps(src - cn);
                    x1 = _mm_loadu_ps(src);
                    x2 = _mm_loadu_ps(src + cn);
                    y0 = _mm_loadu_ps(src - cn + 4);
                    y1 = _mm_loadu_ps(src + 4);
                    y2 = _mm_loadu_ps(src + cn + 4);
                    x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                    y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                    _mm_store_ps(dst + i, x0);
                    _mm_store_ps(dst + i + 4, y0);
                }
            else
            {
                // general symmetric 3-tap: k0*center + k1*(left+right)
                __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
                for( ; i <= rowSize - 8; i += 8, src += 8 )
                {
                    __m128 x0, x1, x2, y0, y1, y2;
                    x0 = _mm_loadu_ps(src - cn);
                    x1 = _mm_loadu_ps(src);
                    x2 = _mm_loadu_ps(src + cn);
                    y0 = _mm_loadu_ps(src - cn + 4);
                    y1 = _mm_loadu_ps(src + 4);
                    y2 = _mm_loadu_ps(src + cn + 4);
                    x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                    y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                    x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                    y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
                    _mm_store_ps(dst + i, x0);
                    _mm_store_ps(dst + i + 4, y0);
                }
            }
        }
        else if( _ksize == 5 )
        {
            if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                // [1 0 -2 0 1]: only the center and the +-2 neighbors matter
                for( ; i <= rowSize - 8; i += 8, src += 8 )
                {
                    __m128 x0, x1, x2, y0, y1, y2;
                    x0 = _mm_loadu_ps(src - cn*2);
                    x1 = _mm_loadu_ps(src);
                    x2 = _mm_loadu_ps(src + cn*2);
                    y0 = _mm_loadu_ps(src - cn*2 + 4);
                    y1 = _mm_loadu_ps(src + 4);
                    y2 = _mm_loadu_ps(src + cn*2 + 4);
                    x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                    y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                    _mm_store_ps(dst + i, x0);
                    _mm_store_ps(dst + i + 4, y0);
                }
            else
            {
                // general symmetric 5-tap:
                // k0*center + k1*(+-1 neighbors) + k2*(+-2 neighbors)
                __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
                for( ; i <= rowSize - 8; i += 8, src += 8 )
                {
                    __m128 x0, x1, x2, y0, y1, y2;
                    x0 = _mm_loadu_ps(src - cn);
                    x1 = _mm_loadu_ps(src);
                    x2 = _mm_loadu_ps(src + cn);
                    y0 = _mm_loadu_ps(src - cn + 4);
                    y1 = _mm_loadu_ps(src + 4);
                    y2 = _mm_loadu_ps(src + cn + 4);
                    x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                    y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                    x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                    y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
                    x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                    y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                    x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                    y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
                    _mm_store_ps(dst + i, x0);
                    _mm_store_ps(dst + i + 4, y0);
                }
            }
        }
        else
        {
            // no unrolled path for this size: use the general kernel loop
            SIMDRow<float,float,float,cn,_ksize> op;
            i = op(buffer, dst, _kernel, rowSize);
        }
        return i;
    }
};
/**
 * SSE/NEON row convolution for float data with an ANTI-SYMMETRIC kernel
 * (kernel[i] == -kernel[ksize-1-i]; the center tap kx[0] is implicitly 0
 * in the unrolled paths below and is never multiplied in).
 *
 * Unrolled paths for _ksize 3 and 5, with a multiply-free branch for the
 * central-difference kernel [-1 0 1]. Other sizes fall back to the
 * general SIMDRow loop. Returns the number of outputs produced; the
 * caller's scalar loop finishes the tail.
 */
template<int cn, int _ksize>
struct SIMDRowASymm<float,float,float,cn,_ksize>
{
#ifdef __arm__
    typedef float32x4_t __m128;  // NEON stand-in for the SSE vector type
#endif
    int operator()(float* buffer, float* dst, const float* _kernel, int rowSize) const
    {
        //rowSize *= cn;
        int i = 0;
        const int ksize2 = _ksize/2;
        // center the source pointer and the kernel on the middle tap
        const float* src = buffer + ksize2*cn;
        const float* kx = _kernel + ksize2;
        if( _ksize == 3 )
        {
            if( kx[0] == 0 && kx[1] == 1 )
                // [-1 0 1]: plain difference of the two neighbors
                for( ; i <= rowSize - 8; i += 8, src += 8 )
                {
                    __m128 x0, x2, y0, y2;
                    x0 = _mm_loadu_ps(src + cn);
                    x2 = _mm_loadu_ps(src - cn);
                    y0 = _mm_loadu_ps(src + cn + 4);
                    y2 = _mm_loadu_ps(src - cn + 4);
                    x0 = _mm_sub_ps(x0, x2);
                    y0 = _mm_sub_ps(y0, y2);
                    _mm_store_ps(dst + i, x0);
                    _mm_store_ps(dst + i + 4, y0);
                }
            else
            {
                // general anti-symmetric 3-tap: k1*(right - left)
                __m128 k1 = _mm_set1_ps(kx[1]);
                for( ; i <= rowSize - 8; i += 8, src += 8 )
                {
                    __m128 x0, x2, y0, y2;
                    x0 = _mm_loadu_ps(src + cn);
                    x2 = _mm_loadu_ps(src - cn);
                    y0 = _mm_loadu_ps(src + cn + 4);
                    y2 = _mm_loadu_ps(src - cn + 4);
                    x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                    y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
                    _mm_store_ps(dst + i, x0);
                    _mm_store_ps(dst + i + 4, y0);
                }
            }
        }
        else if( _ksize == 5 )
        {
            // k1*(+-1 difference) + k2*(+-2 difference); center tap unused
            __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
            for( ; i <= rowSize - 8; i += 8, src += 8 )
            {
                __m128 x0, x2, y0, y2;
                x0 = _mm_loadu_ps(src + cn);
                x2 = _mm_loadu_ps(src - cn);
                y0 = _mm_loadu_ps(src + cn + 4);
                y2 = _mm_loadu_ps(src - cn + 4);
                x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
                x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
                _mm_store_ps(dst + i, x0);
                _mm_store_ps(dst + i + 4, y0);
            }
        }
        else
        {
            // no unrolled path for this size: use the general kernel loop
            SIMDRow<float,float,float,cn,_ksize> op;
            i = op(buffer, dst, _kernel, rowSize);
        }
        return i;
    }
};
/**
 * SSE/NEON column (vertical) convolution for float data with a
 * SYMMETRIC kernel. `buffer` holds _ksize row pointers; output[i] is
 * the kernel-weighted sum of the same column i across those rows.
 *
 * Unrolled path for _ksize == 3 (with multiply-free [1 2 1] / [1 -2 1]
 * branches); any larger size uses a generic tap loop that folds the
 * mirrored rows (src[k] + src[-k]) before multiplying.
 *
 * NOTE(review): aligned _mm_load_ps/_mm_store_ps are used throughout —
 * all row buffers and dst must be 16-byte aligned, as arranged by
 * convolve2(). Returns the number of outputs produced.
 */
template<int cn, int _ksize>
struct SIMDColSymm<float,float,float,cn,_ksize>
{
#ifdef __arm__
    typedef float32x4_t __m128;  // NEON stand-in for the SSE vector type
#endif
    int operator()(float** buffer, float* dst, const float* _kernel, int rowSize) const
    {
        //rowSize *= cn;
        int i = 0, k;
        const int ksize2 = _ksize/2;
        // center the row-pointer array and the kernel on the middle tap
        const float** src = (const float**)buffer+ksize2;
        const float* ky = _kernel + ksize2;
        float _delta = 0.f;                 // post-filter bias (fixed at 0 here)
        __m128 d4 = _mm_set1_ps(_delta);
        if(_ksize == 3)
        {
            const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
            if( ky[0] == 2 && ky[1] == 1 )
            {
                // [1 2 1] vertical smoothing: adds only
                for( ; i <= rowSize - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_store_ps(dst + i, s0);
                    _mm_store_ps(dst + i + 4, s1);
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                // [1 -2 1] vertical second derivative: adds/subs only
                for( ; i <= rowSize - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_store_ps(dst + i, s0);
                    _mm_store_ps(dst + i + 4, s1);
                }
            }
            else
            {
                // general symmetric 3-tap: k0*middle + k1*(above + below)
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= rowSize - 8; i += 8 )
                {
                    __m128 s0, s1, x0, x1;
                    s0 = _mm_load_ps(S1 + i);
                    s1 = _mm_load_ps(S1 + i + 4);
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
                    x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
                    x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
                    _mm_store_ps(dst + i, s0);
                    _mm_store_ps(dst + i + 4, s1);
                }
            }
        }
        else // kernel greater than 3
        {
            const float *S, *S2;
            // main loop: 16 outputs per iteration
            for( ; i <= rowSize - 16; i += 16 )
            {
                // broadcast the center tap (on ARM load_ss already dups)
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128 x0, x1;
                S = src[0] + i;
                s0 = _mm_load_ps(S);
                s1 = _mm_load_ps(S+4);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_load_ps(S+8);
                s3 = _mm_load_ps(S+12);
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
                // fold mirrored rows together, then apply one multiply per pair
                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }
                _mm_store_ps(dst + i, s0);
                _mm_store_ps(dst + i + 4, s1);
                _mm_store_ps(dst + i + 8, s2);
                _mm_store_ps(dst + i + 12, s3);
            }
            // epilogue: 4 outputs per iteration
            for( ; i <= rowSize - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 x0, s0 = _mm_load_ps(src[0] + i);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    // NOTE(review): S/S2 below are assigned but the loads use
                    // src[k]+i / src[-k]+i directly — the assignments are redundant
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }
                _mm_storeu_ps(dst + i, s0);
            }
        }
        return i;
    }
};
template<int cn, int _ksize> | |
struct SIMDColASymm<float,float,float,cn,_ksize> | |
{ | |
#ifdef __arm__ | |
typedef float32x4_t __m128; | |
#endif | |
int operator()(float** buffer, float* dst, const float* _kernel, int rowSize) const | |
{ | |
//rowSize *= cn; | |
int i = 0, k; | |
const int ksize2 = _ksize/2; | |
const float** src = (const float**)buffer+ksize2; | |
const float* ky = _kernel + ksize2; | |
float _delta = 0.f; | |
__m128 d4 = _mm_set1_ps(_delta); | |
if(_ksize == 3) | |
{ | |
const float *S0 = src[-1], *S2 = src[1]; | |
if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] ) | |
{ | |
if( ky[1] < 0 ) | |
std::swap(S0, S2); | |
for( ; i <= rowSize - 8; i += 8 ) | |
{ | |
__m128 s0, s1, s2, s3; | |
s0 = _mm_load_ps(S2 + i); | |
s1 = _mm_load_ps(S2 + i + 4); | |
s2 = _mm_load_ps(S0 + i); | |
s3 = _mm_load_ps(S0 + i + 4); | |
s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4); | |
s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4); | |
_mm_store_ps(dst + i, s0); | |
_mm_store_ps(dst + i + 4, s1); | |
} | |
} | |
else | |
{ | |
__m128 k1 = _mm_set1_ps(ky[1]); | |
for( ; i <= rowSize - 8; i += 8 ) | |
{ | |
__m128 s0 = d4, s1 = d4, x0, x1; | |
x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i)); | |
x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4)); | |
s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1)); | |
s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1)); | |
_mm_store_ps(dst + i, s0); | |
_mm_store_ps(dst + i + 4, s1); | |
} | |
} | |
} | |
else // kernel greater than 3 | |
{ | |
const float *S, *S2; | |
for( ; i <= rowSize - 16; i += 16 ) | |
{ | |
__m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4; | |
__m128 x0, x1; | |
S = src[0] + i; | |
for( k = 1; k <= ksize2; k++ ) | |
{ | |
S = src[k] + i; | |
S2 = src[-k] + i; | |
f = _mm_load_ss(ky+k); | |
f = _mm_shuffle_ps(f, f, 0); | |
x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2)); | |
x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4)); | |
s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); | |
s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); | |
x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8)); | |
x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12)); | |
s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f)); | |
s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f)); | |
} | |
_mm_store_ps(dst + i, s0); | |
_mm_store_ps(dst + i + 4, s1); | |
_mm_store_ps(dst + i + 8, s2); | |
_mm_store_ps(dst + i + 12, s3); | |
} | |
for( ; i <= rowSize - 4; i += 4 ) | |
{ | |
__m128 f, x0, s0 = d4; | |
for( k = 1; k <= ksize2; k++ ) | |
{ | |
f = _mm_load_ss(ky+k); | |
f = _mm_shuffle_ps(f, f, 0); | |
x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); | |
s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); | |
} | |
_mm_store_ps(dst + i, s0); | |
} | |
} | |
return i; | |
} | |
}; | |
#endif | |
template<typename ST, typename DT, typename KT, int cn, int _ksize> | |
struct ConvolveRowBuffer | |
{ | |
const KT* _kernel; | |
SIMDRow<ST, DT, KT, cn, _ksize> _vecOp; | |
typedef ST stype; | |
typedef DT dtype; | |
enum { | |
Channels = cn, | |
KSize = _ksize | |
}; | |
ConvolveRowBuffer(const KT* kernel) | |
: _kernel(kernel) | |
{ | |
} | |
// horizontal pass | |
virtual void operator()(ST* buffer, DT* dst, int rowSize) const | |
{ | |
int i = 0, k; | |
const ST* S; | |
rowSize *= cn; | |
i = _vecOp(buffer, dst, _kernel, rowSize); | |
for( ; i < rowSize; i++ ) | |
{ | |
S = buffer + i; | |
DT s0 = _kernel[0]*S[0]; | |
for( k = 1; k < _ksize; k++ ) | |
{ | |
S += cn; | |
s0 += _kernel[k]*S[0]; | |
} | |
dst[i] = s0; | |
} | |
} | |
}; | |
template<typename ST, typename DT, typename KT, int cn, int _ksize> | |
struct ConvolveColBuffer | |
{ | |
const KT* _kernel; | |
SIMDCol<ST, DT, KT, cn, _ksize> _vecOp; | |
typedef ST stype; | |
typedef DT dtype; | |
enum { | |
Channels = cn, | |
KSize = _ksize | |
}; | |
ConvolveColBuffer(const KT* kernel) | |
: _kernel(kernel) | |
{ | |
} | |
// vertical pass | |
virtual void operator()(ST** buffer, DT* dst, int rowSize) const | |
{ | |
int i = 0, k; | |
rowSize *= cn; | |
i = _vecOp(buffer, dst, _kernel, rowSize); | |
DT _delta = 0; | |
for( ; i < rowSize; i++ ) | |
{ | |
ST s0 = _kernel[0]*(buffer[0])[i] + _delta; | |
for( k = 1; k < _ksize; k++ ) | |
{ | |
s0 += _kernel[k]*(buffer[k])[i]; | |
} | |
dst[i] = (s0); | |
} | |
} | |
}; | |
/**
 * Horizontal pass specialized for SYMMETRIC kernels: only the upper
 * half of the taps is read and mirrored samples are folded together
 * (S[j] + S[-j]) before multiplying.
 *
 * Unrolled 2-at-a-time scalar paths for _ksize 1, 3 and 5 (with
 * multiply-free branches for identity, [1 2 1], [1 -2 1] and
 * [1 0 -2 0 1]); the final generic loop handles every other size and
 * any leftover elements from the unrolled/SIMD paths.
 *
 * NOTE: _vecOp intentionally shadows the base-class member so the
 * symmetric SIMD specialization is picked up.
 */
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct ConvolveRowBufferSymm : public ConvolveRowBuffer<ST,DT,KT,cn,_ksize>
{
    SIMDRowSymm<ST, DT, KT, cn, _ksize> _vecOp;  // symmetric-kernel fast path
    ConvolveRowBufferSymm(const KT* kernel)
        :ConvolveRowBuffer<ST,DT,KT,cn,_ksize>(kernel)
    { }
    // horizontal pass
    void operator()(ST* buffer, DT* dst, int rowSize) const
    {
        int i = 0, j, k;
        rowSize *= cn;
        i = _vecOp(buffer, dst, ConvolveRowBuffer<ST,DT,KT,cn,_ksize>::_kernel, rowSize);
        const int ksize2 = _ksize/2;
        // S tracks the center sample of the current output position
        const ST* S = buffer + ksize2*cn;
        const KT* kx = ConvolveRowBuffer<ST,DT,KT,cn,_ksize>::_kernel + ksize2;
        if( _ksize == 1 && kx[0] == 1 )
        {
            // identity kernel: straight copy, two elements at a time
            for( ; i <= rowSize - 2; i += 2 )
            {
                DT s0 = S[i], s1 = S[i+1];
                dst[i] = s0; dst[i+1] = s1;
            }
            S += i;   // advance so the generic tail loop below stays in sync
        }
        else if( _ksize == 3 )
        {
            if( kx[0] == 2 && kx[1] == 1 )
                // [1 2 1]: adds only
                for( ; i <= rowSize - 2; i += 2, S += 2 )
                {
                    DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn];
                    dst[i] = s0; dst[i+1] = s1;
                }
            else if( kx[0] == -2 && kx[1] == 1 )
                // [1 -2 1]: second derivative
                for( ; i <= rowSize - 2; i += 2, S += 2 )
                {
                    DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn];
                    dst[i] = s0; dst[i+1] = s1;
                }
            else
            {
                // general symmetric 3-tap
                KT k0 = kx[0], k1 = kx[1];
                for( ; i <= rowSize - 2; i += 2, S += 2 )
                {
                    DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1;
                    dst[i] = s0; dst[i+1] = s1;
                }
            }
        }
        else if( _ksize == 5 )
        {
            KT k0 = kx[0], k1 = kx[1], k2 = kx[2];
            if( k0 == -2 && k1 == 0 && k2 == 1 )
                // [1 0 -2 0 1]: only center and +-2 neighbors contribute
                for( ; i <= rowSize - 2; i += 2, S += 2 )
                {
                    DT s0 = -2*S[0] + S[-cn*2] + S[cn*2];
                    DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2];
                    dst[i] = s0; dst[i+1] = s1;
                }
            else
                // general symmetric 5-tap
                for( ; i <= rowSize - 2; i += 2, S += 2 )
                {
                    DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2;
                    DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2;
                    dst[i] = s0; dst[i+1] = s1;
                }
        }
        // generic symmetric tail: handles other kernel sizes and any
        // elements left over by the unrolled/SIMD paths above
        for( ; i < rowSize; i++, S++ )
        {
            DT s0 = kx[0]*S[0];
            for( k = 1, j = cn; k <= ksize2; k++, j += cn )
                s0 += kx[k]*(S[j] + S[-j]);
            dst[i] = s0;
        }
    }
};
template<typename ST, typename DT, typename KT, int cn, int _ksize> | |
struct ConvolveColBufferSymm : public ConvolveColBuffer<ST,DT,KT,cn,_ksize> | |
{ | |
SIMDColSymm<ST, DT, KT, cn, _ksize> _vecOp; | |
ConvolveColBufferSymm(const KT* kernel) | |
:ConvolveColBuffer<ST,DT,KT,cn,_ksize>(kernel) | |
{ } | |
// vertical pass | |
void operator()(ST** buffer, DT* dst, int rowSize) const | |
{ | |
int i = 0, k; | |
rowSize *= cn; | |
i = _vecOp(buffer, dst, ConvolveColBuffer<ST,DT,KT,cn,_ksize>::_kernel, rowSize); | |
const int ksize2 = _ksize/2; | |
ST** src = buffer + ksize2; | |
const KT* ky = ConvolveColBuffer<ST,DT,KT,cn,_ksize>::_kernel + ksize2; | |
ST _delta = 0; | |
for( ; i < rowSize; i++ ) | |
{ | |
ST s0 = ky[0]*(src[0])[i] + _delta; | |
for( k = 1; k <= ksize2; k++ ) | |
{ | |
s0 += ky[k]*((src[k])[i] + (src[-k])[i]); | |
} | |
dst[i] = (s0); | |
} | |
} | |
}; | |
/**
 * Horizontal pass specialized for ANTI-SYMMETRIC kernels: mirrored
 * samples are differenced (S[j] - S[-j]) before multiplying, and the
 * unrolled paths skip the (zero) center tap entirely.
 *
 * Unrolled 2-at-a-time scalar paths for _ksize 3 and 5 (with a
 * multiply-free branch for [-1 0 1]); the final generic loop handles
 * every other size and any leftover elements. _vecOp shadows the base
 * member so the anti-symmetric SIMD specialization is used.
 */
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct ConvolveRowBufferASymm : public ConvolveRowBuffer<ST,DT,KT,cn,_ksize>
{
    SIMDRowASymm<ST, DT, KT, cn, _ksize> _vecOp;  // anti-symmetric fast path
    ConvolveRowBufferASymm(const KT* kernel)
        :ConvolveRowBuffer<ST,DT,KT,cn,_ksize>(kernel)
    { }
    // horizontal pass
    void operator()(ST* buffer, DT* dst, int rowSize) const
    {
        int i = 0, j, k;
        rowSize *= cn;
        i = _vecOp(buffer, dst, ConvolveRowBuffer<ST,DT,KT,cn,_ksize>::_kernel, rowSize);
        const int ksize2 = _ksize/2;
        // S tracks the center sample of the current output position
        const ST* S = buffer + ksize2*cn;
        const KT* kx = ConvolveRowBuffer<ST,DT,KT,cn,_ksize>::_kernel + ksize2;
        if( _ksize == 3 )
        {
            if( kx[0] == 0 && kx[1] == 1 )
                // [-1 0 1]: plain central difference
                for( ; i <= rowSize - 2; i += 2, S += 2 )
                {
                    DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
                    dst[i] = s0; dst[i+1] = s1;
                }
            else
            {
                // general anti-symmetric 3-tap: k1*(right - left)
                KT k1 = kx[1];
                for( ; i <= rowSize - 2; i += 2, S += 2 )
                {
                    DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1;
                    dst[i] = s0; dst[i+1] = s1;
                }
            }
        }
        else if( _ksize == 5 )
        {
            // k1*(+-1 difference) + k2*(+-2 difference); center tap unused
            KT k1 = kx[1], k2 = kx[2];
            for( ; i <= rowSize - 2; i += 2, S += 2 )
            {
                DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2;
                DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2;
                dst[i] = s0; dst[i+1] = s1;
            }
        }
        // generic anti-symmetric tail; note it DOES apply kx[0], so kernels
        // with a nonzero center tap are still handled correctly here
        for( ; i < rowSize; i++, S++ )
        {
            DT s0 = kx[0]*S[0];
            for( k = 1, j = cn; k <= ksize2; k++, j += cn )
                s0 += kx[k]*(S[j] - S[-j]);
            dst[i] = s0;
        }
    }
};
template<typename ST, typename DT, typename KT, int cn, int _ksize> | |
struct ConvolveColBufferASymm : public ConvolveColBuffer<ST,DT,KT,cn,_ksize> | |
{ | |
SIMDColASymm<ST, DT, KT, cn, _ksize> _vecOp; | |
ConvolveColBufferASymm(const KT* kernel) | |
:ConvolveColBuffer<ST,DT,KT,cn,_ksize>(kernel) | |
{ } | |
// vertical pass | |
void operator()(ST** buffer, DT* dst, int rowSize) const | |
{ | |
int i = 0, k; | |
rowSize *= cn; | |
i = _vecOp(buffer, dst, ConvolveColBuffer<ST,DT,KT,cn,_ksize>::_kernel, rowSize); | |
const int ksize2 = _ksize/2; | |
ST** src = buffer + ksize2; | |
const KT* ky = ConvolveColBuffer<ST,DT,KT,cn,_ksize>::_kernel + ksize2; | |
ST _delta = 0; | |
for( ; i < rowSize; i++ ) | |
{ | |
ST s0 = _delta; | |
for( k = 1; k <= ksize2; k++ ) | |
{ | |
s0 += ky[k]*((src[k])[i] - (src[-k])[i]); | |
} | |
dst[i] = (s0); | |
} | |
} | |
}; | |
/**
 * Fill `kernel` with a normalized 1D Gaussian of length `ksize`.
 *
 * For small odd sizes (1, 3, 5, 7) with sigma <= 0 a fixed table of
 * binomial-like taps is used; otherwise the taps are sampled from
 * exp(-x^2 / (2*sigma^2)) around the kernel center, with sigma derived
 * from ksize when not given. The result always sums to 1.
 *
 * @param kernel out: caller-allocated array of ksize floats
 * @param ksize  number of taps; must be > 0 (ksize <= 0 is a no-op)
 * @param sigma  Gaussian sigma; <= 0 selects an automatic value
 */
static void getGaussianKernel( float* kernel, int ksize, double sigma=0 )
{
    if( ksize <= 0 )
        return;   // nothing to fill; also avoids dividing by a zero sum below
    const int SMALL_GAUSSIAN_SIZE = 7;
    static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] =
    {
        {1.f},
        {0.25f, 0.5f, 0.25f},
        {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f},
        {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, 0.03125f}
    };
    // use the precomputed table only for small odd sizes with automatic sigma
    const float* fixed_kernel = ksize % 2 == 1 && ksize <= SMALL_GAUSSIAN_SIZE && sigma <= 0 ?
        small_gaussian_tab[ksize>>1] : 0;
    float* cf = kernel;
    // automatic sigma heuristic derived from the kernel size
    double sigmaX = sigma > 0 ? sigma : ((ksize-1)*0.5 - 1)*0.3 + 0.8;
    double scale2X = -0.5/(sigmaX*sigmaX);
    double sum = 0;
    int i;
    for( i = 0; i < ksize; ++i )
    {
        double x = i - (ksize-1)*0.5;   // signed distance from the kernel center
        double t = fixed_kernel ? (double)fixed_kernel[i] : std::exp(scale2X*x*x);
        cf[i] = static_cast<float>(t);
        sum += t;
    }
    // normalize in double precision so the taps sum to 1
    sum = 1./sum;
    for( i = 0; i < ksize; i++ )
    {
        cf[i] = static_cast<float>(static_cast<double>(cf[i])*sum);
    }
}
/**
 * Round `ptr` up to the next n-byte boundary.
 * @param n alignment in bytes; must be a power of two
 *          (defaults to the element size).
 */
template<typename _Tp>
static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp))
{
    const size_t addr = (size_t)ptr;
    const size_t mask = (size_t)(n - 1);      // low bits to clear; -n == ~(n-1)
    return (_Tp*)((addr + mask) & ~mask);
}
/**
 * Round `sz` up to the next multiple of `n`.
 * @param n must be a power of two.
 */
static inline size_t alignSize(const size_t sz, const int n)
{
    const size_t mask = (size_t)(n - 1);      // -n == ~(n-1) for power-of-two n
    return (sz + mask) & ~mask;
}
/**
 * Copy one interleaved image row into `rowbuf`, replicating the edge
 * pixels so a ksize-tap horizontal filter can run without bounds checks.
 * Output layout: [first pixel x ksize/2][w*cn samples][last pixel x ksize/2].
 *
 * @param src    source row, w*cn samples
 * @param rowbuf destination, (w + ksize - 1)*cn samples
 * @param w      row width in pixels
 */
template<typename T, typename DT, int cn, int ksize>
static inline void fillRow(const T* src, DT* rowbuf, int w)
{
    const int half = ksize/2;
    DT* out = rowbuf;
    // left border: repeat the first pixel (all cn channels) half times
    for(int rep = 0; rep < half; ++rep)
        for(int c = 0; c < cn; ++c)
            *out++ = src[c];
    // interior: straight copy of the whole row
    for(int idx = 0; idx < w*cn; ++idx)
        *out++ = src[idx];
    // right border: repeat the last pixel half times
    const T* last = src + (w*cn - cn);
    for(int rep = 0; rep < half; ++rep)
        for(int c = 0; c < cn; ++c)
            *out++ = last[c];
}
/** Plain conversion functor: maps ST to DT with static_cast, no rounding. */
template<typename ST, typename DT>
struct Cast
{
    typedef ST stype;
    typedef DT dtype;
    DT operator()(ST val) const
    {
        return static_cast<DT>(val);
    }
};
/**
 * Fixed-point rounding cast: adds half an LSB (DELTA = 2^(bits-1)) and
 * shifts right by `bits`, i.e. round-half-up from a Q`bits` value.
 */
template<typename ST, typename DT, int bits>
struct FixedPtCast
{
    typedef ST stype;
    typedef DT dtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };
    DT operator()(ST val) const
    {
        return static_cast<DT>((val + DELTA) >> SHIFT);
    }
};
/**
 * Primary traits template, intentionally empty: specializations are meant
 * to map (source, destination, kernel) element types to concrete
 * row/column filter classes and a cast functor.
 */
template<typename ST, typename DT, typename KT, int cn, int ksize>
struct ConvolveTraits
{
};
/* | |
template<int cn, int ksize> | |
struct ConvolveTraits<unsigned char, unsigned char, int, cn, ksize> | |
{ | |
typedef ConvolveRowBuffer<unsigned char, int, int, cn, ksize> cbx; | |
typedef ConvolveColBuffer<int, int, int, cn, ksize> cby; | |
typedef ConvolveRowBufferSymm<unsigned char, int, int, cn, ksize> cbx_symm; | |
typedef ConvolveColBufferSymm<int, int, int, cn, ksize> cby_symm; | |
typedef FixedPtCast<int,unsigned char, 16> cast; | |
}; | |
template<int cn, int ksize> | |
struct ConvolveTraits<unsigned char, unsigned char, float, cn, ksize> | |
{ | |
typedef ConvolveRowBuffer<float, float, float, cn, ksize> cbx; | |
typedef ConvolveColBuffer<float, float, float, cn, ksize> cby; | |
typedef ConvolveRowBufferSymm<float, float, float, cn, ksize> cbx_symm; | |
typedef ConvolveColBufferSymm<float, float, float, cn, ksize> cby_symm; | |
typedef Cast<float,unsigned char> cast; | |
}; | |
template<int cn, int ksize> | |
struct ConvolveTraits<float, float, float, cn, ksize> | |
{ | |
typedef ConvolveRowBuffer<float, float, float, cn, ksize> cbx; | |
typedef ConvolveColBuffer<float, float, float, cn, ksize> cby; | |
typedef ConvolveRowBufferSymm<float, float, float, cn, ksize> cbx_symm; | |
typedef ConvolveColBufferSymm<float, float, float, cn, ksize> cby_symm; | |
typedef Cast<float,float> cast; | |
}; | |
*/ | |
/**
 * Full separable convolution: horizontal pass (convX) into a ring of
 * intermediate rows, then vertical pass (convY) over that ring, with
 * border replication on all four sides.
 *
 * @param src/dst   image planes; strides are given in BYTES and divided
 *                  by sizeof(T) below
 * @param h, w      image height/width in pixels
 * @param convX     row filter (ConvolveRowBuffer* family); defines cn/ksize
 * @param convY     column filter (ConvolveColBuffer* family)
 * @param castOp    converts the working type back to T per sample
 *
 * NOTE(review): all scratch memory comes from alloca(), i.e. the stack —
 * row_step*(ksize+2)+16 bytes. For large widths/kernels this can
 * overflow the stack; alloca also needs <alloca.h>/<malloc.h> on some
 * platforms. Consider a heap buffer — confirm against target platforms.
 */
template<typename T, class Cx, class Cy, class Cast>
static void convolve2(const T* src, int h, int w, int srcStride, T* dst, int dstStride, const Cx& convX, const Cy& convY, const Cast& castOp)
{
    typedef typename Cx::dtype wtype;   // working type between the two passes
    typedef typename Cx::stype stype;   // row-buffer input type
    typedef typename Cy::dtype dtype;   // column-pass output type
    enum {
        cn = Cx::Channels,
        ksize = Cx::KSize,
        halfsize = ksize/2,
    };
    // strides arrive in bytes; convert to elements
    srcStride /= sizeof(T);
    dstStride /= sizeof(T);
    // one padded row, 16-byte aligned so the SIMD paths can use aligned ops
    size_t row_step = alignSize((w+ksize)*cn * sizeof(wtype), 16);
    // ksize+1 filtered rows (ring) + 1 shared row/output scratch buffer
    unsigned char* _buffer = (unsigned char*)alloca(row_step*(ksize+2) + 16);
    unsigned char* buffer = alignPtr(_buffer);
    wtype* rows[ksize+1];
    for(int i=0; i<ksize+1; ++i)
    {
        rows[i] = reinterpret_cast<wtype*>(&buffer[row_step*i]);
    }
    // rowbuf/outbuf alias the same scratch row: rowbuf feeds convX,
    // outbuf receives convY's result before the cast back to T
    stype* rowbuf = reinterpret_cast<stype*>(&buffer[row_step*(ksize+1)]);
    dtype* outbuf = reinterpret_cast<dtype*>(rowbuf);
    const T* input = src;
    T* output = dst;
    // prologue: filter the first halfsize+1 source rows into the upper
    // half of the ring...
    for(int i = halfsize; i < ksize; ++i, input+=srcStride)
    {
        fillRow<T, stype, cn, ksize>(input, rowbuf, w);
        convX(rowbuf, rows[i], w);
    }
    // ...and replicate the first filtered row into the top border slots
    for(int i = 0; i < halfsize; ++i)
    {
        memcpy(rows[i], rows[halfsize], row_step);
    }
    // steady state: each iteration filters one new source row into the
    // spare slot, runs the vertical pass, emits one output row, then
    // rotates the ring by one
    for(int i = halfsize+1; i < h; ++i, input+=srcStride, output+= dstStride)
    {
        wtype* next_row = rows[ksize];
        fillRow<T, stype, cn, ksize>(input, rowbuf, w);
        convX(rowbuf, next_row, w);
        convY(rows, outbuf, w);
        for(int j = 0; j < w*cn; ++j)
        {
            output[j] = castOp(outbuf[j]);
        }
        //
        wtype* tmp = rows[0];
        for(int r = 0; r < ksize; ++r)
        {
            rows[r] = rows[r+1];
        }
        rows[ksize] = tmp;
    }
    assert(input == src+h*srcStride);
    // last rows: replicate the bottom row into the spare slot and keep
    // emitting until all h output rows are written
    for(int i = 0; i < halfsize+1; ++i, output+= dstStride)
    {
        wtype* next_row = rows[ksize];
        memcpy(next_row, rows[ksize-1], row_step);
        convY(rows, outbuf, w);
        for(int j = 0; j < w*cn; ++j)
        {
            output[j] = castOp(outbuf[j]);
        }
        //
        wtype* tmp = rows[0];
        for(int r = 0; r < ksize; ++r)
        {
            rows[r] = rows[r+1];
        }
        rows[ksize] = tmp;
    }
    assert(output == dst+h*dstStride);
}
/**
 * Gaussian blur of a cn-channel image using the separable pipeline above.
 *
 * Builds a ksize-tap Gaussian (automatic sigma) and runs the symmetric
 * row + column filters in float, casting back to T per pixel.
 *
 * @param src/dst    image planes (may not alias — convolve2 streams rows)
 * @param h, w       image height/width in pixels
 * @param srcStride  source stride in BYTES
 * @param dstStride  destination stride in BYTES
 *
 * ksize is a template parameter but only checked at runtime: the local
 * kernel array holds at most 33 taps.
 */
template<typename T, int cn, int ksize>
static void gaussianSmooth(const T* src, int h, int w, int srcStride, T* dst, int dstStride)
{
    // 16-byte aligned so SIMD kernel loads are safe
    float CV_DECL_ALIGNED(16) kernel[33];
    assert(ksize<=33);
    getGaussianKernel(&kernel[0], ksize, 0);   // sigma=0 -> derived from ksize
    /*
    // we can use Integer kernel for Byte images
    int CV_DECL_ALIGNED(16) ikernel[33];
    int bits = 8;
    for(int i = 0; i < ksize; ++i)
    {
        ikernel[i] = static_cast<int>(kernel[i] * (1 << bits));
    }
    typedef ConvolveRowBufferSymm<unsigned char, int, int, cn, ksize> Cx;
    typedef ConvolveColBufferSymm<int, int, int, cn, ksize> Cy;
    typedef FixedPtCast<int,unsigned char, 16> cast;
    convolve2<T, Cx, Cy>(src, h, w, srcStride, dst, dstStride, Cx(ikernel), Cy(ikernel), cast());
    */
    //
    // Gaussian kernels are symmetric, so the Symm row/column filters apply
    typedef ConvolveRowBufferSymm<float, float, float, cn, ksize> Cx;
    typedef ConvolveColBufferSymm<float, float, float, cn, ksize> Cy;
    typedef Cast<float,T> cast;
    convolve2<T, Cx, Cy>(src, h, w, srcStride, dst, dstStride, Cx(kernel), Cy(kernel), cast());
    //
}
} // namespace | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment