Skip to content

Instantly share code, notes, and snippets.

@yoffy
Last active June 14, 2020 13:23
Show Gist options
  • Save yoffy/71eff4df6dfd187420c9142b08f7fe30 to your computer and use it in GitHub Desktop.
Save yoffy/71eff4df6dfd187420c9142b08f7fe30 to your computer and use it in GitHub Desktop.
bilinear image resampling (with AVX2)
// clang++ -c -std=c++11 -Wall -Wextra -Ofast -g -march=native -D NDEBUG
#include <stdint.h>
#include <math.h>
#include <vector>
#include <immintrin.h>
struct image
{
int width;
int height;
image(int w, int h)
{
width = w;
height = h;
}
};
using u8 = uint8_t;
using u16 = uint16_t;
using u32 = uint32_t;
using u64 = uint64_t;
namespace {
const u16 kBiasBit = 8; // 小数部8bit
const u16 kBias = 1 << kBiasBit;
const u16 k0_5 = kBias / 2; // 0.5
//! 固定小数を整数に丸める(小数部8bit)
u16 round_fixed(u16 value)
{
return (value + k0_5) / kBias; // round(v) = floor(v + 0.5)
}
//! 出力座標に対する入力座標と係数の計算
//!
//! coefs はkBias基にした1固定小数点数で注目した画素の次のピクセルの係数が入る。
//! @code
//! indices[i] = floor(i / scale)
//! coefs[i] = (i / scale) - indices[i]
//! @endcode
void calcIndicesAndCoefs(int length, float scale, u16 * indices, u16 * coefs)
{
for ( int i = 0; i < length; i++ ) {
float iSrc;
float coef1 = modff(i / scale, &iSrc);
indices[i] = u16(iSrc);
coefs[i] = u16(coef1 * kBias);
}
}
}
//==================================================
// resize_basic
//==================================================
namespace {
//! Y方向の加算(1行のみ)
void basic_Y(const u8 * src, u16 * dst, int width, u16 coefY0, u16 coefY1)
{
for ( int srcX = 0; srcX < width; srcX++ ) {
// 入力 (小数部0bit)
u16 a = src[srcX];
u16 c = src[srcX + width];
// Y方向の係数を掛けて足す(小数部8bit→0bit)
dst[srcX] = round_fixed(a*coefY0 + c*coefY1);
}
}
//! X方向の加算(1行のみ)
void basic_X(const u16 * src, u8 * dst, int width, const u16 * indicesX, const u16 * coefsX)
{
for ( int dstX = 0; dstX < width; dstX++ ) {
u16 iSrcX = indicesX[dstX];
// 入力 (小数部0bit)
u16 ac = src[iSrcX + 0];
u16 bd = src[iSrcX + 1];
// 係数 (小数部8bit)
u16 coefX1 = coefsX[dstX];
u16 coefX0 = kBias - coefX1;
// X方向の係数を掛けて足す(小数部8bit→0bit)
dst[dstX] = round_fixed(ac*coefX0 + bd*coefX1);
}
}
}
//! 整数部8bit, 小数部8bitのbilinear
void resize_basic(const image *src, const image *dst, float scale, const u8 *src_buff, u8 *dst_buff)
{
int srcW = src->width;
int dstW = dst->width;
int dstH = dst->height;
// X軸方向の添え字と係数
std::vector<u16> indicesX(dstW);
std::vector<u16> coefsX(dstW);
calcIndicesAndCoefs(dstW, scale, &indicesX[0], &coefsX[0]);
// Y軸方向の添え字と係数
std::vector<u16> indicesY(dstH);
std::vector<u16> coefsY(dstH);
calcIndicesAndCoefs(dstH, scale, &indicesY[0], &coefsY[0]);
// 一行分のバッファ (小数部8bit)
std::vector<u16> line(srcW);
// ピクセルの並びは
// a b
// c d
// とする
for ( int dstY = 0; dstY < dstH; dstY++ ) {
u16 srcY = indicesY[dstY];
// Y方向の加算
u16 coefY1 = coefsY[dstY];
u16 coefY0 = kBias - coefY1;
basic_Y(&src_buff[srcY*srcW], &line[0], srcW, coefY0, coefY1);
// X方向の加算
basic_X(&line[0], &dst_buff[dstY*dst->width], dstW, &indicesX[0], &coefsX[0]);
}
}
//==================================================
// resize_avx2
//==================================================
namespace {
//! 固定小数を整数に丸める(小数部8bit)
__m256i round_fixed_epu16(__m256i u16x16value)
{
const __m256i u16x16k0_5 = _mm256_set1_epi16(k0_5);
// round(v) = floor(v + 0.5)
return _mm256_srli_epi16(_mm256_add_epi16(u16x16value, u16x16k0_5), kBiasBit);
}
//! Y方向の加算(1行のみ)
void avx2_Y(const u8 * src, u16 * dst, int width, u16 coefY0, u16 coefY1)
{
const int kElems = sizeof(__m256i) / sizeof(u16);
__m256i u16x16coefY0 = _mm256_set1_epi16(coefY0);
__m256i u16x16coefY1 = _mm256_set1_epi16(coefY1);
for ( int srcX = 0; srcX < width; srcX += kElems ) {
// 入力 (小数部0bit)
__m128i u8x16a = _mm_loadu_si128((const __m128i *)&src[srcX]);
__m128i u8x16c = _mm_loadu_si128((const __m128i *)&src[srcX + width]);
__m256i u16x16a = _mm256_cvtepu8_epi16(u8x16a);
__m256i u16x16c = _mm256_cvtepu8_epi16(u8x16c);
// Y方向の係数を掛けて足す (小数部8bit)
__m256i u16x16Y0 = _mm256_mullo_epi16(u16x16a, u16x16coefY0);
__m256i u16x16Y1 = _mm256_mullo_epi16(u16x16c, u16x16coefY1);
__m256i u16x16ac = _mm256_add_epi16(u16x16Y0, u16x16Y1);
// 整数に変換 (小数部0bit)
__m256i u16x16dst = round_fixed_epu16(u16x16ac);
_mm256_storeu_si256((__m256i *)&dst[srcX], u16x16dst);
}
}
//! src[indices[i]] 及び src[indices[i] + 1] の内容を返す
__m256i avx2LoadSrc(const u16 * src, const u16 * indices)
{
const __m128i k0 = _mm_setzero_si128();
// i0 i2 i4 i6 i8 i10 i12 i14
__m128i u16x8indices = _mm_loadu_si128((const __m128i *)indices);
// i0 i2 i4 i6
__m128i s32x4indices0 = _mm_unpacklo_epi16(u16x8indices, k0);
__m128i s32x4indices1 = _mm_unpackhi_epi16(u16x8indices, k0);
// i0 i2 i4 i6 i8 i10 i12 i14
__m256i s32x8indices = _mm256_set_m128i(s32x4indices1, s32x4indices0);
// x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15
// 32bitずつロードするが、隣のピクセルとの計算なので都合が良い
return _mm256_i32gather_epi32((const int *)src, s32x8indices, sizeof(u16));
}
//! coefs[i] 及び kBias - coefs[i] の内容を返す
__m256i avx2LoadCoefs(const u16 * coefs)
{
const __m128i u16x8kBias = _mm_set1_epi16(kBias);
// c1 c3 c5 c7 c9 c11 c13 c15
__m128i u16x8coefOd = _mm_loadu_si128((const __m128i *)coefs);
// c0 c2 c4 c6 c8 c10 c12 c14
__m128i u16x8coefEv = _mm_sub_epi16(u16x8kBias, u16x8coefOd);
// c0 c1 c2 c3 c4 c5 c6 c7
__m128i u16x8coef0 = _mm_unpacklo_epi16(u16x8coefEv, u16x8coefOd);
__m128i u16x8coef1 = _mm_unpackhi_epi16(u16x8coefEv, u16x8coefOd);
// c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15
return _mm256_set_m128i(u16x8coef1, u16x8coef0);
}
__m256i packus_epi32(__m256i u32x8lo, __m256i u32x8hi)
{
__m256i u16x16tmp = _mm256_packus_epi32(u32x8lo, u32x8hi);
return _mm256_permute4x64_epi64(u16x16tmp, _MM_SHUFFLE(3, 1, 2, 0));
}
__m128i cvtepi16_epi8(__m256i u16x16v)
{
__m128i u16x8lo = _mm256_castsi256_si128(u16x16v);
__m128i u16x8hi = _mm256_extracti128_si256(u16x16v, 1);
return _mm_packus_epi16(u16x8lo, u16x8hi);
}
//! X方向の加算(1行のみ)
void avx2_X(const u16 * src, u8 * dst, int width, const u16 * indicesX, const u16 * coefsX)
{
const int kElems = sizeof(__m128i) / sizeof(u16);
for ( int dstX = 0; dstX < width; dstX += kElems*2 ) {
// 入力 (小数部0bit)
__m256i u16x16src0 = avx2LoadSrc(src, &indicesX[dstX ]);
__m256i u16x16src1 = avx2LoadSrc(src, &indicesX[dstX + kElems]);
// 係数 (小数部8bit)
__m256i u16x16coef0 = avx2LoadCoefs(&coefsX[dstX ]);
__m256i u16x16coef1 = avx2LoadCoefs(&coefsX[dstX + kElems]);
// acbd = u32(src0*coef0 + src1*coef1) (小数部8bit)
__m256i u32x8acbd0 = _mm256_madd_epi16(u16x16src0, u16x16coef0);
__m256i u32x8acbd1 = _mm256_madd_epi16(u16x16src1, u16x16coef1);
// 整数に変換 (小数部0bit)
__m256i u16x16acbd = packus_epi32(u32x8acbd0, u32x8acbd1);
__m256i u16x16dst = round_fixed_epu16(u16x16acbd);
// 8bitに変換
__m128i u8x16dst = cvtepi16_epi8(u16x16dst);
_mm_storeu_si128(reinterpret_cast<__m128i *>(&dst[dstX]), u8x16dst);
}
}
}
//! 整数部8bit, 小数部8bitのbilinear
void resize_avx2(const image *src, const image *dst, float scale, const u8 *src_buff, u8 *dst_buff)
{
const int srcW = src->width;
const int dstW = dst->width;
const int dstH = dst->height;
// X軸方向の添え字と係数
std::vector<u16> indicesX(dstW);
std::vector<u16> coefsX(dstW);
calcIndicesAndCoefs(dstW, scale, &indicesX[0], &coefsX[0]);
// Y軸方向の添え字と係数
std::vector<u16> indicesY(dstH);
std::vector<u16> coefsY(dstH);
calcIndicesAndCoefs(dstH, scale, &indicesY[0], &coefsY[0]);
// 一行分のバッファ (小数部8bit)
std::vector<u16> line(srcW);
// ピクセルの並びは
// a b
// c d
// とする
for ( int dstY = 0; dstY < dstH; dstY++ ) {
u16 srcY = indicesY[dstY];
// Y方向の加算
u16 coefY1 = coefsY[dstY];
u16 coefY0 = kBias - coefY1;
avx2_Y(&src_buff[srcY * srcW], &line[0], srcW, coefY0, coefY1);
// X方向の加算
avx2_X(&line[0], &dst_buff[dstY*dstW], dstW, &indicesX[0], &coefsX[0]);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment