Last active
June 14, 2020 13:23
-
-
Save yoffy/71eff4df6dfd187420c9142b08f7fe30 to your computer and use it in GitHub Desktop.
bilinear image resampling (with AVX2)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // clang++ -c -std=c++11 -Wall -Wextra -Ofast -g -march=native -D NDEBUG | |
| #include <stdint.h> | |
| #include <math.h> | |
| #include <vector> | |
| #include <immintrin.h> | |
| struct image | |
| { | |
| int width; | |
| int height; | |
| image(int w, int h) | |
| { | |
| width = w; | |
| height = h; | |
| } | |
| }; | |
| using u8 = uint8_t; | |
| using u16 = uint16_t; | |
| using u32 = uint32_t; | |
| using u64 = uint64_t; | |
| namespace { | |
| const u16 kBiasBit = 8; // 小数部8bit | |
| const u16 kBias = 1 << kBiasBit; | |
| const u16 k0_5 = kBias / 2; // 0.5 | |
| //! 固定小数を整数に丸める(小数部8bit) | |
| u16 round_fixed(u16 value) | |
| { | |
| return (value + k0_5) / kBias; // round(v) = floor(v + 0.5) | |
| } | |
| //! 出力座標に対する入力座標と係数の計算 | |
| //! | |
| //! coefs はkBias基にした1固定小数点数で注目した画素の次のピクセルの係数が入る。 | |
| //! @code | |
| //! indices[i] = floor(i / scale) | |
| //! coefs[i] = (i / scale) - indices[i] | |
| //! @endcode | |
| void calcIndicesAndCoefs(int length, float scale, u16 * indices, u16 * coefs) | |
| { | |
| for ( int i = 0; i < length; i++ ) { | |
| float iSrc; | |
| float coef1 = modff(i / scale, &iSrc); | |
| indices[i] = u16(iSrc); | |
| coefs[i] = u16(coef1 * kBias); | |
| } | |
| } | |
| } | |
| //================================================== | |
| // resize_basic | |
| //================================================== | |
| namespace { | |
| //! Y方向の加算(1行のみ) | |
| void basic_Y(const u8 * src, u16 * dst, int width, u16 coefY0, u16 coefY1) | |
| { | |
| for ( int srcX = 0; srcX < width; srcX++ ) { | |
| // 入力 (小数部0bit) | |
| u16 a = src[srcX]; | |
| u16 c = src[srcX + width]; | |
| // Y方向の係数を掛けて足す(小数部8bit→0bit) | |
| dst[srcX] = round_fixed(a*coefY0 + c*coefY1); | |
| } | |
| } | |
| //! X方向の加算(1行のみ) | |
| void basic_X(const u16 * src, u8 * dst, int width, const u16 * indicesX, const u16 * coefsX) | |
| { | |
| for ( int dstX = 0; dstX < width; dstX++ ) { | |
| u16 iSrcX = indicesX[dstX]; | |
| // 入力 (小数部0bit) | |
| u16 ac = src[iSrcX + 0]; | |
| u16 bd = src[iSrcX + 1]; | |
| // 係数 (小数部8bit) | |
| u16 coefX1 = coefsX[dstX]; | |
| u16 coefX0 = kBias - coefX1; | |
| // X方向の係数を掛けて足す(小数部8bit→0bit) | |
| dst[dstX] = round_fixed(ac*coefX0 + bd*coefX1); | |
| } | |
| } | |
| } | |
| //! 整数部8bit, 小数部8bitのbilinear | |
| void resize_basic(const image *src, const image *dst, float scale, const u8 *src_buff, u8 *dst_buff) | |
| { | |
| int srcW = src->width; | |
| int dstW = dst->width; | |
| int dstH = dst->height; | |
| // X軸方向の添え字と係数 | |
| std::vector<u16> indicesX(dstW); | |
| std::vector<u16> coefsX(dstW); | |
| calcIndicesAndCoefs(dstW, scale, &indicesX[0], &coefsX[0]); | |
| // Y軸方向の添え字と係数 | |
| std::vector<u16> indicesY(dstH); | |
| std::vector<u16> coefsY(dstH); | |
| calcIndicesAndCoefs(dstH, scale, &indicesY[0], &coefsY[0]); | |
| // 一行分のバッファ (小数部8bit) | |
| std::vector<u16> line(srcW); | |
| // ピクセルの並びは | |
| // a b | |
| // c d | |
| // とする | |
| for ( int dstY = 0; dstY < dstH; dstY++ ) { | |
| u16 srcY = indicesY[dstY]; | |
| // Y方向の加算 | |
| u16 coefY1 = coefsY[dstY]; | |
| u16 coefY0 = kBias - coefY1; | |
| basic_Y(&src_buff[srcY*srcW], &line[0], srcW, coefY0, coefY1); | |
| // X方向の加算 | |
| basic_X(&line[0], &dst_buff[dstY*dst->width], dstW, &indicesX[0], &coefsX[0]); | |
| } | |
| } | |
| //================================================== | |
| // resize_avx2 | |
| //================================================== | |
| namespace { | |
| //! 固定小数を整数に丸める(小数部8bit) | |
| __m256i round_fixed_epu16(__m256i u16x16value) | |
| { | |
| const __m256i u16x16k0_5 = _mm256_set1_epi16(k0_5); | |
| // round(v) = floor(v + 0.5) | |
| return _mm256_srli_epi16(_mm256_add_epi16(u16x16value, u16x16k0_5), kBiasBit); | |
| } | |
| //! Y方向の加算(1行のみ) | |
| void avx2_Y(const u8 * src, u16 * dst, int width, u16 coefY0, u16 coefY1) | |
| { | |
| const int kElems = sizeof(__m256i) / sizeof(u16); | |
| __m256i u16x16coefY0 = _mm256_set1_epi16(coefY0); | |
| __m256i u16x16coefY1 = _mm256_set1_epi16(coefY1); | |
| for ( int srcX = 0; srcX < width; srcX += kElems ) { | |
| // 入力 (小数部0bit) | |
| __m128i u8x16a = _mm_loadu_si128((const __m128i *)&src[srcX]); | |
| __m128i u8x16c = _mm_loadu_si128((const __m128i *)&src[srcX + width]); | |
| __m256i u16x16a = _mm256_cvtepu8_epi16(u8x16a); | |
| __m256i u16x16c = _mm256_cvtepu8_epi16(u8x16c); | |
| // Y方向の係数を掛けて足す (小数部8bit) | |
| __m256i u16x16Y0 = _mm256_mullo_epi16(u16x16a, u16x16coefY0); | |
| __m256i u16x16Y1 = _mm256_mullo_epi16(u16x16c, u16x16coefY1); | |
| __m256i u16x16ac = _mm256_add_epi16(u16x16Y0, u16x16Y1); | |
| // 整数に変換 (小数部0bit) | |
| __m256i u16x16dst = round_fixed_epu16(u16x16ac); | |
| _mm256_storeu_si256((__m256i *)&dst[srcX], u16x16dst); | |
| } | |
| } | |
| //! src[indices[i]] 及び src[indices[i] + 1] の内容を返す | |
| __m256i avx2LoadSrc(const u16 * src, const u16 * indices) | |
| { | |
| const __m128i k0 = _mm_setzero_si128(); | |
| // i0 i2 i4 i6 i8 i10 i12 i14 | |
| __m128i u16x8indices = _mm_loadu_si128((const __m128i *)indices); | |
| // i0 i2 i4 i6 | |
| __m128i s32x4indices0 = _mm_unpacklo_epi16(u16x8indices, k0); | |
| __m128i s32x4indices1 = _mm_unpackhi_epi16(u16x8indices, k0); | |
| // i0 i2 i4 i6 i8 i10 i12 i14 | |
| __m256i s32x8indices = _mm256_set_m128i(s32x4indices1, s32x4indices0); | |
| // x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15 | |
| // 32bitずつロードするが、隣のピクセルとの計算なので都合が良い | |
| return _mm256_i32gather_epi32((const int *)src, s32x8indices, sizeof(u16)); | |
| } | |
| //! coefs[i] 及び kBias - coefs[i] の内容を返す | |
| __m256i avx2LoadCoefs(const u16 * coefs) | |
| { | |
| const __m128i u16x8kBias = _mm_set1_epi16(kBias); | |
| // c1 c3 c5 c7 c9 c11 c13 c15 | |
| __m128i u16x8coefOd = _mm_loadu_si128((const __m128i *)coefs); | |
| // c0 c2 c4 c6 c8 c10 c12 c14 | |
| __m128i u16x8coefEv = _mm_sub_epi16(u16x8kBias, u16x8coefOd); | |
| // c0 c1 c2 c3 c4 c5 c6 c7 | |
| __m128i u16x8coef0 = _mm_unpacklo_epi16(u16x8coefEv, u16x8coefOd); | |
| __m128i u16x8coef1 = _mm_unpackhi_epi16(u16x8coefEv, u16x8coefOd); | |
| // c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 | |
| return _mm256_set_m128i(u16x8coef1, u16x8coef0); | |
| } | |
| __m256i packus_epi32(__m256i u32x8lo, __m256i u32x8hi) | |
| { | |
| __m256i u16x16tmp = _mm256_packus_epi32(u32x8lo, u32x8hi); | |
| return _mm256_permute4x64_epi64(u16x16tmp, _MM_SHUFFLE(3, 1, 2, 0)); | |
| } | |
| __m128i cvtepi16_epi8(__m256i u16x16v) | |
| { | |
| __m128i u16x8lo = _mm256_castsi256_si128(u16x16v); | |
| __m128i u16x8hi = _mm256_extracti128_si256(u16x16v, 1); | |
| return _mm_packus_epi16(u16x8lo, u16x8hi); | |
| } | |
| //! X方向の加算(1行のみ) | |
| void avx2_X(const u16 * src, u8 * dst, int width, const u16 * indicesX, const u16 * coefsX) | |
| { | |
| const int kElems = sizeof(__m128i) / sizeof(u16); | |
| for ( int dstX = 0; dstX < width; dstX += kElems*2 ) { | |
| // 入力 (小数部0bit) | |
| __m256i u16x16src0 = avx2LoadSrc(src, &indicesX[dstX ]); | |
| __m256i u16x16src1 = avx2LoadSrc(src, &indicesX[dstX + kElems]); | |
| // 係数 (小数部8bit) | |
| __m256i u16x16coef0 = avx2LoadCoefs(&coefsX[dstX ]); | |
| __m256i u16x16coef1 = avx2LoadCoefs(&coefsX[dstX + kElems]); | |
| // acbd = u32(src0*coef0 + src1*coef1) (小数部8bit) | |
| __m256i u32x8acbd0 = _mm256_madd_epi16(u16x16src0, u16x16coef0); | |
| __m256i u32x8acbd1 = _mm256_madd_epi16(u16x16src1, u16x16coef1); | |
| // 整数に変換 (小数部0bit) | |
| __m256i u16x16acbd = packus_epi32(u32x8acbd0, u32x8acbd1); | |
| __m256i u16x16dst = round_fixed_epu16(u16x16acbd); | |
| // 8bitに変換 | |
| __m128i u8x16dst = cvtepi16_epi8(u16x16dst); | |
| _mm_storeu_si128(reinterpret_cast<__m128i *>(&dst[dstX]), u8x16dst); | |
| } | |
| } | |
| } | |
| //! 整数部8bit, 小数部8bitのbilinear | |
| void resize_avx2(const image *src, const image *dst, float scale, const u8 *src_buff, u8 *dst_buff) | |
| { | |
| const int srcW = src->width; | |
| const int dstW = dst->width; | |
| const int dstH = dst->height; | |
| // X軸方向の添え字と係数 | |
| std::vector<u16> indicesX(dstW); | |
| std::vector<u16> coefsX(dstW); | |
| calcIndicesAndCoefs(dstW, scale, &indicesX[0], &coefsX[0]); | |
| // Y軸方向の添え字と係数 | |
| std::vector<u16> indicesY(dstH); | |
| std::vector<u16> coefsY(dstH); | |
| calcIndicesAndCoefs(dstH, scale, &indicesY[0], &coefsY[0]); | |
| // 一行分のバッファ (小数部8bit) | |
| std::vector<u16> line(srcW); | |
| // ピクセルの並びは | |
| // a b | |
| // c d | |
| // とする | |
| for ( int dstY = 0; dstY < dstH; dstY++ ) { | |
| u16 srcY = indicesY[dstY]; | |
| // Y方向の加算 | |
| u16 coefY1 = coefsY[dstY]; | |
| u16 coefY0 = kBias - coefY1; | |
| avx2_Y(&src_buff[srcY * srcW], &line[0], srcW, coefY0, coefY1); | |
| // X方向の加算 | |
| avx2_X(&line[0], &dst_buff[dstY*dstW], dstW, &indicesX[0], &coefsX[0]); | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment