yoffy · June 14, 2020 13:23
diff --git a/resize.cpp b/resize.cpp
 // clang++ -c -std=c++11 -Wall -Wextra -Ofast -g -march=native -D NDEBUG
 #include <stdint.h>
 #include <math.h>
 #include <vector>
 #include <immintrin.h>

 struct image
 {
    int width;
    int height;

    image(int w, int h)
    {
        width = w;
        height = h;
    }
 };

 using u8 = uint8_t;
 using u16 = uint16_t;
 using u32 = uint32_t;
 using u64 = uint64_t;

 namespace {

    const u16 kBiasBit = 8;             // 小数部8bit
    const u16 kBias    = 1 << kBiasBit;
    const u16 k0_5     = kBias / 2;     // 0.5

    //! 固定小数を整数に丸める(小数部8bit)
    u16 round_fixed(u16 value)
    {
        return (value + k0_5) / kBias; // round(v) = floor(v + 0.5)
    }

    //! 出力座標に対する入力座標と係数の計算
    //!
    //! coefs はkBias基にした1固定小数点数で注目した画素の次のピクセルの係数が入る。
    //! @code
    //! indices[i] = floor(i / scale)
    //! coefs[i] = (i / scale) - indices[i]
    //! @endcode
    void calcIndicesAndCoefs(int length, float scale, u16 * indices, u16 * coefs)
    {
        for ( int i = 0; i < length; i++ ) {
            float iSrc;
            float coef1 = modff(i / scale, &iSrc);
            indices[i] = u16(iSrc);
            coefs[i] = u16(coef1 * kBias);
        }
    }

 }



 //==================================================
 // resize_basic
 //==================================================

 namespace {

    //! Y方向の加算(1行のみ)
    void basic_Y(const u8 * src, u16 * dst, int width, u16 coefY0, u16 coefY1)
    {
        for ( int srcX = 0; srcX < width; srcX++ ) {
            // 入力 (小数部0bit)
            u16 a = src[srcX];
            u16 c = src[srcX + width];

            // Y方向の係数を掛けて足す(小数部8bit→0bit)
            dst[srcX] = round_fixed(a*coefY0 + c*coefY1);
        }
    }

    //! X方向の加算(1行のみ)
    void basic_X(const u16 * src, u8 * dst, int width, const u16 * indicesX, const u16 * coefsX)
    {
        for ( int dstX = 0; dstX < width; dstX++ ) {
            u16 iSrcX  = indicesX[dstX];

            // 入力 (小数部0bit)
            u16 ac = src[iSrcX + 0];
            u16 bd = src[iSrcX + 1];

            // 係数 (小数部8bit)
            u16 coefX1 = coefsX[dstX];
            u16 coefX0 = kBias - coefX1;

            // X方向の係数を掛けて足す(小数部8bit→0bit)
            dst[dstX] = round_fixed(ac*coefX0 + bd*coefX1);
        }
    }

 }

 //! 整数部8bit, 小数部8bitのbilinear
 void resize_basic(const image *src, const image *dst, float scale, const u8 *src_buff, u8 *dst_buff)
 {
    int srcW = src->width;
    int dstW = dst->width;
    int dstH = dst->height;

    // X軸方向の添え字と係数
    std::vector<u16> indicesX(dstW);
    std::vector<u16> coefsX(dstW);
    calcIndicesAndCoefs(dstW, scale, &indicesX[0], &coefsX[0]);

    // Y軸方向の添え字と係数
    std::vector<u16> indicesY(dstH);
    std::vector<u16> coefsY(dstH);
    calcIndicesAndCoefs(dstH, scale, &indicesY[0], &coefsY[0]);

    // 一行分のバッファ (小数部8bit)
    std::vector<u16> line(srcW);

    // ピクセルの並びは
    // a b
    // c d
    // とする
    for ( int dstY = 0; dstY < dstH; dstY++ ) {
        u16 srcY   = indicesY[dstY];

        // Y方向の加算
        u16 coefY1 = coefsY[dstY];
        u16 coefY0 = kBias - coefY1;
        basic_Y(&src_buff[srcY*srcW], &line[0], srcW, coefY0, coefY1);

        // X方向の加算
        basic_X(&line[0], &dst_buff[dstY*dst->width], dstW, &indicesX[0], &coefsX[0]);
    }
 }



 //==================================================
 // resize_avx2
 //==================================================

 namespace {

    //! 固定小数を整数に丸める(小数部8bit)
    __m256i round_fixed_epu16(__m256i u16x16value)
    {
        const __m256i u16x16k0_5 = _mm256_set1_epi16(k0_5);
        // round(v) = floor(v + 0.5)
        return _mm256_srli_epi16(_mm256_add_epi16(u16x16value, u16x16k0_5), kBiasBit);
    }

    //! Y方向の加算(1行のみ)
    void avx2_Y(const u8 * src, u16 * dst, int width, u16 coefY0, u16 coefY1)
    {
        const int kElems = sizeof(__m256i) / sizeof(u16);
        __m256i u16x16coefY0 = _mm256_set1_epi16(coefY0);
        __m256i u16x16coefY1 = _mm256_set1_epi16(coefY1);
        for ( int srcX = 0; srcX < width; srcX += kElems ) {
            // 入力 (小数部0bit)
            __m128i u8x16a    = _mm_loadu_si128((const __m128i *)&src[srcX]);
            __m128i u8x16c    = _mm_loadu_si128((const __m128i *)&src[srcX + width]);
            __m256i u16x16a   = _mm256_cvtepu8_epi16(u8x16a);
            __m256i u16x16c   = _mm256_cvtepu8_epi16(u8x16c);
            // Y方向の係数を掛けて足す (小数部8bit)
            __m256i u16x16Y0  = _mm256_mullo_epi16(u16x16a, u16x16coefY0);
            __m256i u16x16Y1  = _mm256_mullo_epi16(u16x16c, u16x16coefY1);
            __m256i u16x16ac  = _mm256_add_epi16(u16x16Y0, u16x16Y1);
            // 整数に変換 (小数部0bit)
            __m256i u16x16dst = round_fixed_epu16(u16x16ac);
            _mm256_storeu_si256((__m256i *)&dst[srcX], u16x16dst);
        }
    }

    //! src[indices[i]] 及び src[indices[i] + 1] の内容を返す
    __m256i avx2LoadSrc(const u16 * src, const u16 * indices)
    {
        const __m128i k0 = _mm_setzero_si128();

        // i0 i2 i4 i6 i8 i10 i12 i14
        __m128i u16x8indices   = _mm_loadu_si128((const __m128i *)indices);
        // i0 i2 i4 i6
        __m128i s32x4indices0  = _mm_unpacklo_epi16(u16x8indices, k0);
        __m128i s32x4indices1  = _mm_unpackhi_epi16(u16x8indices, k0);
        // i0 i2 i4 i6 i8 i10 i12 i14
        __m256i s32x8indices   = _mm256_set_m128i(s32x4indices1, s32x4indices0);
        // x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15
        // 32bitずつロードするが、隣のピクセルとの計算なので都合が良い
        return _mm256_i32gather_epi32((const int *)src, s32x8indices, sizeof(u16));
    }

    //! coefs[i] 及び kBias - coefs[i] の内容を返す
    __m256i avx2LoadCoefs(const u16 * coefs)
    {
        const __m128i u16x8kBias = _mm_set1_epi16(kBias);

        // c1 c3 c5 c7 c9 c11 c13 c15
        __m128i u16x8coefOd = _mm_loadu_si128((const __m128i *)coefs);
        // c0 c2 c4 c6 c8 c10 c12 c14
        __m128i u16x8coefEv = _mm_sub_epi16(u16x8kBias, u16x8coefOd);
        // c0 c1 c2 c3 c4 c5 c6 c7
        __m128i u16x8coef0 = _mm_unpacklo_epi16(u16x8coefEv, u16x8coefOd);
        __m128i u16x8coef1 = _mm_unpackhi_epi16(u16x8coefEv, u16x8coefOd);
        // c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15
        return _mm256_set_m128i(u16x8coef1, u16x8coef0);
    }

    __m256i packus_epi32(__m256i u32x8lo, __m256i u32x8hi)
    {
        __m256i u16x16tmp = _mm256_packus_epi32(u32x8lo, u32x8hi);
        return _mm256_permute4x64_epi64(u16x16tmp, _MM_SHUFFLE(3, 1, 2, 0));
    }

    __m128i cvtepi16_epi8(__m256i u16x16v)
    {
        __m128i u16x8lo = _mm256_castsi256_si128(u16x16v);
        __m128i u16x8hi = _mm256_extracti128_si256(u16x16v, 1);
        return _mm_packus_epi16(u16x8lo, u16x8hi);
    }

    //! X方向の加算(1行のみ)
    void avx2_X(const u16 * src, u8 * dst, int width, const u16 * indicesX, const u16 * coefsX)
    {
        const int kElems = sizeof(__m128i) / sizeof(u16);
        for ( int dstX = 0; dstX < width; dstX += kElems*2 ) {
            // 入力 (小数部0bit)
            __m256i u16x16src0 = avx2LoadSrc(src, &indicesX[dstX         ]);
            __m256i u16x16src1 = avx2LoadSrc(src, &indicesX[dstX + kElems]);

            // 係数 (小数部8bit)
            __m256i u16x16coef0 = avx2LoadCoefs(&coefsX[dstX         ]);
            __m256i u16x16coef1 = avx2LoadCoefs(&coefsX[dstX + kElems]);

            // acbd = u32(src0*coef0 + src1*coef1) (小数部8bit)
            __m256i u32x8acbd0 = _mm256_madd_epi16(u16x16src0, u16x16coef0);
            __m256i u32x8acbd1 = _mm256_madd_epi16(u16x16src1, u16x16coef1);

            // 整数に変換 (小数部0bit)
            __m256i u16x16acbd = packus_epi32(u32x8acbd0, u32x8acbd1);
            __m256i u16x16dst  = round_fixed_epu16(u16x16acbd);

            // 8bitに変換
            __m128i u8x16dst = cvtepi16_epi8(u16x16dst);
            _mm_storeu_si128(reinterpret_cast<__m128i *>(&dst[dstX]), u8x16dst);
        }
    }

 }

 //! 整数部8bit, 小数部8bitのbilinear
 void resize_avx2(const image *src, const image *dst, float scale, const u8 *src_buff, u8 *dst_buff)
 {
    const int srcW = src->width;
    const int dstW = dst->width;
    const int dstH = dst->height;

    // X軸方向の添え字と係数
    std::vector<u16> indicesX(dstW);
    std::vector<u16> coefsX(dstW);
    calcIndicesAndCoefs(dstW, scale, &indicesX[0], &coefsX[0]);

    // Y軸方向の添え字と係数
    std::vector<u16> indicesY(dstH);
    std::vector<u16> coefsY(dstH);
    calcIndicesAndCoefs(dstH, scale, &indicesY[0], &coefsY[0]);

    // 一行分のバッファ (小数部8bit)
    std::vector<u16> line(srcW);

    // ピクセルの並びは
    // a b
    // c d
    // とする
    for ( int dstY = 0; dstY < dstH; dstY++ ) {
        u16 srcY   = indicesY[dstY];

        // Y方向の加算
        u16 coefY1 = coefsY[dstY];
        u16 coefY0 = kBias - coefY1;
        avx2_Y(&src_buff[srcY * srcW], &line[0], srcW, coefY0, coefY1);

        // X方向の加算
        avx2_X(&line[0], &dst_buff[dstY*dstW], dstW, &indicesX[0], &coefsX[0]);
    }
 }
	// clang++ -c -std=c++11 -Wall -Wextra -Ofast -g -march=native -D NDEBUG
	#include <stdint.h>
	#include <math.h>
	#include <vector>
	#include <immintrin.h>

	struct image
	{
	int width;
	int height;

	image(int w, int h)
	{
	width = w;
	height = h;
	}
	};

	using u8 = uint8_t;
	using u16 = uint16_t;
	using u32 = uint32_t;
	using u64 = uint64_t;

	namespace {

	const u16 kBiasBit = 8; // 小数部8bit
	const u16 kBias = 1 << kBiasBit;
	const u16 k0_5 = kBias / 2; // 0.5

	//! 固定小数を整数に丸める(小数部8bit)
	u16 round_fixed(u16 value)
	{
	return (value + k0_5) / kBias; // round(v) = floor(v + 0.5)
	}

	//! 出力座標に対する入力座標と係数の計算
	//!
	//! coefs はkBias基にした1固定小数点数で注目した画素の次のピクセルの係数が入る。
	//! @code
	//! indices[i] = floor(i / scale)
	//! coefs[i] = (i / scale) - indices[i]
	//! @endcode
	void calcIndicesAndCoefs(int length, float scale, u16 * indices, u16 * coefs)
	{
	for ( int i = 0; i < length; i++ ) {
	float iSrc;
	float coef1 = modff(i / scale, &iSrc);
	indices[i] = u16(iSrc);
	coefs[i] = u16(coef1 * kBias);
	}
	}

	}



	//==================================================
	// resize_basic
	//==================================================

	namespace {

	//! Y方向の加算(1行のみ)
	void basic_Y(const u8 * src, u16 * dst, int width, u16 coefY0, u16 coefY1)
	{
	for ( int srcX = 0; srcX < width; srcX++ ) {
	// 入力 (小数部0bit)
	u16 a = src[srcX];
	u16 c = src[srcX + width];

	// Y方向の係数を掛けて足す(小数部8bit→0bit)
	dst[srcX] = round_fixed(acoefY0 + ccoefY1);
	}
	}

	//! X方向の加算(1行のみ)
	void basic_X(const u16 * src, u8 * dst, int width, const u16 * indicesX, const u16 * coefsX)
	{
	for ( int dstX = 0; dstX < width; dstX++ ) {
	u16 iSrcX = indicesX[dstX];

	// 入力 (小数部0bit)
	u16 ac = src[iSrcX + 0];
	u16 bd = src[iSrcX + 1];

	// 係数 (小数部8bit)
	u16 coefX1 = coefsX[dstX];
	u16 coefX0 = kBias - coefX1;

	// X方向の係数を掛けて足す(小数部8bit→0bit)
	dst[dstX] = round_fixed(accoefX0 + bdcoefX1);
	}
	}

	}

	//! 整数部8bit, 小数部8bitのbilinear
	void resize_basic(const image src, const image dst, float scale, const u8 src_buff, u8 dst_buff)
	{
	int srcW = src->width;
	int dstW = dst->width;
	int dstH = dst->height;

	// X軸方向の添え字と係数
	std::vector<u16> indicesX(dstW);
	std::vector<u16> coefsX(dstW);
	calcIndicesAndCoefs(dstW, scale, &indicesX[0], &coefsX[0]);

	// Y軸方向の添え字と係数
	std::vector<u16> indicesY(dstH);
	std::vector<u16> coefsY(dstH);
	calcIndicesAndCoefs(dstH, scale, &indicesY[0], &coefsY[0]);

	// 一行分のバッファ (小数部8bit)
	std::vector<u16> line(srcW);

	// ピクセルの並びは
	// a b
	// c d
	// とする
	for ( int dstY = 0; dstY < dstH; dstY++ ) {
	u16 srcY = indicesY[dstY];

	// Y方向の加算
	u16 coefY1 = coefsY[dstY];
	u16 coefY0 = kBias - coefY1;
	basic_Y(&src_buff[srcY*srcW], &line[0], srcW, coefY0, coefY1);

	// X方向の加算
	basic_X(&line[0], &dst_buff[dstY*dst->width], dstW, &indicesX[0], &coefsX[0]);
	}
	}



	//==================================================
	// resize_avx2
	//==================================================

	namespace {

	//! 固定小数を整数に丸める(小数部8bit)
	__m256i round_fixed_epu16(__m256i u16x16value)
	{
	const __m256i u16x16k0_5 = _mm256_set1_epi16(k0_5);
	// round(v) = floor(v + 0.5)
	return _mm256_srli_epi16(_mm256_add_epi16(u16x16value, u16x16k0_5), kBiasBit);
	}

	//! Y方向の加算(1行のみ)
	void avx2_Y(const u8 * src, u16 * dst, int width, u16 coefY0, u16 coefY1)
	{
	const int kElems = sizeof(__m256i) / sizeof(u16);
	__m256i u16x16coefY0 = _mm256_set1_epi16(coefY0);
	__m256i u16x16coefY1 = _mm256_set1_epi16(coefY1);
	for ( int srcX = 0; srcX < width; srcX += kElems ) {
	// 入力 (小数部0bit)
	__m128i u8x16a = _mm_loadu_si128((const __m128i *)&src[srcX]);
	__m128i u8x16c = _mm_loadu_si128((const __m128i *)&src[srcX + width]);
	__m256i u16x16a = _mm256_cvtepu8_epi16(u8x16a);
	__m256i u16x16c = _mm256_cvtepu8_epi16(u8x16c);
	// Y方向の係数を掛けて足す (小数部8bit)
	__m256i u16x16Y0 = _mm256_mullo_epi16(u16x16a, u16x16coefY0);
	__m256i u16x16Y1 = _mm256_mullo_epi16(u16x16c, u16x16coefY1);
	__m256i u16x16ac = _mm256_add_epi16(u16x16Y0, u16x16Y1);
	// 整数に変換 (小数部0bit)
	__m256i u16x16dst = round_fixed_epu16(u16x16ac);
	_mm256_storeu_si256((__m256i *)&dst[srcX], u16x16dst);
	}
	}

	//! src[indices[i]] 及び src[indices[i] + 1] の内容を返す
	__m256i avx2LoadSrc(const u16 * src, const u16 * indices)
	{
	const __m128i k0 = _mm_setzero_si128();

	// i0 i2 i4 i6 i8 i10 i12 i14
	__m128i u16x8indices = _mm_loadu_si128((const __m128i *)indices);
	// i0 i2 i4 i6
	__m128i s32x4indices0 = _mm_unpacklo_epi16(u16x8indices, k0);
	__m128i s32x4indices1 = _mm_unpackhi_epi16(u16x8indices, k0);
	// i0 i2 i4 i6 i8 i10 i12 i14
	__m256i s32x8indices = _mm256_set_m128i(s32x4indices1, s32x4indices0);
	// x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15
	// 32bitずつロードするが、隣のピクセルとの計算なので都合が良い
	return _mm256_i32gather_epi32((const int *)src, s32x8indices, sizeof(u16));
	}

	//! coefs[i] 及び kBias - coefs[i] の内容を返す
	__m256i avx2LoadCoefs(const u16 * coefs)
	{
	const __m128i u16x8kBias = _mm_set1_epi16(kBias);

	// c1 c3 c5 c7 c9 c11 c13 c15
	__m128i u16x8coefOd = _mm_loadu_si128((const __m128i *)coefs);
	// c0 c2 c4 c6 c8 c10 c12 c14
	__m128i u16x8coefEv = _mm_sub_epi16(u16x8kBias, u16x8coefOd);
	// c0 c1 c2 c3 c4 c5 c6 c7
	__m128i u16x8coef0 = _mm_unpacklo_epi16(u16x8coefEv, u16x8coefOd);
	__m128i u16x8coef1 = _mm_unpackhi_epi16(u16x8coefEv, u16x8coefOd);
	// c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15
	return _mm256_set_m128i(u16x8coef1, u16x8coef0);
	}

	__m256i packus_epi32(__m256i u32x8lo, __m256i u32x8hi)
	{
	__m256i u16x16tmp = _mm256_packus_epi32(u32x8lo, u32x8hi);
	return _mm256_permute4x64_epi64(u16x16tmp, _MM_SHUFFLE(3, 1, 2, 0));
	}

	__m128i cvtepi16_epi8(__m256i u16x16v)
	{
	__m128i u16x8lo = _mm256_castsi256_si128(u16x16v);
	__m128i u16x8hi = _mm256_extracti128_si256(u16x16v, 1);
	return _mm_packus_epi16(u16x8lo, u16x8hi);
	}

	//! X方向の加算(1行のみ)
	void avx2_X(const u16 * src, u8 * dst, int width, const u16 * indicesX, const u16 * coefsX)
	{
	const int kElems = sizeof(__m128i) / sizeof(u16);
	for ( int dstX = 0; dstX < width; dstX += kElems*2 ) {
	// 入力 (小数部0bit)
	__m256i u16x16src0 = avx2LoadSrc(src, &indicesX[dstX ]);
	__m256i u16x16src1 = avx2LoadSrc(src, &indicesX[dstX + kElems]);

	// 係数 (小数部8bit)
	__m256i u16x16coef0 = avx2LoadCoefs(&coefsX[dstX ]);
	__m256i u16x16coef1 = avx2LoadCoefs(&coefsX[dstX + kElems]);

	// acbd = u32(src0coef0 + src1coef1) (小数部8bit)
	__m256i u32x8acbd0 = _mm256_madd_epi16(u16x16src0, u16x16coef0);
	__m256i u32x8acbd1 = _mm256_madd_epi16(u16x16src1, u16x16coef1);

	// 整数に変換 (小数部0bit)
	__m256i u16x16acbd = packus_epi32(u32x8acbd0, u32x8acbd1);
	__m256i u16x16dst = round_fixed_epu16(u16x16acbd);

	// 8bitに変換
	__m128i u8x16dst = cvtepi16_epi8(u16x16dst);
	_mm_storeu_si128(reinterpret_cast<__m128i *>(&dst[dstX]), u8x16dst);
	}
	}

	}

	//! 整数部8bit, 小数部8bitのbilinear
	void resize_avx2(const image src, const image dst, float scale, const u8 src_buff, u8 dst_buff)
	{
	const int srcW = src->width;
	const int dstW = dst->width;
	const int dstH = dst->height;

	// X軸方向の添え字と係数
	std::vector<u16> indicesX(dstW);
	std::vector<u16> coefsX(dstW);
	calcIndicesAndCoefs(dstW, scale, &indicesX[0], &coefsX[0]);

	// Y軸方向の添え字と係数
	std::vector<u16> indicesY(dstH);
	std::vector<u16> coefsY(dstH);
	calcIndicesAndCoefs(dstH, scale, &indicesY[0], &coefsY[0]);

	// 一行分のバッファ (小数部8bit)
	std::vector<u16> line(srcW);

	// ピクセルの並びは
	// a b
	// c d
	// とする
	for ( int dstY = 0; dstY < dstH; dstY++ ) {
	u16 srcY = indicesY[dstY];

	// Y方向の加算
	u16 coefY1 = coefsY[dstY];
	u16 coefY0 = kBias - coefY1;
	avx2_Y(&src_buff[srcY * srcW], &line[0], srcW, coefY0, coefY1);

	// X方向の加算
	avx2_X(&line[0], &dst_buff[dstY*dstW], dstW, &indicesX[0], &coefsX[0]);
	}
	}