Created
October 6, 2020 11:37
-
-
Save lydonchandra/220eb2aeeae8a9542290a249349ba9a8 to your computer and use it in GitHub Desktop.
BilinearInterpolationSSE3.cpp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// https://fastcpp.blogspot.com/2011/06/bilinear-pixel-interpolation-using-sse.html
//
// Blends the 2x2 block of pixels whose top-left corner is at
// (static_cast<int>(x), static_cast<int>(y)) into a single pixel, using the
// four weights returned by CalcWeights(x, y) in 8.8 fixed-point arithmetic.
//
// Parameters:
//   img  - source image; only img->width (used as the row stride, in pixels)
//          and img->data are read.
//   x, y - sample position; the integer parts (truncated toward zero) select
//          the top-left pixel of the 2x2 block.
//
// Returns the low 32 bits of the packed result (one byte per channel).
//
// Preconditions (nothing is checked here):
//   * Two adjacent pixels are loaded from the row at (int)y and two from the
//     row below it, so the caller must keep (int)x + 1 and (int)y + 1 inside
//     the image buffer.
//   * Each 64-bit load is commented as holding two pixels, i.e. Pixel is
//     assumed to be 4 bytes with 8 bits per channel -- TODO confirm against
//     the Pixel definition.
//   * CalcWeights and CONST_256 are defined elsewhere; presumably the weights
//     are the bilinear factors in [0, 1] and CONST_256 is a vector of 256.0f
//     -- verify against their definitions.
//
// NOTE(review): despite the "SSE3" in the name, _mm_hadd_epi32 requires SSSE3
// and _mm_packus_epi32 requires SSE4.1.
inline Pixel GetPixelSSE3(const Image<Pixel>* img, float x, float y)
{
    const int stride = img->width;
    const int col = static_cast<int>(x);
    const int row = static_cast<int>(y);
    const Pixel* p0 = img->data + col + row * stride; // pointer to first (top-left) pixel

    const __m128i zero = _mm_setzero_si128();

    // Load the data (2 pixels in one 64-bit load): top row, then bottom row.
    const __m128i p12 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&p0[0 * stride]));
    const __m128i p34 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&p0[1 * stride]));

    const __m128 weight = CalcWeights(x, y);

    // Convert RGBA RGBA RGBA RGBA to RRRR GGGG BBBB AAAA (AoS to SoA).
    // First interleave gives R1 R3 G1 G3 B1 B3 A1 A3 | R2 R4 G2 G4 B2 B4 A2 A4;
    // interleaving its low half with its high half yields R1 R2 R3 R4 G1 ...
    const __m128i p1234 = _mm_unpacklo_epi8(p12, p34);
    const __m128i p34xx = _mm_unpackhi_epi64(p1234, zero);
    const __m128i p1234_8bit = _mm_unpacklo_epi8(p1234, p34xx);

    // Zero-extend the channels to 16 bit so they can feed _mm_madd_epi16.
    const __m128i pRG = _mm_unpacklo_epi8(p1234_8bit, zero);
    const __m128i pBA = _mm_unpackhi_epi8(p1234_8bit, zero);

    // Convert the weights to 16-bit fixed point (scaled by CONST_256) and
    // duplicate them so both halves of the register hold w1 w2 w3 w4.
    const __m128 weightScaled = _mm_mul_ps(weight, CONST_256);
    const __m128i weight32 = _mm_cvtps_epi32(weightScaled);      // w4 w3 w2 w1
    const __m128i weighti = _mm_packs_epi32(weight32, weight32); // 32->2x16bit

    // outRG = [w1*R1 + w2*R2 | w3*R3 + w4*R4 | w1*G1 + w2*G2 | w3*G3 + w4*G4]
    const __m128i outRG = _mm_madd_epi16(pRG, weighti);
    // outBA = [w1*B1 + w2*B2 | w3*B3 + w4*B4 | w1*A1 + w2*A2 | w3*A3 + w4*A4]
    const __m128i outBA = _mm_madd_epi16(pBA, weighti);

    // Horizontal add sums each pair of partial results, producing one 32-bit
    // value per channel.
    const __m128i sum = _mm_hadd_epi32(outRG, outBA);
    const __m128i scaled = _mm_srli_epi32(sum, 8); // divide by 256

    // Narrow 32bit -> 16bit -> 8bit with unsigned saturation.
    const __m128i out16 = _mm_packus_epi32(scaled, zero);
    const __m128i out8 = _mm_packus_epi16(out16, zero);

    // The four channel bytes now sit in the low 32 bits.
    return _mm_cvtsi128_si32(out8);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment