lydonchandra · October 6, 2020 11:37
diff --git a/BilinearInterpolationSSE3.cpp b/BilinearInterpolationSSE3.cpp
 // https://fastcpp.blogspot.com/2011/06/bilinear-pixel-interpolation-using-sse.html

 // Samples img at the fractional position (x, y) by blending the 2x2 block of
 // pixels whose top-left corner is ((int)x, (int)y), using the four weights
 // returned by CalcWeights(x, y). Returns the blended pixel.
 //
 // Each row is fetched with a single 64-bit load, i.e. two adjacent pixels when
 // Pixel is 4 bytes wide (assumed here -- TODO confirm against Pixel's
 // definition). No bounds checking is done in this function: the pixel to the
 // right and the row below must be readable.
 //
 // NOTE(review): despite the name, _mm_hadd_epi32 is an SSSE3 intrinsic and
 // _mm_packus_epi32 is an SSE4.1 one, so the build needs both enabled.
 inline Pixel GetPixelSSE3(const Image<Pixel>* img, float x, float y)
 {
     const __m128i zero = _mm_setzero_si128();

     // Locate the top-left pixel of the 2x2 block; rows are img->width apart.
     const int rowPitch = img->width;
     const Pixel* topLeft = img->data + static_cast<int>(x) + static_cast<int>(y) * rowPitch;
     const Pixel* bottomLeft = topLeft + rowPitch;

     // One 64-bit load per row picks up two neighbouring pixels.
     const __m128i rowTop = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(topLeft));
     const __m128i rowBottom = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(bottomLeft));

     const __m128 weightsF = CalcWeights(x, y);

     // Regroup the bytes from interleaved pixels to per-channel runs:
     // RGBA RGBA RGBA RGBA -> RRRR GGGG BBBB AAAA (AoS to SoA).
     const __m128i mixed = _mm_unpacklo_epi8(rowTop, rowBottom);
     const __m128i mixedHigh = _mm_unpackhi_epi64(mixed, zero);
     const __m128i planar8 = _mm_unpacklo_epi8(mixed, mixedHigh);

     // Zero-extend every byte to a 16-bit lane.
     const __m128i chanRG = _mm_unpacklo_epi8(planar8, zero);
     const __m128i chanBA = _mm_unpackhi_epi8(planar8, zero);

     // Turn the weights into integers: scale by CONST_256, convert to int32,
     // then narrow to int16 so the four weights appear in both 64-bit halves.
     const __m128i w32 = _mm_cvtps_epi32(_mm_mul_ps(weightsF, CONST_256)); // w4 w3 w2 w1
     const __m128i w16 = _mm_packs_epi32(w32, w32); // 32->2x16bit

     // Multiply-add adjacent 16-bit lanes:
     // sumRG = [w1*R1 + w2*R2 | w3*R3 + w4*R4 | w1*G1 + w2*G2 | w3*G3 + w4*G4]
     // sumBA = [w1*B1 + w2*B2 | w3*B3 + w4*B4 | w1*A1 + w2*A2 | w3*A3 + w4*A4]
     const __m128i sumRG = _mm_madd_epi16(chanRG, w16);
     const __m128i sumBA = _mm_madd_epi16(chanBA, w16);

     // Add the partial sums pairwise to get one 32-bit total per channel, then
     // shift out the 8 fractional bits (divide by 256).
     const __m128i blended32 = _mm_srli_epi32(_mm_hadd_epi32(sumRG, sumBA), 8);

     // Narrow 32 -> 16 -> 8 bits with unsigned saturation.
     const __m128i blended16 = _mm_packus_epi32(blended32, zero);
     const __m128i blended8 = _mm_packus_epi16(blended16, zero);

     // The low 32 bits now hold the packed result pixel.
     return _mm_cvtsi128_si32(blended8);
 }
	// https://fastcpp.blogspot.com/2011/06/bilinear-pixel-interpolation-using-sse.html

	// Bilinear lookup: blends the 2x2 block of pixels whose top-left corner is
	// ((int)x, (int)y) using the four weights from CalcWeights(x, y) and returns
	// the resulting pixel.
	//
	// Each 64-bit load below fetches two adjacent pixels, which assumes Pixel is
	// 4 bytes wide (TODO confirm against Pixel's definition). No bounds checks
	// are made here: the pixel to the right and the row below must be readable.
	//
	// NOTE(review): _mm_hadd_epi32 is an SSSE3 intrinsic and _mm_packus_epi32 an
	// SSE4.1 one, so the build needs both enabled despite the function's name.
	inline Pixel GetPixelSSE3(const Image<Pixel>* img, float x, float y)
	{
		const int stride = img->width;
		const Pixel* p0 = img->data + (int)x + (int)y * stride; // pointer to first pixel

		// Load the data (2 pixels in one load); the second load is one row down.
		__m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]);
		__m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]);

		__m128 weight = CalcWeights(x, y);

		// convert RGBA RGBA RGBA RGBA to RRRR GGGG BBBB AAAA (AoS to SoA)
		__m128i p1234 = _mm_unpacklo_epi8(p12, p34);
		__m128i p34xx = _mm_unpackhi_epi64(p1234, _mm_setzero_si128());
		__m128i p1234_8bit = _mm_unpacklo_epi8(p1234, p34xx);

		// extend to 16bit (interleaving with zero zero-extends each byte)
		__m128i pRG = _mm_unpacklo_epi8(p1234_8bit, _mm_setzero_si128());
		__m128i pBA = _mm_unpackhi_epi8(p1234_8bit, _mm_setzero_si128());

		// convert weights to integer: scale by CONST_256, then narrow to 16 bit
		weight = _mm_mul_ps(weight, CONST_256);
		__m128i weighti = _mm_cvtps_epi32(weight); // w4 w3 w2 w1
		weighti = _mm_packs_epi32(weighti, weighti); // 32->2x16bit

		//outRG = [w1*R1 + w2*R2 | w3*R3 + w4*R4 | w1*G1 + w2*G2 | w3*G3 + w4*G4]
		__m128i outRG = _mm_madd_epi16(pRG, weighti);
		//outBA = [w1*B1 + w2*B2 | w3*B3 + w4*B4 | w1*A1 + w2*A2 | w3*A3 + w4*A4]
		__m128i outBA = _mm_madd_epi16(pBA, weighti);

		// horizontal add that will produce the output values (in 32bit)
		__m128i out = _mm_hadd_epi32(outRG, outBA);
		out = _mm_srli_epi32(out, 8); // divide by 256

		// convert 32bit->8bit (unsigned saturation at each step)
		out = _mm_packus_epi32(out, _mm_setzero_si128());
		out = _mm_packus_epi16(out, _mm_setzero_si128());

		// return the low 32 bits, which hold the packed pixel
		return _mm_cvtsi128_si32(out);
	}