Last active
September 6, 2022 18:45
-
-
Save 3outeille/c1f62980eadf5b03a931ff009beb4f12 to your computer and use it in GitHub Desktop.
fg_blend_stripe_sse4 C version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Let the SSE4.1 intrinsics used below compile on GCC/Clang even when the
 * translation unit is not built with -msse4.1. Other compilers get an empty
 * attribute and rely on their usual build flags.
 */
#if defined(__GNUC__) || defined(__clang__)
#define FG_BLEND_SSE4_TARGET __attribute__((target("sse4.1")))
#else
#define FG_BLEND_SSE4_TARGET
#endif

/*
 * Blend exactly four consecutive samples:
 *
 *     dst[i] = (int16_t)clip(src[i] + (grain[i] << shift), 0, maxVal)
 *
 * dst        - receives 4 int16_t results (no alignment requirement).
 * src        - 4 int16_t base samples (no alignment requirement).
 * grain      - 4 int32_t grain values (no alignment requirement).
 * shiftCount - left-shift amount held in the low 64 bits of the register.
 * maxVal     - upper clip bound broadcast to all four 32-bit lanes.
 */
FG_BLEND_SSE4_TARGET
static inline void fg_blend4_sse4(int16_t *dst, const int16_t *src, const int32_t *grain,
                                  __m128i shiftCount, __m128i maxVal)
{
    /* Byte shuffle that keeps the low 16 bits of each 32-bit lane and packs
     * them into the low 64 bits. This is a plain truncation, matching a
     * scalar (int16_t) cast, rather than a saturating pack. */
    const __m128i low16 = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13,
                                        -1, -1, -1, -1, -1, -1, -1, -1);

    __m128i sample = _mm_loadu_si128((const __m128i *)grain);
    sample = _mm_sll_epi32(sample, shiftCount);

    /* A 64-bit load fetches exactly four int16_t values; widen them with sign
     * extension so they can be added to the 32-bit grain lanes. */
    __m128i offset = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)src));
    sample = _mm_add_epi32(sample, offset);

    /* Clip to [0, maxVal]. */
    sample = _mm_max_epi32(sample, _mm_setzero_si128());
    sample = _mm_min_epi32(sample, maxVal);

    _mm_storel_epi64((__m128i *)dst, _mm_shuffle_epi8(sample, low16));
}

/*
 * Add scaled grain to a block of samples and clip the result to the sample
 * range implied by bitDepth.
 *
 * For every row l in [0, blockHeight) and column k in [0, widthComp):
 *
 *     dst[l * widthComp + k] =
 *         clip(src[l * widthComp + k] + (grain[l * widthComp + k] << (bitDepth - 8)),
 *              0, (1 << bitDepth) - 1)
 *
 * dstSampleOffsetY - output samples, widthComp * blockHeight entries, rows
 *                    stored back to back (stride == widthComp).
 * srcSampleOffsetY - input samples, same layout as the output.
 * grainStripe      - grain values, same layout as the output.
 * widthComp        - number of samples per row; any value is accepted, a row
 *                    remainder of 1..3 samples is handled without touching
 *                    memory outside the row.
 * blockHeight      - number of rows.
 * bitDepth         - sample bit depth; presumably in [8, 16] given the
 *                    (bitDepth - 8) shift and the int16_t output - TODO confirm
 *                    against callers.
 */
FG_BLEND_SSE4_TARGET
void fg_blend_stripe_sse4(int16_t *dstSampleOffsetY, int16_t *srcSampleOffsetY, int32_t *grainStripe, uint32_t widthComp, uint32_t blockHeight, uint8_t bitDepth)
{
    /* Columns covered by full 4-sample groups, and what is left over. */
    const uint32_t vecWidth = widthComp & ~(uint32_t)3;
    const uint32_t tail = widthComp - vecWidth;

    /* Loop-invariant SIMD constants. */
    const __m128i shiftCount = _mm_cvtsi32_si128((int)bitDepth - 8);
    const __m128i maxVal = _mm_set1_epi32((int32_t)((1u << bitDepth) - 1u));

    /* Walk the rows with pointers so no 32-bit row * width product can wrap. */
    int16_t *dstRow = dstSampleOffsetY;
    const int16_t *srcRow = srcSampleOffsetY;
    const int32_t *grainRow = grainStripe;

    for (uint32_t l = 0; l < blockHeight; l++) { /* y direction */
        for (uint32_t k = 0; k < vecWidth; k += 4) { /* x direction */
            fg_blend4_sse4(dstRow + k, srcRow + k, grainRow + k, shiftCount, maxVal);
        }

        if (tail != 0) {
            /* Row remainder: run the same kernel on zero-padded scratch
             * buffers and copy back only the valid lanes, so nothing beyond
             * the row is read or written. */
            int32_t grainTmp[4] = {0, 0, 0, 0};
            int16_t srcTmp[4] = {0, 0, 0, 0};
            int16_t dstTmp[4];

            for (uint32_t i = 0; i < tail; i++) {
                grainTmp[i] = grainRow[vecWidth + i];
                srcTmp[i] = srcRow[vecWidth + i];
            }

            fg_blend4_sse4(dstTmp, srcTmp, grainTmp, shiftCount, maxVal);

            for (uint32_t i = 0; i < tail; i++) {
                dstRow[vecWidth + i] = dstTmp[i];
            }
        }

        dstRow += widthComp;
        srcRow += widthComp;
        grainRow += widthComp;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment