Created
April 29, 2020 07:45
-
-
Save castano/98f421cec18990b3fba17380565889b5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Emulating gathers using loads and permutevar8. This made the entire compressor about 15% faster. Both methods require AVX2. | |
// Load 4 uint8 per lane. | |
__m256i packedClusterIndex = _mm256_load_si256((__m256i *)&s_fourCluster[i]); | |
if (count <= 8) { | |
// Load r_sat in one register: | |
Wide8 r07 = load8(r_sat); | |
Wide8 g07 = load8(g_sat); | |
Wide8 b07 = load8(b_sat); | |
Wide8 w07 = load8(w_sat); | |
// Load index and decrement. | |
auto c0 = _mm256_sub_epi32(_mm256_and_si256(packedClusterIndex, _mm256_set1_epi32(0xFF)), _mm256_set1_epi32(1)); | |
// if upper bit set, zero, otherwise load sat entry. | |
x0.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(r07.v, c0), _mm256_setzero_ps(), _mm256_castsi256_ps(c0)); | |
x0.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(g07.v, c0), _mm256_setzero_ps(), _mm256_castsi256_ps(c0)); | |
x0.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(b07.v, c0), _mm256_setzero_ps(), _mm256_castsi256_ps(c0)); | |
w0.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(w07.v, c0), _mm256_setzero_ps(), _mm256_castsi256_ps(c0)); | |
auto c1 = _mm256_sub_epi32(_mm256_and_si256(_mm256_srli_epi32(packedClusterIndex, 8), _mm256_set1_epi32(0xFF)), _mm256_set1_epi32(1)); | |
x1.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(r07.v, c1), _mm256_setzero_ps(), _mm256_castsi256_ps(c1)); | |
x1.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(g07.v, c1), _mm256_setzero_ps(), _mm256_castsi256_ps(c1)); | |
x1.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(b07.v, c1), _mm256_setzero_ps(), _mm256_castsi256_ps(c1)); | |
w1.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(w07.v, c1), _mm256_setzero_ps(), _mm256_castsi256_ps(c1)); | |
auto c2 = _mm256_sub_epi32(_mm256_and_si256(_mm256_srli_epi32(packedClusterIndex, 16), _mm256_set1_epi32(0xFF)), _mm256_set1_epi32(1)); | |
x2.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(r07.v, c2), _mm256_setzero_ps(), _mm256_castsi256_ps(c2)); | |
x2.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(g07.v, c2), _mm256_setzero_ps(), _mm256_castsi256_ps(c2)); | |
x2.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(b07.v, c2), _mm256_setzero_ps(), _mm256_castsi256_ps(c2)); | |
w2.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(w07.v, c2), _mm256_setzero_ps(), _mm256_castsi256_ps(c2)); | |
} | |
else { | |
// Load r_sat in two registers: | |
Wide8 rLo = load8(r_sat); Wide8 rUp = load8(r_sat + 8); | |
Wide8 gLo = load8(g_sat); Wide8 gUp = load8(g_sat + 8); | |
Wide8 bLo = load8(b_sat); Wide8 bUp = load8(b_sat + 8); | |
Wide8 wLo = load8(w_sat); Wide8 wUp = load8(w_sat + 8); | |
auto c0 = _mm256_and_si256(packedClusterIndex, _mm256_set1_epi32(0xFF)); | |
auto c0Lo = _mm256_sub_epi32(c0, _mm256_set1_epi32(1)); | |
// if upper bit set, zero, otherwise load sat entry. | |
x0.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(rLo.v, c0Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c0Lo)); | |
x0.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(gLo.v, c0Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c0Lo)); | |
x0.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(bLo.v, c0Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c0Lo)); | |
w0.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(wLo.v, c0Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c0Lo)); | |
auto c0Up = _mm256_sub_epi32(c0, _mm256_set1_epi32(9)); | |
// if upper bit set, same, otherwise load sat entry. | |
x0.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(rUp.v, c0Up), x0.x.v, _mm256_castsi256_ps(c0Up)); | |
x0.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(gUp.v, c0Up), x0.y.v, _mm256_castsi256_ps(c0Up)); | |
x0.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(bUp.v, c0Up), x0.z.v, _mm256_castsi256_ps(c0Up)); | |
w0.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(wUp.v, c0Up), w0.v, _mm256_castsi256_ps(c0Up)); | |
auto c1 = _mm256_and_si256(_mm256_srli_epi32(packedClusterIndex, 8), _mm256_set1_epi32(0xFF)); | |
auto c1Lo = _mm256_sub_epi32(c1, _mm256_set1_epi32(1)); | |
// if upper bit set, zero, otherwise load sat entry. | |
x1.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(rLo.v, c1Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c1Lo)); | |
x1.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(gLo.v, c1Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c1Lo)); | |
x1.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(bLo.v, c1Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c1Lo)); | |
w1.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(wLo.v, c1Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c1Lo)); | |
auto c1Up = _mm256_sub_epi32(c1, _mm256_set1_epi32(9)); | |
// if upper bit set, same, otherwise load sat entry. | |
x1.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(rUp.v, c1Up), x1.x.v, _mm256_castsi256_ps(c1Up)); | |
x1.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(gUp.v, c1Up), x1.y.v, _mm256_castsi256_ps(c1Up)); | |
x1.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(bUp.v, c1Up), x1.z.v, _mm256_castsi256_ps(c1Up)); | |
w1.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(wUp.v, c1Up), w1.v, _mm256_castsi256_ps(c1Up)); | |
auto c2 = _mm256_and_si256(_mm256_srli_epi32(packedClusterIndex, 16), _mm256_set1_epi32(0xFF)); | |
auto c2Lo = _mm256_sub_epi32(c2, _mm256_set1_epi32(1)); | |
// if upper bit set, zero, otherwise load sat entry. | |
x2.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(rLo.v, c2Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c2Lo)); | |
x2.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(gLo.v, c2Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c2Lo)); | |
x2.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(bLo.v, c2Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c2Lo)); | |
w2.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(wLo.v, c2Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c2Lo)); | |
auto c2Up = _mm256_sub_epi32(c2, _mm256_set1_epi32(9)); | |
// if upper bit set, same, otherwise load sat entry. | |
x2.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(rUp.v, c2Up), x2.x.v, _mm256_castsi256_ps(c2Up)); | |
x2.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(gUp.v, c2Up), x2.y.v, _mm256_castsi256_ps(c2Up)); | |
x2.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(bUp.v, c2Up), x2.z.v, _mm256_castsi256_ps(c2Up)); | |
w2.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(wUp.v, c2Up), w2.v, _mm256_castsi256_ps(c2Up)); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment