Skip to content

Instantly share code, notes, and snippets.

@castano
Created April 29, 2020 07:45
Show Gist options
  • Save castano/98f421cec18990b3fba17380565889b5 to your computer and use it in GitHub Desktop.
Save castano/98f421cec18990b3fba17380565889b5 to your computer and use it in GitHub Desktop.
// Emulating gathers using loads and permutevar8. This made the entire compressor about 15% faster. Both methods require AVX2.
// Load 4 uint8 per lane.
__m256i packedClusterIndex = _mm256_load_si256((__m256i *)&s_fourCluster[i]);
if (count <= 8) {
// Load r_sat in one register:
Wide8 r07 = load8(r_sat);
Wide8 g07 = load8(g_sat);
Wide8 b07 = load8(b_sat);
Wide8 w07 = load8(w_sat);
// Load index and decrement.
auto c0 = _mm256_sub_epi32(_mm256_and_si256(packedClusterIndex, _mm256_set1_epi32(0xFF)), _mm256_set1_epi32(1));
// if upper bit set, zero, otherwise load sat entry.
x0.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(r07.v, c0), _mm256_setzero_ps(), _mm256_castsi256_ps(c0));
x0.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(g07.v, c0), _mm256_setzero_ps(), _mm256_castsi256_ps(c0));
x0.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(b07.v, c0), _mm256_setzero_ps(), _mm256_castsi256_ps(c0));
w0.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(w07.v, c0), _mm256_setzero_ps(), _mm256_castsi256_ps(c0));
auto c1 = _mm256_sub_epi32(_mm256_and_si256(_mm256_srli_epi32(packedClusterIndex, 8), _mm256_set1_epi32(0xFF)), _mm256_set1_epi32(1));
x1.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(r07.v, c1), _mm256_setzero_ps(), _mm256_castsi256_ps(c1));
x1.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(g07.v, c1), _mm256_setzero_ps(), _mm256_castsi256_ps(c1));
x1.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(b07.v, c1), _mm256_setzero_ps(), _mm256_castsi256_ps(c1));
w1.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(w07.v, c1), _mm256_setzero_ps(), _mm256_castsi256_ps(c1));
auto c2 = _mm256_sub_epi32(_mm256_and_si256(_mm256_srli_epi32(packedClusterIndex, 16), _mm256_set1_epi32(0xFF)), _mm256_set1_epi32(1));
x2.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(r07.v, c2), _mm256_setzero_ps(), _mm256_castsi256_ps(c2));
x2.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(g07.v, c2), _mm256_setzero_ps(), _mm256_castsi256_ps(c2));
x2.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(b07.v, c2), _mm256_setzero_ps(), _mm256_castsi256_ps(c2));
w2.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(w07.v, c2), _mm256_setzero_ps(), _mm256_castsi256_ps(c2));
}
else {
// Load r_sat in two registers:
Wide8 rLo = load8(r_sat); Wide8 rUp = load8(r_sat + 8);
Wide8 gLo = load8(g_sat); Wide8 gUp = load8(g_sat + 8);
Wide8 bLo = load8(b_sat); Wide8 bUp = load8(b_sat + 8);
Wide8 wLo = load8(w_sat); Wide8 wUp = load8(w_sat + 8);
auto c0 = _mm256_and_si256(packedClusterIndex, _mm256_set1_epi32(0xFF));
auto c0Lo = _mm256_sub_epi32(c0, _mm256_set1_epi32(1));
// if upper bit set, zero, otherwise load sat entry.
x0.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(rLo.v, c0Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c0Lo));
x0.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(gLo.v, c0Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c0Lo));
x0.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(bLo.v, c0Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c0Lo));
w0.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(wLo.v, c0Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c0Lo));
auto c0Up = _mm256_sub_epi32(c0, _mm256_set1_epi32(9));
// if upper bit set, same, otherwise load sat entry.
x0.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(rUp.v, c0Up), x0.x.v, _mm256_castsi256_ps(c0Up));
x0.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(gUp.v, c0Up), x0.y.v, _mm256_castsi256_ps(c0Up));
x0.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(bUp.v, c0Up), x0.z.v, _mm256_castsi256_ps(c0Up));
w0.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(wUp.v, c0Up), w0.v, _mm256_castsi256_ps(c0Up));
auto c1 = _mm256_and_si256(_mm256_srli_epi32(packedClusterIndex, 8), _mm256_set1_epi32(0xFF));
auto c1Lo = _mm256_sub_epi32(c1, _mm256_set1_epi32(1));
// if upper bit set, zero, otherwise load sat entry.
x1.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(rLo.v, c1Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c1Lo));
x1.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(gLo.v, c1Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c1Lo));
x1.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(bLo.v, c1Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c1Lo));
w1.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(wLo.v, c1Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c1Lo));
auto c1Up = _mm256_sub_epi32(c1, _mm256_set1_epi32(9));
// if upper bit set, same, otherwise load sat entry.
x1.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(rUp.v, c1Up), x1.x.v, _mm256_castsi256_ps(c1Up));
x1.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(gUp.v, c1Up), x1.y.v, _mm256_castsi256_ps(c1Up));
x1.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(bUp.v, c1Up), x1.z.v, _mm256_castsi256_ps(c1Up));
w1.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(wUp.v, c1Up), w1.v, _mm256_castsi256_ps(c1Up));
auto c2 = _mm256_and_si256(_mm256_srli_epi32(packedClusterIndex, 16), _mm256_set1_epi32(0xFF));
auto c2Lo = _mm256_sub_epi32(c2, _mm256_set1_epi32(1));
// if upper bit set, zero, otherwise load sat entry.
x2.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(rLo.v, c2Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c2Lo));
x2.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(gLo.v, c2Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c2Lo));
x2.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(bLo.v, c2Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c2Lo));
w2.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(wLo.v, c2Lo), _mm256_setzero_ps(), _mm256_castsi256_ps(c2Lo));
auto c2Up = _mm256_sub_epi32(c2, _mm256_set1_epi32(9));
// if upper bit set, same, otherwise load sat entry.
x2.x.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(rUp.v, c2Up), x2.x.v, _mm256_castsi256_ps(c2Up));
x2.y.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(gUp.v, c2Up), x2.y.v, _mm256_castsi256_ps(c2Up));
x2.z.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(bUp.v, c2Up), x2.z.v, _mm256_castsi256_ps(c2Up));
w2.v = _mm256_blendv_ps(_mm256_permutevar8x32_ps(wUp.v, c2Up), w2.v, _mm256_castsi256_ps(c2Up));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment