Skip to content

Instantly share code, notes, and snippets.

@rygorous
Created October 5, 2012 21:13
Show Gist options
  • Select an option

  • Save rygorous/3842461 to your computer and use it in GitHub Desktop.

Select an option

Save rygorous/3842461 to your computer and use it in GitHub Desktop.
16x tri binning setup
// Constants
// Flag words OR'ed into vFlags by the scissor passes below. Slots 0..3 are the
// per-edge flags (indexed by scissor edge), slot 4 is the "triangle extends
// outside scissors" flag.
static const OM_U32 cEdgeFlags[5] =
{
    RAST_SCISSORS_FLAG_EDGE0,                            // [0] used for the scissor x-min edge
    RAST_SCISSORS_FLAG_EDGE1,                            // [1] used for the scissor x-max edge
    RAST_SCISSORS_FLAG_EDGE2,                            // [2] used for the scissor y-min edge
    RAST_SCISSORS_FLAG_EDGE3,                            // [3] used for the scissor y-max edge
    RAST_SCISSORS_FLAG_TRIANGLE_EXTENDS_OUTSIDE_SCISSORS // [4] set on any surviving scissored tri
};
// Per-lane masks accumulated by the x and y scissor passes:
//   mRastScissor - lanes for which at least one scissor-edge flag was set
//   mAnyScissor  - lanes whose bbox extends past the scissor rect on some side
__mmask mRastScissor = 0;
__mmask mAnyScissor = 0;
// 1D scissor logic for x
// Inputs defined outside this fragment: mPrimPass (lanes still alive), vFlags,
// vBBoxX*, vScissorX*, vIndentedSnap4BBoxX* and pfers.
// NOTE(review): the comments below assume _mm512_mask_cmplt_pi(m, a, b) yields
// m & (a < b) per lane, and that _mm512_mask_{max,min,or}_pi(old, m, a, b) write
// op(a, b) into lanes selected by m and keep 'old' elsewhere - confirm against
// the intrinsics reference.
// Live lanes whose bbox pokes out past the scissor min / max in x.
__mmask mScissorXMin = _mm512_mask_cmplt_pi(mPrimPass, vBBoxXMin, vScissorXMin);
__mmask mScissorXMax = _mm512_mask_cmplt_pi(mPrimPass, vScissorXMax, vBBoxXMax);
// Default x extent is the indented snap4 bbox; it is only overridden for lanes
// that cross a scissor edge (here) or that rasterize scissor edges (later).
_M512I XFullIntersectionMin = vIndentedSnap4BBoxXMin;
_M512I XFullIntersectionMax = vIndentedSnap4BBoxXMax;
// Skip all of the scissor work when no lane crosses the x scissor.
if (!_mm512_kortestz(mScissorXMin, mScissorXMax))
{
// At least some tris extend outside of scissor rect
__mmask mScissored = _mm512_kor(mScissorXMin, mScissorXMax);
// Disable prim if it gets rejected by either original or indented bbox edges
// NOTE original code only did ... (this note is truncated in the source;
// presumably the original tested only one of the two bboxes - TODO confirm)
// Taking the larger of the two mins and the smaller of the two maxes means a
// single compare fails as soon as either bbox fails it.
_M512I vMaxXMin = _mm512_max_pi(vBBoxXMin, vIndentedSnap4BBoxXMin);
_M512I vMinXMax = _mm512_min_pi(vBBoxXMax, vIndentedSnap4BBoxXMax);
// A lane survives only if scissorXMin < vMinXMax and vMaxXMin < scissorXMax.
// Note these tests apply to every lane in mPrimPass, not just scissored ones.
mPrimPass = _mm512_mask_cmplt_pi(mPrimPass, vScissorXMin, vMinXMax);
mPrimPass = _mm512_mask_cmplt_pi(mPrimPass, vMaxXMin, vScissorXMax);
// Accumulated before mScissored is narrowed to surviving lanes below, so this
// can include lanes that were just rejected.
mAnyScissor = _mm512_kor(mAnyScissor, mScissored);
// Intersection of scissor and triangle bbox
// Only the side that actually crosses the scissor is clamped; this uses the
// original (not indented) bbox for that side.
XFullIntersectionMin = _mm512_mask_max_pi(XFullIntersectionMin, mScissorXMin, vBBoxXMin, vScissorXMin);
XFullIntersectionMax = _mm512_mask_min_pi(XFullIntersectionMax, mScissorXMax, vBBoxXMax, vScissorXMax);
// Do we need to rasterize scissor edges? (Can skip that if they happen to fall on tile boundaries)
// NOTE(review): presumably Fix8TileXMask selects the within-tile bits of a
// fixed-point x coordinate, so a nonzero AND means "not tile aligned" - confirm.
__mmask mRastScissorEdge0 = _mm512_mask_test_pi(mScissorXMin, vScissorXMin, _mm512_upconv_1to16(&pfers->Fix8TileXMask));
__mmask mRastScissorEdge1 = _mm512_mask_test_pi(mScissorXMax, vScissorXMax, _mm512_upconv_1to16(&pfers->Fix8TileXMask));
// Set scissor flag on any scissored tri we didn't reject yet
mScissored = _mm512_kand(mScissored, mPrimPass);
vFlags = _mm512_mask_or_pi(vFlags, mScissored, vFlags, _mm512_upconv_1to16(&cEdgeFlags[4]));
// Also set edge scissor flags
// The edge masks derive from mScissorXMin/Max, computed before the rejection
// above, so they are not restricted to surviving lanes at this point.
vFlags = _mm512_mask_or_pi(vFlags, mRastScissorEdge0, vFlags, _mm512_upconv_1to16(&cEdgeFlags[0]));
vFlags = _mm512_mask_or_pi(vFlags, mRastScissorEdge1, vFlags, _mm512_upconv_1to16(&cEdgeFlags[1]));
// Keep track of whether any tris need to rasterize scissor edges
mRastScissor = _mm512_kor(mRastScissor, mRastScissorEdge0);
mRastScissor = _mm512_kor(mRastScissor, mRastScissorEdge1);
}
// 1D scissor logic for y
// Same procedure as the x pass, applied to the y extents. mPrimPass here already
// reflects any rejections made by the x pass.
// NOTE(review): assumes _mm512_mask_cmplt_pi(m, a, b) yields m & (a < b) per
// lane, and that the masked max/min/or forms keep their first argument in
// unselected lanes - confirm against the intrinsics reference.
// Live lanes whose bbox pokes out past the scissor min / max in y.
__mmask mScissorYMin = _mm512_mask_cmplt_pi(mPrimPass, vBBoxYMin, vScissorYMin);
__mmask mScissorYMax = _mm512_mask_cmplt_pi(mPrimPass, vScissorYMax, vBBoxYMax);
// Default y extent is the indented snap4 bbox; overridden only where needed.
_M512I YFullIntersectionMin = vIndentedSnap4BBoxYMin;
_M512I YFullIntersectionMax = vIndentedSnap4BBoxYMax;
// Skip all of the scissor work when no lane crosses the y scissor.
if (!_mm512_kortestz(mScissorYMin, mScissorYMax))
{
// At least some tris extend outside of scissor rect
__mmask mScissored = _mm512_kor(mScissorYMin, mScissorYMax);
// Disable prim if it gets rejected by either original or indented bbox edges
// (larger of the two mins / smaller of the two maxes, so one compare covers
// both bboxes)
_M512I vMaxYMin = _mm512_max_pi(vBBoxYMin, vIndentedSnap4BBoxYMin);
_M512I vMinYMax = _mm512_min_pi(vBBoxYMax, vIndentedSnap4BBoxYMax);
// A lane survives only if scissorYMin < vMinYMax and vMaxYMin < scissorYMax.
mPrimPass = _mm512_mask_cmplt_pi(mPrimPass, vScissorYMin, vMinYMax);
mPrimPass = _mm512_mask_cmplt_pi(mPrimPass, vMaxYMin, vScissorYMax);
// Accumulated before mScissored is narrowed to surviving lanes below.
mAnyScissor = _mm512_kor(mAnyScissor, mScissored);
// Intersection of scissor and triangle bbox
// Only the side that actually crosses the scissor is clamped.
YFullIntersectionMin = _mm512_mask_max_pi(YFullIntersectionMin, mScissorYMin, vBBoxYMin, vScissorYMin);
YFullIntersectionMax = _mm512_mask_min_pi(YFullIntersectionMax, mScissorYMax, vBBoxYMax, vScissorYMax);
// Do we need to rasterize scissor edges? (Can skip that if they happen to fall on tile boundaries)
// NOTE(review): presumably Fix8TileYMask selects the within-tile bits of a
// fixed-point y coordinate - confirm.
__mmask mRastScissorEdge2 = _mm512_mask_test_pi(mScissorYMin, vScissorYMin, _mm512_upconv_1to16(&pfers->Fix8TileYMask));
__mmask mRastScissorEdge3 = _mm512_mask_test_pi(mScissorYMax, vScissorYMax, _mm512_upconv_1to16(&pfers->Fix8TileYMask));
// Set scissor flag on any scissored tri we didn't reject yet
mScissored = _mm512_kand(mScissored, mPrimPass);
vFlags = _mm512_mask_or_pi(vFlags, mScissored, vFlags, _mm512_upconv_1to16(&cEdgeFlags[4]));
// Also set edge scissor flags
// These edge masks were derived before the rejection above, so they are not
// restricted to surviving lanes at this point.
vFlags = _mm512_mask_or_pi(vFlags, mRastScissorEdge2, vFlags, _mm512_upconv_1to16(&cEdgeFlags[2]));
vFlags = _mm512_mask_or_pi(vFlags, mRastScissorEdge3, vFlags, _mm512_upconv_1to16(&cEdgeFlags[3]));
// Keep track of whether any tris need to rasterize scissor edges
mRastScissor = _mm512_kor(mRastScissor, mRastScissorEdge2);
mRastScissor = _mm512_kor(mRastScissor, mRastScissorEdge3);
}
// Finalize mRastScissor and fix up the intersection extents for those lanes.
// We don't care about rasterized scissor edges for prims that ended up being rejected
mRastScissor = _mm512_kand(mRastScissor, mPrimPass);
// If we actually rast scissor edges, need to use original not indented triangle bbox:
// Rast can drop triangle edges it doesn't think necessary because of the scissors.
// This can result in both scissor edge and triangle edge being dropped if the indented
// bbox is used, which can result in a blowout in the general rasterizer.
// For every lane in mRastScissor, all four extents (both axes, both sides) are
// recomputed as original-bbox clamped to scissor, regardless of which edge set
// the flag. Other lanes keep the values chosen by the per-axis passes.
// NOTE(review): assumes the masked max/min forms keep their first argument in
// lanes not selected by the mask - confirm.
XFullIntersectionMin = _mm512_mask_max_pi(XFullIntersectionMin, mRastScissor, vBBoxXMin, vScissorXMin);
XFullIntersectionMax = _mm512_mask_min_pi(XFullIntersectionMax, mRastScissor, vBBoxXMax, vScissorXMax);
YFullIntersectionMin = _mm512_mask_max_pi(YFullIntersectionMin, mRastScissor, vBBoxYMin, vScissorYMin);
YFullIntersectionMax = _mm512_mask_min_pi(YFullIntersectionMax, mRastScissor, vBBoxYMax, vScissorYMax);
// Indented snap4 16x16 candidate test (this case can't deal with rasterized scissor edges)
// NOTE(review): presumably _mm512_kandnr(a, b) is a & ~b, i.e. surviving lanes
// that do NOT rasterize scissor edges, matching the comment above - confirm the
// operand order against the intrinsics reference.
__mmask mIndentedCandidate = _mm512_kandnr(mPrimPass, mRastScissor);
// Width and height of the indented snap4 bbox; the larger of the two is what
// gets compared against the size limit.
_M512I vIndentedBBoxW = _mm512_sub_pi(vIndentedSnap4BBoxXMax, vIndentedSnap4BBoxXMin);
_M512I vIndentedBBoxH = _mm512_sub_pi(vIndentedSnap4BBoxYMax, vIndentedSnap4BBoxYMin);
_M512I vIndentedBBoxSize = _mm512_max_pi(vIndentedBBoxW, vIndentedBBoxH);
// Candidate lanes whose larger dimension is <= pfers->c16Pixels
// (presumably 16 pixels expressed in the bbox's fixed-point format - confirm).
__mmask mIndented16x16 = _mm512_mask_cmple_pi(mIndentedCandidate, vIndentedBBoxSize, _mm512_upconv_1to16(&pfers->c16Pixels));
// Tile bounds for the tris
// Because of top-left fill rule, min is inclusive whereas max is exclusive, which is
// why we subtract 1 from the latter.
// NOTE(review): pfers->cOne is presumably 1 in the smallest fixed-point unit of
// the intersection coordinates - confirm.
_M512I vFullXIsectMaxSub1 = _mm512_sub_pi(XFullIntersectionMax, _mm512_upconv_1to16(&pfers->cOne));
_M512I vFullYIsectMaxSub1 = _mm512_sub_pi(YFullIntersectionMax, _mm512_upconv_1to16(&pfers->cOne));
// Convert the inclusive min / (max - 1) coordinates to tile indices by shifting
// right by the combined tile-size and fixed-point shift.
_M512I vTileX = _mm512_srl_pi(XFullIntersectionMin, _mm512_upconv_1to16(&pfers->TileXShiftPlusFix8Shift));
_M512I vTileY = _mm512_srl_pi(YFullIntersectionMin, _mm512_upconv_1to16(&pfers->TileYShiftPlusFix8Shift));
_M512I vTileMaxX = _mm512_srl_pi(vFullXIsectMaxSub1, _mm512_upconv_1to16(&pfers->TileXShiftPlusFix8Shift));
_M512I vTileMaxY = _mm512_srl_pi(vFullYIsectMaxSub1, _mm512_upconv_1to16(&pfers->TileYShiftPlusFix8Shift));
// Number of tiles spanned minus one, per axis (0 means a single tile on that axis).
_M512I vTileSizeXMinus1 = _mm512_sub_pi(vTileMaxX, vTileX);
_M512I vTileSizeYMinus1 = _mm512_sub_pi(vTileMaxY, vTileY);
// OR the two spans together so one test covers both axes: the result is nonzero
// when either axis has a nonzero span.
_M512I vTileSizesORed = _mm512_or_pi(vTileSizeXMinus1, vTileSizeYMinus1);
// Surviving lanes whose combined span is nonzero, i.e. that touch more than one
// tile (assumes _mm512_mask_test_pi(m, a, b) yields m & ((a & b) != 0) - confirm).
__mmask mMoreThanOneTile = _mm512_mask_test_pi(mPrimPass, vTileSizesORed, vTileSizesORed);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment