Created
April 2, 2018 15:23
-
-
Save reinsteam/5c5f5a4e41c0f13f08c75688d563b222 to your computer and use it in GitHub Desktop.
Profiling stats, as reported by [Pyramid](https://github.com/jbarczak/Pyramid), for a simple triangle-filtering compute shader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*-----------------------------------------------------------------------------------------------------------------------
 * Output from Pyramid:
 *
 * SGPRs: 30 / 102
 * VGPRs: 20 / 256
 * LDS bytes/tg 32 / 32768
 * Waves/Group: 4
 * Occupancy:
 *     S: 10 waves/SIMD
 *     V: 10 waves/SIMD
 *     L: 2048 groups/CU
 *        8192 waves/CU
 *        2048.00 waves/SIMD
 * Ops:
 *     VALU: 88
 *     S: 35
 *     VMEM: 5
 *
 *--------------------------------------------------------------------------------------------------------------------*/
// Source index stream, read as raw dwords; MainCS unpacks 16-bit indices from it.
ByteAddressBuffer IdxBuffer;

// Vertex positions, addressed by MainCS at a 12-byte stride (three floats per vertex).
// For simplicity the buffer is assumed to hold clip-space X, Y and view-space Z
// (which equals clip-space W).
ByteAddressBuffer PosBuffer;

// Output buffer; MainCS stores the per-group survivor bitmasks here.
RWByteAddressBuffer OutIdxBuffer;

// Culling option bits; bit 0 (0x1) enables back-face culling in MainCS.
uint cbCullingFlags;
// Returns the smallest of the three components of v.
float Min3(float3 v)
{
    const float minXY = min(v.x, v.y);
    return min(minXY, v.z);
}
// Returns the largest of the three components of v.
float Max3(float3 v)
{
    const float maxXY = max(v.x, v.y);
    return max(maxXY, v.z);
}
// Per-group survivor bitmask: 256 threads / 32 bits per word = 8 words.
// Bit (LocalId & 31) of word (LocalId >> 5) is set when that thread's triangle is kept.
groupshared uint Mask[8];

/*
 * Triangle filtering kernel: one thread per triangle, 256 triangles per group.
 *
 * Each thread fetches its three 16-bit indices and the matching vertices, runs the
 * culling tests below, and contributes one bit (1 = triangle survives) to the
 * group-shared Mask. Threads 0 and 1 of the group then write the eight mask words
 * to OutIdxBuffer.
 *
 * ThreadId - global thread index (SV_DispatchThreadID.x); selects the triangle.
 * LocalId  - flattened thread index within the group (SV_GroupIndex), 0..255.
 *            This must be the in-group index, not the group ID: it selects the mask
 *            bit, the threads that clear the mask, and the threads that write it out.
 */
[numthreads(256, 1, 1)]
void MainCS(uint ThreadId : SV_DispatchThreadID, uint LocalId : SV_GroupIndex)
{
    // The first 8 threads clear one mask word each.
    [branch] if (LocalId < 8)
    {
        Mask[LocalId] = 0;
    }
    GroupMemoryBarrierWithGroupSync();

    // Assume 16-bit indices (32-bit are obvious).
    // Two triangles (6 indices) occupy 3 dwords. The even triangle of a pair starts at
    // the low half of dword 0, the odd one at the high half of dword 1, so each thread
    // loads the two dwords that contain its three indices.
    const uint Uint3Idx = ThreadId >> 1;
    const uint Uint3Ofs = ThreadId & 1;
    const uint2 Packet = IdxBuffer.Load2((Uint3Idx * 3 + Uint3Ofs) << 2);

    uint Idx0, Idx1, Idx2;
    if (Uint3Ofs)
    {
        Idx0 = Packet.x >> 16;
        Idx1 = Packet.y & 0xffff;
        Idx2 = Packet.y >> 16;
    }
    else
    {
        Idx0 = Packet.x & 0xffff;
        Idx1 = Packet.x >> 16;
        Idx2 = Packet.y & 0xffff;
    }

    /* Load vertices */
    // Vertices are 12 bytes apart, so load exactly three dwords; a 4-dword load would
    // read past the end of the buffer for the last vertex.
    const float3 Vtx0 = asfloat(PosBuffer.Load3(Idx0 * 12));
    const float3 Vtx1 = asfloat(PosBuffer.Load3(Idx1 * 12));
    const float3 Vtx2 = asfloat(PosBuffer.Load3(Idx2 * 12));

    /* Cull degenerate triangles */
    bool culled = (Idx0 == Idx1) || (Idx1 == Idx2) || (Idx2 == Idx0);

    /* Backface Culling */
    // Optional (bit 0 of cbCullingFlags): a positive determinant of the three
    // (x, y, w) rows is treated as back-facing.
    [flatten] if (cbCullingFlags & 0x1)
    {
        culled = culled || determinant(float3x3(Vtx0, Vtx1, Vtx2)) > 0.0;
    }

    // Regroup per component so the tests below run on all three vertices at once.
    float3 VtxX = float3(Vtx0.x, Vtx1.x, Vtx2.x);
    float3 VtxY = float3(Vtx0.y, Vtx1.y, Vtx2.y);
    float3 VtxW = float3(Vtx0.z, Vtx1.z, Vtx2.z);

    /* Near Plane Culling */
    // Rejected only when all three vertices have negative W.
    culled = culled || all(VtxW < 0.0);

    /* Clip Space Culling */
    // Convert from clip space to NDC and then to screen space:
    // divide by |W|, then map X to 0.5 + 0.5 * x and Y to 0.5 - 0.5 * y (Y is flipped).
    VtxW = rcp(abs(VtxW));
    VtxX *= VtxW * 0.5;
    VtxY *= VtxW * -0.5;
    VtxX += 0.5;
    VtxY += 0.5;

    float2 Min = float2(Min3(VtxX), Min3(VtxY));
    float2 Max = float2(Max3(VtxX), Max3(VtxY));

    // Reject when the bounding box rounds to the same value on either axis.
    // NOTE(review): Min/Max are in normalized [0, 1] units here, not pixels; a
    // pixel-accurate small-primitive test would presumably scale by the viewport
    // size first - confirm intent.
    culled = culled || any(round(Min) == round(Max));

    /* Frustum Culling */
    // Reject when the bounding box lies entirely outside [0, 1] on either axis.
    culled = culled || any(Max < 0.0) || any(Min > 1.0);

    /* Update local mask */
    const uint SlotIdx = LocalId >> 5;
    const uint MaskBit = LocalId & 0x1f;
    const uint KeepBit = culled ? 0u : 1u;
    InterlockedOr(Mask[SlotIdx], KeepBit << MaskBit);
    GroupMemoryBarrierWithGroupSync();

    // Output global mask:
    // threads 0 and 1 each store four consecutive mask words (words 0-3 and 4-7).
    [branch] if (LocalId < 2)
    {
        const uint GroupId = ThreadId >> 8;
        // Byte offset below is (2 * GroupId + LocalId) * 64.
        // NOTE(review): each Store4 writes only 16 bytes, so 48 bytes stay unused
        // after every store - confirm this stride matches what the consumer expects.
        const uint MemSlotId = ((GroupId << 1) + LocalId) << 4;
        const uint LdsSlotId = LocalId << 2;
        OutIdxBuffer.Store4(MemSlotId << 2, uint4(Mask[LdsSlotId], Mask[LdsSlotId + 1], Mask[LdsSlotId + 2], Mask[LdsSlotId + 3]));
    }

    // Output indices:
    //OutIdxBuffer.Store2((ThreadId * 3) << 2, uint2(Idx0, Idx1));
    //OutIdxBuffer.Store((ThreadId * 3 + 2) << 2, Idx2);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment