Created
November 2, 2019 20:19
-
-
Save nyorain/afad58f105844e8f642838cc706c6413 to your computer and use it in GitHub Desktop.
Alternative luminance mipmapping using SIMD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/src/deferred/luminance.cpp b/src/deferred/luminance.cpp | |
index 92351e1..7e2e922 100644 | |
--- a/src/deferred/luminance.cpp | |
+++ b/src/deferred/luminance.cpp | |
@@ -256,7 +256,7 @@ void LuminancePass::initBuffers(InitBufferData& data, vk::ImageView light, | |
dsu.apply(); | |
// mip levels | |
- const auto mf = (mipGroupDimSize * 4); // minification factor | |
+ const auto mf = (mipGroupDimSize * 8); // minification factor | |
const u32 shift = std::log2(mf); // mf is power always of two | |
auto i = shift; | |
@@ -363,7 +363,7 @@ void LuminancePass::record(vk::CommandBuffer cb, vk::Extent2D size) { | |
tkn::cmdBindComputeDescriptors(cb, extract_.pipeLayout, 0, {extract_.ds}); | |
vk::cmdDispatch(cb, cx, cy, 1); | |
- const auto mf = (mipGroupDimSize * 4); // minification factor | |
+ const auto mf = (mipGroupDimSize * 8); // minification factor | |
auto prevLevel = 0u; | |
auto i = mip_.target0; | |
vk::cmdBindPipeline(cb, vk::PipelineBindPoint::compute, mip_.pipe); | |
diff --git a/src/deferred/luminanceMip.comp b/src/deferred/luminanceMip.comp | |
index 5d3711e..82b9038 100644 | |
--- a/src/deferred/luminanceMip.comp | |
+++ b/src/deferred/luminanceMip.comp | |
@@ -33,8 +33,16 @@ layout(push_constant) uniform PCR { | |
const uint size = gl_WorkGroupSize.x; // == gl_WorkGroupSize.y | |
vec2 pixelSize = 1.f / textureSize(inLum, 0); | |
+// TODO: optimization | |
+// make groups smaller and shared memory larger (locally reduce a whole vec4 per invocation) | |
+// bandwidth is probably our bottleneck anyways, no gpu has that many | |
+// memory lanes. | |
+// even more? half the invocations are not being used even in the first iteration | |
+// currently | |
+ | |
// contains tee current summed-up luminance | |
-shared float lum[size][size]; | |
+// we use vec4s to be able to use simd when adding them up | |
+shared vec4 lum[size][size]; | |
float load(vec2 pixel) { | |
vec2 dist = clamp(pcr.inSize - (pixel - 0.5), 0, 1); | |
@@ -42,20 +50,45 @@ float load(vec2 pixel) { | |
return fac * texture(inLum, min(pixel, pcr.inSize - 0.5) * pixelSize).r; | |
} | |
-// no early returns due to all the barriers. We use a sampler | |
-// with a black border and clampToBorder instead. | |
+float load4(vec2 pixel) { | |
+ return load(pixel + vec2(0, 0)) | |
+ + load(pixel + vec2(2, 0)) | |
+ + load(pixel + vec2(0, 2)) | |
+ + load(pixel + vec2(2, 2)); | |
+} | |
+ | |
+// no early returns due to all the barriers. | |
void main() { | |
uvec2 l = gl_LocalInvocationID.xy; | |
- vec2 pixel = 4 * gl_GlobalInvocationID.xy; // top-left of sampled pixels | |
- pixel += 1; // for linear sampling | |
+ vec2 pixel = 8 * gl_GlobalInvocationID.xy; // top-left of sampled pixels | |
+ pixel += 1; // for linear sampling, allows us to load 4 pixels per texture call | |
// first reduction locally | |
- float val = 0.0; | |
- val += load(pixel + vec2(0, 0)); | |
- val += load(pixel + vec2(2, 0)); | |
- val += load(pixel + vec2(0, 2)); | |
- val += load(pixel + vec2(2, 2)); | |
- lum[l.x][l.y] = val; | |
+ // float val = 0.0; | |
+ // val += load(pixel + vec2(0, 0)); | |
+ // val += load(pixel + vec2(2, 0)); | |
+ // val += load(pixel + vec2(0, 2)); | |
+ // val += load(pixel + vec2(2, 2)); | |
+ | |
+ // uint id; | |
+ // if(l.x % 2 == 0 && l.y % 2 == 0) lum[l.x / 2][l.y / 2][0] = val; | |
+ // else if(l.x % 2 == 1 && l.y % 2 == 0) lum[l.x / 2][l.y / 2][1] = val; | |
+ // else if(l.x % 2 == 0 && l.y % 2 == 1) lum[l.x / 2][l.y / 2][2] = val; | |
+ // else if(l.x % 2 == 1 && l.y % 2 == 1) lum[l.x / 2][l.y / 2][3] = val; | |
+ | |
+ // uint id = 2 * (l.y % 2) + l.x % 2; // unique mapping {0, 1}^2 -> {0, 1, 2, 3} | |
+ // lum[l.x / 2][l.y / 2][id] = val; | |
+ | |
+ lum[l.x][l.y] = vec4( | |
+ load4(pixel + vec2(0, 0)), | |
+ load4(pixel + vec2(4, 0)), | |
+ load4(pixel + vec2(0, 4)), | |
+ load4(pixel + vec2(4, 4))); | |
+ // lum[l.x][l.y] = vec4( | |
+ // load(pixel + vec2(0, 0)), | |
+ // load(pixel + vec2(2, 0)), | |
+ // load(pixel + vec2(0, 2)), | |
+ // load(pixel + vec2(2, 2))); | |
for(uint isize = size / 2; isize > 0; isize /= 2) { | |
// one barrier is enough, memoryBarrierShared is not needed. | |
// See GL_KHR_vulkan_glsl and spirv spec | |
@@ -70,7 +103,8 @@ void main() { | |
} | |
if(l.x == 0 && l.y == 0) { | |
- float avg = lum[0][0] / (4 * size * size); | |
+ float avg = dot(lum[0][0], vec4(1.0)) / (16 * size * size); | |
+ // float avg = dot(lum[0][0], vec4(1.0)) / (4 * size * size); | |
imageStore(outLum, ivec2(gl_WorkGroupID.xy), vec4(avg)); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This doesn't seem to be faster, though (it is rather 0.1 ms slower). Not sure why; it should be faster in theory, I guess.