nyorain · November 2, 2019 20:19 · nyorain · Nov 2, 2019
diff --git a/luminance.diff b/luminance.diff
 diff --git a/src/deferred/luminance.cpp b/src/deferred/luminance.cpp
 index 92351e1..7e2e922 100644
 --- a/src/deferred/luminance.cpp
 +++ b/src/deferred/luminance.cpp
 @@ -256,7 +256,7 @@ void LuminancePass::initBuffers(InitBufferData& data, vk::ImageView light,
 		dsu.apply();
 
 		// mip levels
 -		const auto mf = (mipGroupDimSize * 4); // minification factor
 +		const auto mf = (mipGroupDimSize * 8); // minification factor
 		const u32 shift = std::log2(mf); // mf is power always of two
 		auto i = shift;
 
 @@ -363,7 +363,7 @@ void LuminancePass::record(vk::CommandBuffer cb, vk::Extent2D size) {
 		tkn::cmdBindComputeDescriptors(cb, extract_.pipeLayout, 0, {extract_.ds});
 		vk::cmdDispatch(cb, cx, cy, 1);
 
 -		const auto mf = (mipGroupDimSize * 4); // minification factor
 +		const auto mf = (mipGroupDimSize * 8); // minification factor
 		auto prevLevel = 0u;
 		auto i = mip_.target0;
 		vk::cmdBindPipeline(cb, vk::PipelineBindPoint::compute, mip_.pipe);
 diff --git a/src/deferred/luminanceMip.comp b/src/deferred/luminanceMip.comp
 index 5d3711e..82b9038 100644
 --- a/src/deferred/luminanceMip.comp
 +++ b/src/deferred/luminanceMip.comp
 @@ -33,8 +33,16 @@ layout(push_constant) uniform PCR {
 const uint size = gl_WorkGroupSize.x; // == gl_WorkGroupSize.y
 vec2 pixelSize = 1.f / textureSize(inLum, 0);
 
 +// TODO: optimization
 +// make groups smaller and shared memory larger (locally reduce a whole vec4 per invocation)
 +// bandwidth is probably our bottleneck anyways, no gpu has that many
 +// memory lanes.
 +// even more? half the invocations are not being used even in the first iteration
 +// currently
 +
 // contains tee current summed-up luminance
 -shared float lum[size][size];
 +// we use vec4s to be able to use simd when adding them up
 +shared vec4 lum[size][size];
 
 float load(vec2 pixel) {
 	vec2 dist = clamp(pcr.inSize - (pixel - 0.5), 0, 1);
 @@ -42,20 +50,45 @@ float load(vec2 pixel) {
 	return fac * texture(inLum, min(pixel, pcr.inSize - 0.5) * pixelSize).r;
 }
 
 -// no early returns due to all the barriers. We use a sampler
 -// with a black border and clampToBorder instead.
 +float load4(vec2 pixel) {
 +	return load(pixel + vec2(0, 0))
 +		+ load(pixel + vec2(2, 0))
 +		+ load(pixel + vec2(0, 2))
 +		+ load(pixel + vec2(2, 2));
 +}
 +
 +// no early returns due to all the barriers.
 void main() {
 	uvec2 l = gl_LocalInvocationID.xy;
 -	vec2 pixel = 4 * gl_GlobalInvocationID.xy; // top-left of sampled pixels
 -	pixel += 1; // for linear sampling
 +	vec2 pixel = 8 * gl_GlobalInvocationID.xy; // top-left of sampled pixels
 +	pixel += 1; // for linear sampling, allows us to load 4 pixels per texture call
 
 	// first reduction locally
 -	float val = 0.0;
 -	val += load(pixel + vec2(0, 0));
 -	val += load(pixel + vec2(2, 0));
 -	val += load(pixel + vec2(0, 2));
 -	val += load(pixel + vec2(2, 2));
 -	lum[l.x][l.y] = val;
 +	// float val = 0.0;
 +	// val += load(pixel + vec2(0, 0));
 +	// val += load(pixel + vec2(2, 0));
 +	// val += load(pixel + vec2(0, 2));
 +	// val += load(pixel + vec2(2, 2));
 +
 +	// uint id;
 +	// if(l.x % 2 == 0 && l.y % 2 == 0) lum[l.x / 2][l.y / 2][0] = val;
 +	// else if(l.x % 2 == 1 && l.y % 2 == 0) lum[l.x / 2][l.y / 2][1] = val;
 +	// else if(l.x % 2 == 0 && l.y % 2 == 1) lum[l.x / 2][l.y / 2][2] = val;
 +	// else if(l.x % 2 == 1 && l.y % 2 == 1) lum[l.x / 2][l.y / 2][3] = val;
 +
 +	// uint id = 2 * (l.y % 2) + l.x % 2; // unique mapping {0, 1}^2 -> {0, 1, 2, 3}
 +	// lum[l.x / 2][l.y / 2][id] = val;
 +
 +	lum[l.x][l.y] = vec4(
 +		load4(pixel + vec2(0, 0)),
 +		load4(pixel + vec2(4, 0)),
 +		load4(pixel + vec2(0, 4)),
 +		load4(pixel + vec2(4, 4)));
 +	// lum[l.x][l.y] = vec4(
 +	// 	load(pixel + vec2(0, 0)),
 +	// 	load(pixel + vec2(2, 0)),
 +	// 	load(pixel + vec2(0, 2)),
 +	// 	load(pixel + vec2(2, 2)));
 	for(uint isize = size / 2; isize > 0; isize /= 2) {
 		// one barrier is enough, memoryBarrierShared is not needed.
 		// See GL_KHR_vulkan_glsl and spirv spec
 @@ -70,7 +103,8 @@ void main() {
 	}
 
 	if(l.x == 0 && l.y == 0) {
 -		float avg = lum[0][0] / (4 * size * size);
 +		float avg = dot(lum[0][0], vec4(1.0)) / (16 * size * size);
 +		// float avg = dot(lum[0][0], vec4(1.0)) / (4 * size * size);
 		imageStore(outLum, ivec2(gl_WorkGroupID.xy), vec4(avg));
 	}
 }
	diff --git a/src/deferred/luminance.cpp b/src/deferred/luminance.cpp
	index 92351e1..7e2e922 100644
	--- a/src/deferred/luminance.cpp
	+++ b/src/deferred/luminance.cpp
	@@ -256,7 +256,7 @@ void LuminancePass::initBuffers(InitBufferData& data, vk::ImageView light,
	dsu.apply();

	// mip levels
	- const auto mf = (mipGroupDimSize * 4); // minification factor
	+ const auto mf = (mipGroupDimSize * 8); // minification factor
	const u32 shift = std::log2(mf); // mf is power always of two
	auto i = shift;

	@@ -363,7 +363,7 @@ void LuminancePass::record(vk::CommandBuffer cb, vk::Extent2D size) {
	tkn::cmdBindComputeDescriptors(cb, extract_.pipeLayout, 0, {extract_.ds});
	vk::cmdDispatch(cb, cx, cy, 1);

	- const auto mf = (mipGroupDimSize * 4); // minification factor
	+ const auto mf = (mipGroupDimSize * 8); // minification factor
	auto prevLevel = 0u;
	auto i = mip_.target0;
	vk::cmdBindPipeline(cb, vk::PipelineBindPoint::compute, mip_.pipe);
	diff --git a/src/deferred/luminanceMip.comp b/src/deferred/luminanceMip.comp
	index 5d3711e..82b9038 100644
	--- a/src/deferred/luminanceMip.comp
	+++ b/src/deferred/luminanceMip.comp
	@@ -33,8 +33,16 @@ layout(push_constant) uniform PCR {
	const uint size = gl_WorkGroupSize.x; // == gl_WorkGroupSize.y
	vec2 pixelSize = 1.f / textureSize(inLum, 0);

	+// TODO: optimization
	+// make groups smaller and shared memory larger (locally reduce a whole vec4 per invocation)
	+// bandwidth is probably our bottleneck anyways, no gpu has that many
	+// memory lanes.
	+// even more? half the invocations are not being used even in the first iteration
	+// currently
	+
	// contains tee current summed-up luminance
	-shared float lum[size][size];
	+// we use vec4s to be able to use simd when adding them up
	+shared vec4 lum[size][size];

	float load(vec2 pixel) {
	vec2 dist = clamp(pcr.inSize - (pixel - 0.5), 0, 1);
	@@ -42,20 +50,45 @@ float load(vec2 pixel) {
	return fac * texture(inLum, min(pixel, pcr.inSize - 0.5) * pixelSize).r;
	}

	-// no early returns due to all the barriers. We use a sampler
	-// with a black border and clampToBorder instead.
	+float load4(vec2 pixel) {
	+ return load(pixel + vec2(0, 0))
	+ + load(pixel + vec2(2, 0))
	+ + load(pixel + vec2(0, 2))
	+ + load(pixel + vec2(2, 2));
	+}
	+
	+// no early returns due to all the barriers.
	void main() {
	uvec2 l = gl_LocalInvocationID.xy;
	- vec2 pixel = 4 * gl_GlobalInvocationID.xy; // top-left of sampled pixels
	- pixel += 1; // for linear sampling
	+ vec2 pixel = 8 * gl_GlobalInvocationID.xy; // top-left of sampled pixels
	+ pixel += 1; // for linear sampling, allows us to load 4 pixels per texture call

	// first reduction locally
	- float val = 0.0;
	- val += load(pixel + vec2(0, 0));
	- val += load(pixel + vec2(2, 0));
	- val += load(pixel + vec2(0, 2));
	- val += load(pixel + vec2(2, 2));
	- lum[l.x][l.y] = val;
	+ // float val = 0.0;
	+ // val += load(pixel + vec2(0, 0));
	+ // val += load(pixel + vec2(2, 0));
	+ // val += load(pixel + vec2(0, 2));
	+ // val += load(pixel + vec2(2, 2));
	+
	+ // uint id;
	+ // if(l.x % 2 == 0 && l.y % 2 == 0) lum[l.x / 2][l.y / 2][0] = val;
	+ // else if(l.x % 2 == 1 && l.y % 2 == 0) lum[l.x / 2][l.y / 2][1] = val;
	+ // else if(l.x % 2 == 0 && l.y % 2 == 1) lum[l.x / 2][l.y / 2][2] = val;
	+ // else if(l.x % 2 == 1 && l.y % 2 == 1) lum[l.x / 2][l.y / 2][3] = val;
	+
	+ // uint id = 2 * (l.y % 2) + l.x % 2; // unique mapping {0, 1}^2 -> {0, 1, 2, 3}
	+ // lum[l.x / 2][l.y / 2][id] = val;
	+
	+ lum[l.x][l.y] = vec4(
	+ load4(pixel + vec2(0, 0)),
	+ load4(pixel + vec2(4, 0)),
	+ load4(pixel + vec2(0, 4)),
	+ load4(pixel + vec2(4, 4)));
	+ // lum[l.x][l.y] = vec4(
	+ // load(pixel + vec2(0, 0)),
	+ // load(pixel + vec2(2, 0)),
	+ // load(pixel + vec2(0, 2)),
	+ // load(pixel + vec2(2, 2)));
	for(uint isize = size / 2; isize > 0; isize /= 2) {
	// one barrier is enough, memoryBarrierShared is not needed.
	// See GL_KHR_vulkan_glsl and spirv spec
	@@ -70,7 +103,8 @@ void main() {
	}

	if(l.x == 0 && l.y == 0) {
	- float avg = lum[0][0] / (4 * size * size);
	+ float avg = dot(lum[0][0], vec4(1.0)) / (16 * size * size);
	+ // float avg = dot(lum[0][0], vec4(1.0)) / (4 * size * size);
	imageStore(outLum, ivec2(gl_WorkGroupID.xy), vec4(avg));
	}
	}