Created
February 6, 2023 22:39
-
-
Save devshgraphicsprogramming/e6733bd4cb89076b6d3ba7dbd28c3635 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 1 "direct.comp" | |
# 1 "<built-in>" | |
# 1 "<command-line>" | |
# 1 "direct.comp" | |
// Compute workgroup size: 256 invocations along X. The same 256 appears as a
// literal in the scan and pseudo-subgroup code further down (e.g. `256 - 1`).
layout(local_size_x = 256) in;
# 1 "../../../../nbl/builtin/glsl/scan/descriptors.glsl" 1 | |
# 13 "../../../../nbl/builtin/glsl/scan/descriptors.glsl" | |
# 1 "../../../../nbl/builtin/glsl/scan/declarations.glsl" 1 | |
# 1 "../../../../nbl/builtin/glsl/scan/parameters_struct.glsl" 1 | |
# 10 "../../../../nbl/builtin/glsl/scan/parameters_struct.glsl" | |
// Parameter block returned by nbl_glsl_scan_getParameters().
struct nbl_glsl_scan_Parameters_t
{
    // Compared against `treeLevel` in nbl_glsl_scan_setData.
    uint topLevel;
    // (7 / 2) + 1 == 4 entries; not read by any function visible in this excerpt.
    uint lastElement[(7 / 2) + 1];
    // (7 / 2) == 3 entries; added to indices into scanScratch.data.
    uint temporaryStorageOffset[(7 / 2)];
};
# 6 "../../../../nbl/builtin/glsl/scan/declarations.glsl" 2 | |
// Forward declaration: supplies the scan parameter block.
// No definition is visible in this excerpt.
nbl_glsl_scan_Parameters_t nbl_glsl_scan_getParameters();
# 19 "../../../../nbl/builtin/glsl/scan/declarations.glsl" | |
// Forward declaration of the scan loader. `data` is read-modify-write (inout);
// the four indices/levels are inputs only.
void nbl_glsl_scan_getData(inout _NBL_GLSL_SCAN_STORAGE_TYPE_ data,
                           in uint levelInvocationIndex, in uint localWorkgroupIndex,
                           in uint treeLevel, in uint pseudoLevel);
// Forward declaration of the scan writer. All arguments are inputs; `inRange`
// gates the final write (see the definition).
void nbl_glsl_scan_setData(in _NBL_GLSL_SCAN_STORAGE_TYPE_ data,
                           in uint levelInvocationIndex, in uint localWorkgroupIndex,
                           in uint treeLevel, in uint pseudoLevel,
                           in bool inRange);
# 14 "../../../../nbl/builtin/glsl/scan/descriptors.glsl" 2 | |
// Storage buffer at set 0 / binding 0: the unsized array that
// nbl_glsl_scan_getData reads and nbl_glsl_scan_setData writes at pseudoLevel 0.
layout(std430, set = 0, binding = 0) restrict buffer ScanBuffer
{
    _NBL_GLSL_SCAN_STORAGE_TYPE_ data[];
} scanBuffer;
// Storage buffer at set 0 / binding 1 (coherent): one leading counter followed by
// the unsized uint scratch array indexed via params.temporaryStorageOffset[].
layout(std430, set = 0, binding = 1) coherent restrict buffer ScanScratchBuffer
{
    // Not accessed by any function visible in this excerpt.
    uint workgroupsStarted;
    uint data[];
} scanScratch;
// Loads (or accumulates into) `data` for one invocation of the scan.
//
// - pseudoLevel == treeLevel: `data` is overwritten from scanBuffer (pseudoLevel 0)
//   or from scanScratch at levelInvocationIndex + temporaryStorageOffset[pseudoLevel - 1].
// - otherwise: the first invocation of every workgroup except workgroup 0 first
//   replaces `data` with a scratch entry of level `pseudoLevel`; then, at pseudoLevel 0
//   the scanBuffer element is added to `data`, while at other levels every invocation
//   but the first replaces `data` with the scratch entry just before its own.
// In the remaining case `data` keeps the value passed in by the caller.
void nbl_glsl_scan_getData(
    inout _NBL_GLSL_SCAN_STORAGE_TYPE_ data,
    in uint levelInvocationIndex,
    in uint localWorkgroupIndex,
    in uint treeLevel,
    in uint pseudoLevel)
{
    const nbl_glsl_scan_Parameters_t params = nbl_glsl_scan_getParameters();

    // Any level other than 0 lives in the scratch buffer, offset per level.
    const bool usesScratch = bool(pseudoLevel);
    uint offset = levelInvocationIndex;
    if (usesScratch)
        offset += params.temporaryStorageOffset[pseudoLevel - 1u];

    if (pseudoLevel == treeLevel)
    {
        if (usesScratch)
            data = scanScratch.data[offset];
        else
            data = scanBuffer.data[offset];
        return;
    }

    const bool firstInvocationInGroup = gl_LocalInvocationIndex == 0u;
    if (firstInvocationInGroup && bool(localWorkgroupIndex))
        data = scanScratch.data[localWorkgroupIndex + params.temporaryStorageOffset[pseudoLevel]];

    if (!usesScratch)
        data += scanBuffer.data[offset];
    else if (!firstInvocationInGroup)
        data = scanScratch.data[offset - 1u];
}
// Stores `data` produced by one invocation of the scan.
//
// - treeLevel < params.topLevel: only the last invocation of the workgroup
//   (local index 256 - 1) writes, to scratch at
//   localWorkgroupIndex + temporaryStorageOffset[treeLevel]; `inRange` is ignored.
// - otherwise, and only when `inRange` is true: writes to scanBuffer at
//   levelInvocationIndex (pseudoLevel 0) or to scratch at
//   levelInvocationIndex + temporaryStorageOffset[pseudoLevel - 1].
void nbl_glsl_scan_setData(
    in _NBL_GLSL_SCAN_STORAGE_TYPE_ data,
    in uint levelInvocationIndex,
    in uint localWorkgroupIndex,
    in uint treeLevel,
    in uint pseudoLevel,
    in bool inRange)
{
    const nbl_glsl_scan_Parameters_t params = nbl_glsl_scan_getParameters();

    if (treeLevel < params.topLevel)
    {
        if (gl_LocalInvocationIndex == (256 - 1))
            scanScratch.data[localWorkgroupIndex + params.temporaryStorageOffset[treeLevel]] = data;
        return;
    }

    if (!inRange)
        return;

    if (bool(pseudoLevel))
        scanScratch.data[levelInvocationIndex + params.temporaryStorageOffset[pseudoLevel - 1u]] = data;
    else
        scanBuffer.data[levelInvocationIndex] = data;
}
# 8 "direct.comp" 2 | |
# 1 "../../../../nbl/builtin/glsl/scan/virtual_workgroup.glsl" 1 | |
# 1 "../../../../nbl/builtin/glsl/limits/numeric.glsl" 1 | |
# 42 "../../../../nbl/builtin/glsl/limits/numeric.glsl" | |
# 1 "../../../../nbl/builtin/glsl/ieee754.glsl" 1 | |
// Returns 2^(exponentBits - 1) - 1, e.g. 127 for 8 exponent bits.
uint nbl_glsl_ieee754_exponent_bias(in uint exponentBits)
{
    const uint halfRange = 1u << (exponentBits - 1u);
    return halfRange - 1u;
}
// Returns bits 23..30 of the bit pattern of `x` (the 8-bit biased exponent field).
uint nbl_glsl_ieee754_extract_biased_exponent(float x)
{
    const uint bits = floatBitsToUint(x);
    return (bits >> 23) & 0xffu;
}
// Returns the biased exponent field of `x` minus the 8-bit exponent bias, as a signed int.
int nbl_glsl_ieee754_extract_exponent(float x)
{
    const uint biased = nbl_glsl_ieee754_extract_biased_exponent(x);
    const uint bias = nbl_glsl_ieee754_exponent_bias(8u);
    // The unsigned difference wraps for small exponents; the int conversion restores the sign.
    return int(biased - bias);
}
// Returns a mask of `exponentBits` consecutive ones shifted left by `mantissaBits`.
uint nbl_glsl_ieee754_compute_exponent_mask(in uint exponentBits, in uint mantissaBits)
{
    const int exponentOnes = (1 << exponentBits) - 1;
    return uint(exponentOnes << mantissaBits);
}
// Returns `x` with bits 23..30 of its bit pattern replaced by the low 8 bits of
// `exp_plus_bias`; the sign and the low 23 bits are kept.
float nbl_glsl_ieee754_replace_biased_exponent(float x, uint exp_plus_bias)
{
    const uint exponentField = 0xffu << 23;
    const uint kept = floatBitsToUint(x) & ~exponentField;
    const uint inserted = (exp_plus_bias & 0xffu) << 23;
    return uintBitsToFloat(kept | inserted);
}
// Adds `n` to the biased exponent field of `x` and returns the resulting float.
// Only the low 8 bits of the sum are written back, so there is no handling of
// exponent overflow/underflow, zero, or denormal inputs.
float nbl_glsl_ieee754_fast_mul_exp2(float x, int n)
{
    const uint shiftedExponent = nbl_glsl_ieee754_extract_biased_exponent(x) + uint(n);
    return nbl_glsl_ieee754_replace_biased_exponent(x, shiftedExponent);
}
// Returns a mask with the low `mantissaBits` bits set (mantissaBits < 32).
uint nbl_glsl_ieee754_compute_mantissa_mask(in uint mantissaBits)
{
    const uint aboveMantissa = 0xffffffffu << mantissaBits;
    return ~aboveMantissa;
}
// Returns the low 23 bits of the bit pattern of `x` (the stored mantissa field).
uint nbl_glsl_ieee754_extract_mantissa(in float x)
{
    return bitfieldExtract(floatBitsToUint(x), 0, 23);
}
// Returns 2^(1 - bias - mantissaBits), where bias is
// nbl_glsl_ieee754_exponent_bias(exponentBits). For a binary floating-point format
// with that many exponent and mantissa bits this is the magnitude of its smallest
// subnormal. The result is computed as a 32-bit float, so for formats as wide as
// float32 it may be flushed to zero on hardware without denormal support.
float nbl_glsl_ieee754_true_min(in uint exponentBits, in uint mantissaBits)
{
    // Do the whole subtraction in signed arithmetic. Subtracting the uint
    // `mantissaBits` from an int promotes the expression to uint, which wraps the
    // negative exponent to ~4e9 and makes exp2() return +inf.
    const int bias = int(nbl_glsl_ieee754_exponent_bias(exponentBits));
    const int exponent = 1 - bias - int(mantissaBits);
    return exp2(float(exponent));
}
// Builds the float 2^(1 - bias) and ORs bit (23 - mantissaBits) into its bit pattern,
// where bias is nbl_glsl_ieee754_exponent_bias(exponentBits).
float nbl_glsl_ieee754_min(in uint exponentBits, in uint mantissaBits)
{
    const int minExponent = 1 - int(nbl_glsl_ieee754_exponent_bias(exponentBits));
    const uint exponentPattern = floatBitsToUint(exp2(float(minExponent)));
    const uint mantissaBit = 1u << (23u - mantissaBits);
    return uintBitsToFloat(exponentPattern | mantissaBit);
}
// Builds the float 2^(maxBiasedExponent - bias), with maxBiasedExponent = 2^exponentBits - 2,
// and ORs in the top `mantissaBits` bits of the 23-bit float32 mantissa field.
float nbl_glsl_ieee754_max(in uint exponentBits, in uint mantissaBits)
{
    const uint maxBiasedExponent = uint((1 << exponentBits) - 2);
    const uint unbiased = maxBiasedExponent - nbl_glsl_ieee754_exponent_bias(exponentBits);
    const uint exponentPattern = floatBitsToUint(exp2(float(unbiased)));
    const uint mantissaPattern = (0x7fFFffu << (23u - mantissaBits)) & 0x7fFFffu;
    return uintBitsToFloat(exponentPattern | mantissaPattern);
}
// Packs an unsigned float: (exponent + bias) shifted above `mantissaBits`, ORed with
// the top `mantissaBits` bits of a 23-bit `mantissa`.
uint nbl_glsl_ieee754_encode_ufloat_impl(in int exponent, in uint exponentBits, in uint mantissa, in uint mantissaBits)
{
    const uint biasedExponent = uint(exponent) + nbl_glsl_ieee754_exponent_bias(exponentBits);
    const uint truncatedMantissa = mantissa >> (23u - mantissaBits);
    return (biasedExponent << mantissaBits) | truncatedMantissa;
}
// Forward declarations of the three epsilon overloads, so the gamma / error-bound
// helpers below can call them before their definitions.
float nbl_glsl_numeric_limits_float_epsilon(float n);
float nbl_glsl_numeric_limits_float_epsilon(int n);
float nbl_glsl_numeric_limits_float_epsilon();
// Returns e / (1 - e), with e = nbl_glsl_numeric_limits_float_epsilon(n).
float nbl_glsl_ieee754_gamma(float n)
{
    const float scaledEpsilon = nbl_glsl_numeric_limits_float_epsilon(n);
    return scaledEpsilon / (1.f - scaledEpsilon);
}
// Returns 1 / e - 1, with e = nbl_glsl_numeric_limits_float_epsilon(n);
// algebraically the reciprocal of nbl_glsl_ieee754_gamma(n).
float nbl_glsl_ieee754_rcpgamma(float n)
{
    const float scaledEpsilon = nbl_glsl_numeric_limits_float_epsilon(n);
    return 1.f / scaledEpsilon - 1.f;
}
// uint convenience overload: converts `n` to float and forwards to the float overload.
float nbl_glsl_ieee754_gamma(uint n)
{
    const float nAsFloat = float(n);
    return nbl_glsl_ieee754_gamma(nAsFloat);
}
// uint convenience overload: converts `n` to float and forwards to the float overload.
float nbl_glsl_ieee754_rcpgamma(uint n)
{
    const float nAsFloat = float(n);
    return nbl_glsl_ieee754_rcpgamma(nAsFloat);
}
// Returns a + b and writes to `error`:
//   (a_error + b_error) / nbl_glsl_numeric_limits_float_epsilon(1u) + |a + b|.
vec3 nbl_glsl_ieee754_add_with_bounds_wo_gamma(out vec3 error, in vec3 a, in vec3 a_error, in vec3 b, in vec3 b_error)
{
    const vec3 result = a + b;
    error = (a_error + b_error) / nbl_glsl_numeric_limits_float_epsilon(1u);
    error += abs(result);
    return result;
}
// Returns a - b and writes to `error`:
//   (a_error + b_error) / nbl_glsl_numeric_limits_float_epsilon(1u) + |a - b|.
vec3 nbl_glsl_ieee754_sub_with_bounds_wo_gamma(out vec3 error, in vec3 a, in vec3 a_error, in vec3 b, in vec3 b_error)
{
    const vec3 result = a - b;
    error = (a_error + b_error) / nbl_glsl_numeric_limits_float_epsilon(1u);
    error += abs(result);
    return result;
}
// Returns a * b and writes to `error`:
//   (eA + eB + eA * eB) / nbl_glsl_numeric_limits_float_epsilon(1u) + |a * b|,
// with eA = a_error * |b| and eB = |a| * b_error.
vec3 nbl_glsl_ieee754_mul_with_bounds_wo_gamma(out vec3 error, in vec3 a, in vec3 a_error, in float b, in float b_error)
{
    const vec3 errorThroughB = abs(a) * b_error;
    const vec3 errorThroughA = a_error * abs(b);
    const vec3 result = a * b;
    error = (errorThroughA + errorThroughB + errorThroughA * errorThroughB) / nbl_glsl_numeric_limits_float_epsilon(1u);
    error += abs(result);
    return result;
}
# 43 "../../../../nbl/builtin/glsl/limits/numeric.glsl" 2 | |
// Returns `n` with 24 subtracted from its exponent field (n * 2^-24 for normal
// inputs whose result stays normal; see nbl_glsl_ieee754_fast_mul_exp2).
float nbl_glsl_numeric_limits_float_epsilon(float n)
{
    const int exponentShift = -24;
    return nbl_glsl_ieee754_fast_mul_exp2(n, exponentShift);
}

// int overload: converts `n` to float and forwards to the float overload.
float nbl_glsl_numeric_limits_float_epsilon(int n)
{
    const float nAsFloat = float(n);
    return nbl_glsl_numeric_limits_float_epsilon(nAsFloat);
}

// No-argument overload: the constant 5.96046447754e-08 (approximately 2^-24).
float nbl_glsl_numeric_limits_float_epsilon()
{
    const float epsilon = 5.96046447754e-08;
    return epsilon;
}
# 5 "../../../../nbl/builtin/glsl/scan/virtual_workgroup.glsl" 2 | |
# 1 "../../../../nbl/builtin/glsl/math/typeless_arithmetic.glsl" 1 | |
// Identity overloads: each returns its argument unchanged.
int nbl_glsl_identityFunction(int x) { return x; }
uint nbl_glsl_identityFunction(uint x) { return x; }
float nbl_glsl_identityFunction(float x) { return x; }
// Bitwise AND as a named function, for int and uint.
int nbl_glsl_and(int x, int y) { return x & y; }
uint nbl_glsl_and(uint x, uint y) { return x & y; }
// Bitwise XOR as a named function, for int and uint.
int nbl_glsl_xor(int x, int y) { return x ^ y; }
uint nbl_glsl_xor(uint x, uint y) { return x ^ y; }
// Bitwise OR as a named function, for int and uint.
int nbl_glsl_or(int x, int y) { return x | y; }
uint nbl_glsl_or(uint x, uint y) { return x | y; }
// Addition as a named function, for int, uint and float.
int nbl_glsl_add(int x, int y) { return x + y; }
uint nbl_glsl_add(uint x, uint y) { return x + y; }
float nbl_glsl_add(float x, float y) { return x + y; }
// Multiplication as a named function, for int, uint and float.
int nbl_glsl_mul(int x, int y) { return x * y; }
uint nbl_glsl_mul(uint x, uint y) { return x * y; }
float nbl_glsl_mul(float x, float y) { return x * y; }
# 6 "../../../../nbl/builtin/glsl/scan/virtual_workgroup.glsl" 2 | |
# 1 "../../../../nbl/builtin/glsl/workgroup/arithmetic.glsl" 1 | |
# 1 "../../../../nbl/builtin/glsl/workgroup/shared_arithmetic.glsl" 1 | |
# 1 "../../../../nbl/builtin/glsl/workgroup/shared_clustered.glsl" 1 | |
# 1 "../../../../nbl/builtin/glsl/workgroup/shared_ballot.glsl" 1 | |
# 1 "../../../../nbl/builtin/glsl/workgroup/basic.glsl" 1 | |
# 1 "../../../../nbl/builtin/glsl/subgroup/basic_portability.glsl" 1 | |
# 1 "../../../../nbl/builtin/glsl/macros.glsl" 1 | |
# 7 "../../../../nbl/builtin/glsl/subgroup/basic_portability.glsl" 2 | |
# 99 "../../../../nbl/builtin/glsl/subgroup/basic_portability.glsl" | |
// Execution-barrier hook for the pseudo-subgroup code; in this build the body is
// empty, so calling it is a no-op.
void nbl_glsl_subgroupBarrier() {}
// Forwards to the GLSL built-in memoryBarrier().
void nbl_glsl_subgroupMemoryBarrier() { memoryBarrier(); }
// Forwards to the GLSL built-in memoryBarrierBuffer().
void nbl_glsl_subgroupMemoryBarrierBuffer() { memoryBarrierBuffer(); }
// Forwards to the GLSL built-in memoryBarrierShared().
void nbl_glsl_subgroupMemoryBarrierShared() { memoryBarrierShared(); }
// Forwards to the GLSL built-in memoryBarrierImage().
void nbl_glsl_subgroupMemoryBarrierImage() { memoryBarrierImage(); }
# 7 "../../../../nbl/builtin/glsl/workgroup/basic.glsl" 2 | |
// True only for the invocation whose gl_LocalInvocationIndex is 0.
bool nbl_glsl_workgroupElect()
{
    const bool isFirstInvocation = (gl_LocalInvocationIndex == 0u);
    return isFirstInvocation;
}
# 7 "../../../../nbl/builtin/glsl/workgroup/shared_ballot.glsl" 2 | |
# 1 "../../../../nbl/builtin/glsl/subgroup/shared_arithmetic_portability.glsl" 1 | |
# 8 "../../../../nbl/builtin/glsl/workgroup/shared_ballot.glsl" 2 | |
# 7 "../../../../nbl/builtin/glsl/workgroup/shared_clustered.glsl" 2 | |
# 7 "../../../../nbl/builtin/glsl/workgroup/shared_arithmetic.glsl" 2 | |
# 6 "../../../../nbl/builtin/glsl/workgroup/arithmetic.glsl" 2 | |
# 16 "../../../../nbl/builtin/glsl/workgroup/arithmetic.glsl" | |
// Shared scratch array used by the pseudo-subgroup arithmetic below.
// The size is a constant expression built from 256, (0x1 << 2) and shift steps of 2;
// it evaluates to 510 + 63 + 15 + 3 + 0 + 0 + 5 = 596 uints.
shared uint nbl_glsl_workgroupArithmeticScratchShared[
    // store offset of invocation 256 - 1, plus one: ((255 & -4) << 1 | (255 & 3)) + 2 + 1
    ((((((256 - 1) & (-(0x1 << 2))) << 1) | ((256 - 1) & ((0x1 << 2) - 1))) + ((0x1 << 2) >> 1) + 1)
     // (256 - 1) shifted right by 2, 4, 6, 8 and 10 bits
     + ((256 - 1) >> (2))
     + ((256 - 1) >> (2 * 2))
     + ((256 - 1) >> (2 * 3))
     + ((256 - 1) >> (2 * 4))
     + ((256 - 1) >> (2 * 5))
     + 5)];
# 1 "../../../../nbl/builtin/glsl/workgroup/clustered.glsl" 1 | |
# 21 "../../../../nbl/builtin/glsl/workgroup/clustered.glsl" | |
# 1 "../../../../nbl/builtin/glsl/workgroup/ballot.glsl" 1 | |
# 45 "../../../../nbl/builtin/glsl/workgroup/ballot.glsl" | |
# 1 "../../../../nbl/builtin/glsl/subgroup/arithmetic_portability_impl.glsl" 1 | |
# 14 "../../../../nbl/builtin/glsl/subgroup/arithmetic_portability_impl.glsl" | |
// Clears the `loMask` bits of `invocationIndex`, i.e. returns the index of the first
// invocation in the same loMask-aligned group.
uint nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(in uint loMask, in uint invocationIndex)
{
    const uint hiMask = ~loMask;
    return invocationIndex & hiMask;
}
// Keeps only the `loMask` bits of `invocationIndex`, i.e. the position of the
// invocation inside its loMask-aligned group.
uint nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(in uint loMask, in uint invocationIndex)
{
    const uint positionInGroup = invocationIndex & loMask;
    return positionInGroup;
}
// Returns twice the elected invocation index: the first scratch slot of that
// pseudo-subgroup.
uint nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(in uint pseudoSubgroupElectedInvocation)
{
    return pseudoSubgroupElectedInvocation * 2u;
}
// Computes the scratch slot an invocation stores to.
//   lastLoadOffset (out) = subgroupMemoryStart | pseudoSubgroupInvocation
//   return value         = lastLoadOffset + ((0x1 << 2) >> 1)
uint nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(in uint subgroupMemoryStart, in uint pseudoSubgroupInvocation, out uint lastLoadOffset)
{
    const uint halfSubgroupSize = uint((0x1 << 2) >> 1);
    lastLoadOffset = subgroupMemoryStart | pseudoSubgroupInvocation;
    return lastLoadOffset + halfSubgroupSize;
}

// Same as above for callers that do not need `lastLoadOffset`.
uint nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(in uint subgroupMemoryStart, in uint pseudoSubgroupInvocation)
{
    uint unusedLoadOffset;
    return nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, unusedLoadOffset);
}
// Convenience entry point: derives the elected invocation, the position inside the
// pseudo-subgroup and the subgroup's memory start from (loMask, invocationIndex),
// then returns the corresponding scratch store offset.
uint nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(in uint loMask, in uint invocationIndex)
{
    const uint elected = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, invocationIndex);
    const uint lane = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, invocationIndex);
    const uint memoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(elected);
    return nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(memoryStart, lane);
}
# 116 "../../../../nbl/builtin/glsl/subgroup/arithmetic_portability_impl.glsl" | |
// Bitwise-AND reduction over a pseudo-subgroup of (0x1 << 2) consecutive invocations,
// emulated through nbl_glsl_workgroupArithmeticScratchShared.
//
// Each invocation ANDs its value with the scratch entry 1 slot below its own store
// slot, then (for strides 2, 4, ... below half the subgroup size) `stride` slots
// below, then half a subgroup below; it stores the result and finally returns the
// entry stored by the last invocation of its pseudo-subgroup.
//
// When `clearScratchToIdentity` is true the function first writes `value` to its
// store slot and seeds the slots below the subgroup's data with 0xffFFffFF.
// When it is false nothing is written before the first read.
// NOTE(review): presumably the caller has pre-populated scratch in that case - confirm.
// The barrier / memory-barrier pairs are kept in the exact original order.
uint nbl_glsl_subgroupAnd_impl(in bool clearScratchToIdentity, uint value)
{
    const uint halfSubgroupSize = uint((0x1 << 2) >> 1);
    const uint loMask = (0x1 << 2) - 1u;
    const uint elected = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex);
    const uint lane = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex);
    const uint memoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(elected);
    // Placeholder only: overwritten through the out parameter of the next call
    // with storeOffset - halfSubgroupSize.
    uint halfStrideLoadOffset = 0xdeadbeefu;
    const uint storeOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(memoryStart, lane, halfStrideLoadOffset);

    if (clearScratchToIdentity)
    {
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
        // The lower half of the lanes also writes the AND identity below the data slots.
        if (lane < halfSubgroupSize)
            nbl_glsl_workgroupArithmeticScratchShared[halfStrideLoadOffset] = 0xffFFffFFu;
    }

    // Stride 1.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    value &= nbl_glsl_workgroupArithmeticScratchShared[storeOffset - 1u];

    // Strides 2, 4, ... strictly below half the subgroup size.
    for (uint stride = 2u; stride < halfSubgroupSize; stride <<= 1u)
    {
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        value &= nbl_glsl_workgroupArithmeticScratchShared[storeOffset - stride];
    }

    // Final stride: half the subgroup size.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    value &= nbl_glsl_workgroupArithmeticScratchShared[halfStrideLoadOffset];

    // Publish the per-lane result so the last lane's entry can be read by everyone.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();

    // In the pseudo-subgroup containing invocation 256 - 1 the last lane is clamped to it.
    uint lastLane = loMask;
    if (elected == nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256u - 1u))
        lastLane &= 256u - 1u;
    const uint reduced = nbl_glsl_workgroupArithmeticScratchShared[nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(memoryStart, lastLane)];
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    return reduced;
}

// int overload: reinterprets through uint and forwards to the uint implementation.
int nbl_glsl_subgroupAnd_impl(in bool clearScratchToIdentity, int value)
{
    const uint reduced = nbl_glsl_subgroupAnd_impl(clearScratchToIdentity, uint(value));
    return int(reduced);
}

// float overload: ANDs the raw bit patterns via the uint implementation.
float nbl_glsl_subgroupAnd_impl(in bool clearScratchToIdentity, float value)
{
    const uint reduced = nbl_glsl_subgroupAnd_impl(clearScratchToIdentity, floatBitsToUint(value));
    return uintBitsToFloat(reduced);
}
// Bitwise-XOR reduction over a pseudo-subgroup of (0x1 << 2) consecutive invocations,
// emulated through nbl_glsl_workgroupArithmeticScratchShared.
//
// Each invocation XORs its value with the scratch entry 1 slot below its own store
// slot, then (for strides 2, 4, ... below half the subgroup size) `stride` slots
// below, then half a subgroup below; it stores the result and finally returns the
// entry stored by the last invocation of its pseudo-subgroup.
//
// When `clearScratchToIdentity` is true the function first writes `value` to its
// store slot and seeds the slots below the subgroup's data with 0.
// When it is false nothing is written before the first read.
// NOTE(review): presumably the caller has pre-populated scratch in that case - confirm.
// The barrier / memory-barrier pairs are kept in the exact original order.
uint nbl_glsl_subgroupXor_impl(in bool clearScratchToIdentity, uint value)
{
    const uint halfSubgroupSize = uint((0x1 << 2) >> 1);
    const uint loMask = (0x1 << 2) - 1u;
    const uint elected = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex);
    const uint lane = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex);
    const uint memoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(elected);
    // Placeholder only: overwritten through the out parameter of the next call
    // with storeOffset - halfSubgroupSize.
    uint halfStrideLoadOffset = 0xdeadbeefu;
    const uint storeOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(memoryStart, lane, halfStrideLoadOffset);

    if (clearScratchToIdentity)
    {
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
        // The lower half of the lanes also writes the XOR identity below the data slots.
        if (lane < halfSubgroupSize)
            nbl_glsl_workgroupArithmeticScratchShared[halfStrideLoadOffset] = 0u;
    }

    // Stride 1.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    value ^= nbl_glsl_workgroupArithmeticScratchShared[storeOffset - 1u];

    // Strides 2, 4, ... strictly below half the subgroup size.
    for (uint stride = 2u; stride < halfSubgroupSize; stride <<= 1u)
    {
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        value ^= nbl_glsl_workgroupArithmeticScratchShared[storeOffset - stride];
    }

    // Final stride: half the subgroup size.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    value ^= nbl_glsl_workgroupArithmeticScratchShared[halfStrideLoadOffset];

    // Publish the per-lane result so the last lane's entry can be read by everyone.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();

    // In the pseudo-subgroup containing invocation 256 - 1 the last lane is clamped to it.
    uint lastLane = loMask;
    if (elected == nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256u - 1u))
        lastLane &= 256u - 1u;
    const uint reduced = nbl_glsl_workgroupArithmeticScratchShared[nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(memoryStart, lastLane)];
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    return reduced;
}

// int overload: reinterprets through uint and forwards to the uint implementation.
int nbl_glsl_subgroupXor_impl(in bool clearScratchToIdentity, int value)
{
    const uint reduced = nbl_glsl_subgroupXor_impl(clearScratchToIdentity, uint(value));
    return int(reduced);
}

// float overload: XORs the raw bit patterns via the uint implementation.
float nbl_glsl_subgroupXor_impl(in bool clearScratchToIdentity, float value)
{
    const uint reduced = nbl_glsl_subgroupXor_impl(clearScratchToIdentity, floatBitsToUint(value));
    return uintBitsToFloat(reduced);
}
// Bitwise-OR reduction over a pseudo-subgroup of (0x1 << 2) consecutive invocations,
// emulated through nbl_glsl_workgroupArithmeticScratchShared.
//
// Each invocation ORs its value with the scratch entry 1 slot below its own store
// slot, then (for strides 2, 4, ... below half the subgroup size) `stride` slots
// below, then half a subgroup below; it stores the result and finally returns the
// entry stored by the last invocation of its pseudo-subgroup.
//
// When `clearScratchToIdentity` is true the function first writes `value` to its
// store slot and seeds the slots below the subgroup's data with 0.
// When it is false nothing is written before the first read.
// NOTE(review): presumably the caller has pre-populated scratch in that case - confirm.
// The barrier / memory-barrier pairs are kept in the exact original order.
uint nbl_glsl_subgroupOr_impl(in bool clearScratchToIdentity, uint value)
{
    const uint halfSubgroupSize = uint((0x1 << 2) >> 1);
    const uint loMask = (0x1 << 2) - 1u;
    const uint elected = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex);
    const uint lane = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex);
    const uint memoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(elected);
    // Placeholder only: overwritten through the out parameter of the next call
    // with storeOffset - halfSubgroupSize.
    uint halfStrideLoadOffset = 0xdeadbeefu;
    const uint storeOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(memoryStart, lane, halfStrideLoadOffset);

    if (clearScratchToIdentity)
    {
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
        // The lower half of the lanes also writes the OR identity below the data slots.
        if (lane < halfSubgroupSize)
            nbl_glsl_workgroupArithmeticScratchShared[halfStrideLoadOffset] = 0u;
    }

    // Stride 1.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    value |= nbl_glsl_workgroupArithmeticScratchShared[storeOffset - 1u];

    // Strides 2, 4, ... strictly below half the subgroup size.
    for (uint stride = 2u; stride < halfSubgroupSize; stride <<= 1u)
    {
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        value |= nbl_glsl_workgroupArithmeticScratchShared[storeOffset - stride];
    }

    // Final stride: half the subgroup size.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    value |= nbl_glsl_workgroupArithmeticScratchShared[halfStrideLoadOffset];

    // Publish the per-lane result so the last lane's entry can be read by everyone.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();

    // In the pseudo-subgroup containing invocation 256 - 1 the last lane is clamped to it.
    uint lastLane = loMask;
    if (elected == nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256u - 1u))
        lastLane &= 256u - 1u;
    const uint reduced = nbl_glsl_workgroupArithmeticScratchShared[nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(memoryStart, lastLane)];
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    return reduced;
}

// int overload: reinterprets through uint and forwards to the uint implementation.
int nbl_glsl_subgroupOr_impl(in bool clearScratchToIdentity, int value)
{
    const uint reduced = nbl_glsl_subgroupOr_impl(clearScratchToIdentity, uint(value));
    return int(reduced);
}

// float overload: ORs the raw bit patterns via the uint implementation.
float nbl_glsl_subgroupOr_impl(in bool clearScratchToIdentity, float value)
{
    const uint reduced = nbl_glsl_subgroupOr_impl(clearScratchToIdentity, floatBitsToUint(value));
    return uintBitsToFloat(reduced);
}
// Integer sum reduction over a pseudo-subgroup of (0x1 << 2) consecutive invocations,
// emulated through nbl_glsl_workgroupArithmeticScratchShared.
//
// Each invocation adds to its value the scratch entry 1 slot below its own store
// slot, then (for strides 2, 4, ... below half the subgroup size) `stride` slots
// below, then half a subgroup below; it stores the result and finally returns the
// entry stored by the last invocation of its pseudo-subgroup.
//
// When `clearScratchToIdentity` is true the function first writes `value` to its
// store slot and seeds the slots below the subgroup's data with 0.
// When it is false nothing is written before the first read.
// NOTE(review): presumably the caller has pre-populated scratch in that case - confirm.
// The barrier / memory-barrier pairs are kept in the exact original order.
uint nbl_glsl_subgroupAdd_impl(in bool clearScratchToIdentity, uint value)
{
    const uint halfSubgroupSize = uint((0x1 << 2) >> 1);
    const uint loMask = (0x1 << 2) - 1u;
    const uint elected = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex);
    const uint lane = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex);
    const uint memoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(elected);
    // Placeholder only: overwritten through the out parameter of the next call
    // with storeOffset - halfSubgroupSize.
    uint halfStrideLoadOffset = 0xdeadbeefu;
    const uint storeOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(memoryStart, lane, halfStrideLoadOffset);

    if (clearScratchToIdentity)
    {
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
        // The lower half of the lanes also writes the additive identity below the data slots.
        if (lane < halfSubgroupSize)
            nbl_glsl_workgroupArithmeticScratchShared[halfStrideLoadOffset] = 0u;
    }

    // Stride 1.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    value += nbl_glsl_workgroupArithmeticScratchShared[storeOffset - 1u];

    // Strides 2, 4, ... strictly below half the subgroup size.
    for (uint stride = 2u; stride < halfSubgroupSize; stride <<= 1u)
    {
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        value += nbl_glsl_workgroupArithmeticScratchShared[storeOffset - stride];
    }

    // Final stride: half the subgroup size.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    value += nbl_glsl_workgroupArithmeticScratchShared[halfStrideLoadOffset];

    // Publish the per-lane result so the last lane's entry can be read by everyone.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = value;
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();

    // In the pseudo-subgroup containing invocation 256 - 1 the last lane is clamped to it.
    uint lastLane = loMask;
    if (elected == nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256u - 1u))
        lastLane &= 256u - 1u;
    const uint reduced = nbl_glsl_workgroupArithmeticScratchShared[nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(memoryStart, lastLane)];
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    return reduced;
}

// int overload: reinterprets through uint and forwards to the uint implementation.
int nbl_glsl_subgroupAdd_impl(in bool clearScratchToIdentity, int value)
{
    const uint reduced = nbl_glsl_subgroupAdd_impl(clearScratchToIdentity, uint(value));
    return int(reduced);
}
// Floating-point sum reduction over a pseudo-subgroup of (0x1 << 2) consecutive
// invocations. Values travel through the uint scratch array
// nbl_glsl_workgroupArithmeticScratchShared as raw bit patterns
// (floatBitsToUint on store, uintBitsToFloat on load).
//
// Each invocation adds to its value the scratch entry 1 slot below its own store
// slot, then (for strides 2, 4, ... below half the subgroup size) `stride` slots
// below, then half a subgroup below; it stores the result and finally returns the
// entry stored by the last invocation of its pseudo-subgroup.
//
// When `clearScratchToIdentity` is true the function first writes `value` to its
// store slot and seeds the slots below the subgroup's data with 0.0.
// When it is false nothing is written before the first read.
// NOTE(review): presumably the caller has pre-populated scratch in that case - confirm.
// The barrier / memory-barrier pairs are kept in the exact original order.
float nbl_glsl_subgroupAdd_impl(in bool clearScratchToIdentity, float value)
{
    const uint halfSubgroupSize = uint((0x1 << 2) >> 1);
    const uint loMask = (0x1 << 2) - 1u;
    const uint elected = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex);
    const uint lane = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex);
    const uint memoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(elected);
    // Placeholder only: overwritten through the out parameter of the next call
    // with storeOffset - halfSubgroupSize.
    uint halfStrideLoadOffset = 0xdeadbeefu;
    const uint storeOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(memoryStart, lane, halfStrideLoadOffset);

    if (clearScratchToIdentity)
    {
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = floatBitsToUint(value);
        // The lower half of the lanes also writes the additive identity below the data slots.
        if (lane < halfSubgroupSize)
            nbl_glsl_workgroupArithmeticScratchShared[halfStrideLoadOffset] = floatBitsToUint(0.0);
    }

    // Stride 1.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    value += uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[storeOffset - 1u]);

    // Strides 2, 4, ... strictly below half the subgroup size.
    for (uint stride = 2u; stride < halfSubgroupSize; stride <<= 1u)
    {
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = floatBitsToUint(value);
        nbl_glsl_subgroupBarrier();
        nbl_glsl_subgroupMemoryBarrierShared();
        value += uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[storeOffset - stride]);
    }

    // Final stride: half the subgroup size.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = floatBitsToUint(value);
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    value += uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[halfStrideLoadOffset]);

    // Publish the per-lane result so the last lane's entry can be read by everyone.
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    nbl_glsl_workgroupArithmeticScratchShared[storeOffset] = floatBitsToUint(value);
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();

    // In the pseudo-subgroup containing invocation 256 - 1 the last lane is clamped to it.
    uint lastLane = loMask;
    if (elected == nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256u - 1u))
        lastLane &= 256u - 1u;
    const uint reducedBits = nbl_glsl_workgroupArithmeticScratchShared[nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(memoryStart, lastLane)];
    nbl_glsl_subgroupBarrier();
    nbl_glsl_subgroupMemoryBarrierShared();
    return uintBitsToFloat(reducedBits);
}
uint nbl_glsl_subgroupMul_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(1u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_mul(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_mul(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_mul(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
uint lastSubgroupInvocation = loMask; | |
if (pseudoSubgroupElectedInvocation == nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u)) | |
lastSubgroupInvocation &= 256 - 1u; | |
const uint lastItem = nbl_glsl_workgroupArithmeticScratchShared[nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, lastSubgroupInvocation)]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return nbl_glsl_identityFunction(lastItem); | |
; | |
} | |
// Signed-integer overload of the subgroup multiply reduction.
// Converts the operand to uint, forwards to the uint overload with the same
// clearScratchToIdentity flag, and converts the result back to int.
int nbl_glsl_subgroupMul_impl(in bool clearScratchToIdentity, int value)
{
    const uint unsignedOperand = uint(value);
    const uint unsignedProduct = nbl_glsl_subgroupMul_impl(clearScratchToIdentity, unsignedOperand);
    return int(unsignedProduct);
}
// Emulated subgroup multiply reduction for float operands, staged through
// nbl_glsl_workgroupArithmeticScratchShared (floats are stored as their uint bit patterns).
//   clearScratchToIdentity - when true, this call first writes `value` to the invocation's
//                            scratch slot and writes the identity 1.0 at `lastLoadOffset` for
//                            invocations whose pseudoSubgroupInvocation is below 2. When false
//                            that seeding is skipped (presumably the caller already prepared
//                            the scratch - TODO confirm against callers).
//   value                  - this invocation's operand.
// Returns the scratch entry stored for the pseudo-subgroup's last invocation, as a float.
float nbl_glsl_subgroupMul_impl(in bool clearScratchToIdentity, float value) | |
{ | |
// loMask evaluates to 3u (presumably an emulated subgroup size of 4 - TODO confirm).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Placeholder; presumably overwritten by the helper below through an out parameter (helper not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
// Seed the slot at lastLoadOffset with the multiplicative identity.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = floatBitsToUint(1.0); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine step: multiply with the entry one slot below this invocation's slot.
value = nbl_glsl_mul(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constants of this expansion the bound ((0x1 << 2) >> 1) is 2, so this loop body does not execute.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_mul(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then combine with the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_mul(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Publish the final per-invocation result so it can be read back below.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Index of the invocation whose slot is returned. When this invocation's elected invocation matches
// the one computed for index 255, the index is masked with 255 (which leaves 3 unchanged here).
uint lastSubgroupInvocation = loMask; | |
if (pseudoSubgroupElectedInvocation == nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u)) | |
lastSubgroupInvocation &= 256 - 1u; | |
const uint lastItem = nbl_glsl_workgroupArithmeticScratchShared[nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, lastSubgroupInvocation)]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return uintBitsToFloat(lastItem); | |
; | |
} | |
// Emulated subgroup minimum reduction for uint operands, staged through
// nbl_glsl_workgroupArithmeticScratchShared.
//   clearScratchToIdentity - when true, this call first writes `value` to the invocation's
//                            scratch slot and writes the identity 4294967295u (largest uint) at
//                            `lastLoadOffset` for invocations whose pseudoSubgroupInvocation is
//                            below 2. When false that seeding is skipped (presumably the caller
//                            already prepared the scratch - TODO confirm against callers).
//   value                  - this invocation's operand.
// Returns the scratch entry stored for the pseudo-subgroup's last invocation.
// NOTE(review): nbl_glsl_identityFunction is not visible here; presumably it returns its argument unchanged.
uint nbl_glsl_subgroupMin_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// loMask evaluates to 3u (presumably an emulated subgroup size of 4 - TODO confirm).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Placeholder; presumably overwritten by the helper below through an out parameter (helper not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
// Seed the slot at lastLoadOffset with the identity of min for uint.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(4294967295u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine step: take the min with the entry one slot below this invocation's slot.
value = min(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constants of this expansion the bound ((0x1 << 2) >> 1) is 2, so this loop body does not execute.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then combine with the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Publish the final per-invocation result so it can be read back below.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Index of the invocation whose slot is returned. When this invocation's elected invocation matches
// the one computed for index 255, the index is masked with 255 (which leaves 3 unchanged here).
uint lastSubgroupInvocation = loMask; | |
if (pseudoSubgroupElectedInvocation == nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u)) | |
lastSubgroupInvocation &= 256 - 1u; | |
const uint lastItem = nbl_glsl_workgroupArithmeticScratchShared[nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, lastSubgroupInvocation)]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return nbl_glsl_identityFunction(lastItem); | |
; | |
} | |
// Emulated subgroup minimum reduction for int operands, staged through
// nbl_glsl_workgroupArithmeticScratchShared (ints are stored via uint(...) and read back via int(...)).
//   clearScratchToIdentity - when true, this call first writes `value` to the invocation's
//                            scratch slot and writes the identity 2147483647 (largest int) at
//                            `lastLoadOffset` for invocations whose pseudoSubgroupInvocation is
//                            below 2. When false that seeding is skipped (presumably the caller
//                            already prepared the scratch - TODO confirm against callers).
//   value                  - this invocation's operand.
// Returns the scratch entry stored for the pseudo-subgroup's last invocation, converted to int.
int nbl_glsl_subgroupMin_impl(in bool clearScratchToIdentity, int value) | |
{ | |
// loMask evaluates to 3u (presumably an emulated subgroup size of 4 - TODO confirm).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Placeholder; presumably overwritten by the helper below through an out parameter (helper not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
// Seed the slot at lastLoadOffset with the identity of min for int.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = uint(2147483647); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine step: take the min with the entry one slot below this invocation's slot.
value = min(value, int(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constants of this expansion the bound ((0x1 << 2) >> 1) is 2, so this loop body does not execute.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, int(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then combine with the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, int(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Publish the final per-invocation result so it can be read back below.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Index of the invocation whose slot is returned. When this invocation's elected invocation matches
// the one computed for index 255, the index is masked with 255 (which leaves 3 unchanged here).
uint lastSubgroupInvocation = loMask; | |
if (pseudoSubgroupElectedInvocation == nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u)) | |
lastSubgroupInvocation &= 256 - 1u; | |
const uint lastItem = nbl_glsl_workgroupArithmeticScratchShared[nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, lastSubgroupInvocation)]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return int(lastItem); | |
; | |
} | |
// Emulated subgroup minimum reduction for float operands, staged through
// nbl_glsl_workgroupArithmeticScratchShared (floats are stored as their uint bit patterns).
//   clearScratchToIdentity - when true, this call first writes `value` to the invocation's
//                            scratch slot and writes the identity (1.f / 0.f) at `lastLoadOffset`
//                            for invocations whose pseudoSubgroupInvocation is below 2. When false
//                            that seeding is skipped (presumably the caller already prepared the
//                            scratch - TODO confirm against callers).
//   value                  - this invocation's operand.
// Returns the scratch entry stored for the pseudo-subgroup's last invocation, as a float.
float nbl_glsl_subgroupMin_impl(in bool clearScratchToIdentity, float value) | |
{ | |
// loMask evaluates to 3u (presumably an emulated subgroup size of 4 - TODO confirm).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Placeholder; presumably overwritten by the helper below through an out parameter (helper not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
// Seed the slot at lastLoadOffset with (1.f / 0.f), intended as +infinity
// (assumes the compiler folds the division to an IEEE infinity - TODO confirm).
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = floatBitsToUint((1.f / 0.f)); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine step: take the min with the entry one slot below this invocation's slot.
value = min(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constants of this expansion the bound ((0x1 << 2) >> 1) is 2, so this loop body does not execute.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then combine with the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Publish the final per-invocation result so it can be read back below.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Index of the invocation whose slot is returned. When this invocation's elected invocation matches
// the one computed for index 255, the index is masked with 255 (which leaves 3 unchanged here).
uint lastSubgroupInvocation = loMask; | |
if (pseudoSubgroupElectedInvocation == nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u)) | |
lastSubgroupInvocation &= 256 - 1u; | |
const uint lastItem = nbl_glsl_workgroupArithmeticScratchShared[nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, lastSubgroupInvocation)]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return uintBitsToFloat(lastItem); | |
; | |
} | |
// Emulated subgroup maximum reduction for uint operands, staged through
// nbl_glsl_workgroupArithmeticScratchShared.
//   clearScratchToIdentity - when true, this call first writes `value` to the invocation's
//                            scratch slot and writes the identity 0u at `lastLoadOffset` for
//                            invocations whose pseudoSubgroupInvocation is below 2. When false
//                            that seeding is skipped (presumably the caller already prepared
//                            the scratch - TODO confirm against callers).
//   value                  - this invocation's operand.
// Returns the scratch entry stored for the pseudo-subgroup's last invocation.
// NOTE(review): nbl_glsl_identityFunction is not visible here; presumably it returns its argument unchanged.
uint nbl_glsl_subgroupMax_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// loMask evaluates to 3u (presumably an emulated subgroup size of 4 - TODO confirm).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Placeholder; presumably overwritten by the helper below through an out parameter (helper not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
// Seed the slot at lastLoadOffset with the identity of max for uint.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(0u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine step: take the max with the entry one slot below this invocation's slot.
value = max(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constants of this expansion the bound ((0x1 << 2) >> 1) is 2, so this loop body does not execute.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then combine with the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Publish the final per-invocation result so it can be read back below.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Index of the invocation whose slot is returned. When this invocation's elected invocation matches
// the one computed for index 255, the index is masked with 255 (which leaves 3 unchanged here).
uint lastSubgroupInvocation = loMask; | |
if (pseudoSubgroupElectedInvocation == nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u)) | |
lastSubgroupInvocation &= 256 - 1u; | |
const uint lastItem = nbl_glsl_workgroupArithmeticScratchShared[nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, lastSubgroupInvocation)]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return nbl_glsl_identityFunction(lastItem); | |
; | |
} | |
// Emulated subgroup maximum reduction for int operands, staged through
// nbl_glsl_workgroupArithmeticScratchShared (ints are stored via uint(...) and read back via int(...)).
//   clearScratchToIdentity - when true, this call first writes `value` to the invocation's
//                            scratch slot and writes the identity -2147483648 (smallest int) at
//                            `lastLoadOffset` for invocations whose pseudoSubgroupInvocation is
//                            below 2. When false that seeding is skipped (presumably the caller
//                            already prepared the scratch - TODO confirm against callers).
//   value                  - this invocation's operand.
// Returns the scratch entry stored for the pseudo-subgroup's last invocation, converted to int.
int nbl_glsl_subgroupMax_impl(in bool clearScratchToIdentity, int value) | |
{ | |
// loMask evaluates to 3u (presumably an emulated subgroup size of 4 - TODO confirm).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Placeholder; presumably overwritten by the helper below through an out parameter (helper not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
// Seed the slot at lastLoadOffset with the identity of max for int.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = uint(-2147483648); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine step: take the max with the entry one slot below this invocation's slot.
value = max(value, int(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constants of this expansion the bound ((0x1 << 2) >> 1) is 2, so this loop body does not execute.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, int(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then combine with the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, int(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Publish the final per-invocation result so it can be read back below.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Index of the invocation whose slot is returned. When this invocation's elected invocation matches
// the one computed for index 255, the index is masked with 255 (which leaves 3 unchanged here).
uint lastSubgroupInvocation = loMask; | |
if (pseudoSubgroupElectedInvocation == nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u)) | |
lastSubgroupInvocation &= 256 - 1u; | |
const uint lastItem = nbl_glsl_workgroupArithmeticScratchShared[nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, lastSubgroupInvocation)]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return int(lastItem); | |
; | |
} | |
// Emulated subgroup maximum reduction for float operands, staged through
// nbl_glsl_workgroupArithmeticScratchShared (floats are stored as their uint bit patterns).
//   clearScratchToIdentity - when true, this call first writes `value` to the invocation's
//                            scratch slot and writes the identity -(1.f / 0.f) at `lastLoadOffset`
//                            for invocations whose pseudoSubgroupInvocation is below 2. When false
//                            that seeding is skipped (presumably the caller already prepared the
//                            scratch - TODO confirm against callers).
//   value                  - this invocation's operand.
// Returns the scratch entry stored for the pseudo-subgroup's last invocation, as a float.
float nbl_glsl_subgroupMax_impl(in bool clearScratchToIdentity, float value) | |
{ | |
// loMask evaluates to 3u (presumably an emulated subgroup size of 4 - TODO confirm).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Placeholder; presumably overwritten by the helper below through an out parameter (helper not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
// Seed the slot at lastLoadOffset with -(1.f / 0.f), intended as -infinity
// (assumes the compiler folds the division to an IEEE infinity - TODO confirm).
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = floatBitsToUint(-(1.f / 0.f)); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine step: take the max with the entry one slot below this invocation's slot.
value = max(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constants of this expansion the bound ((0x1 << 2) >> 1) is 2, so this loop body does not execute.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then combine with the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Publish the final per-invocation result so it can be read back below.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Index of the invocation whose slot is returned. When this invocation's elected invocation matches
// the one computed for index 255, the index is masked with 255 (which leaves 3 unchanged here).
uint lastSubgroupInvocation = loMask; | |
if (pseudoSubgroupElectedInvocation == nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u)) | |
lastSubgroupInvocation &= 256 - 1u; | |
const uint lastItem = nbl_glsl_workgroupArithmeticScratchShared[nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, lastSubgroupInvocation)]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return uintBitsToFloat(lastItem); | |
; | |
} | |
# 224 "../../../../nbl/builtin/glsl/subgroup/arithmetic_portability_impl.glsl" | |
// Emulated subgroup inclusive bitwise-AND scan for uint operands, staged through
// nbl_glsl_workgroupArithmeticScratchShared.
//   clearScratchToIdentity - when true, this call first writes `value` to the invocation's
//                            scratch slot and writes the identity 0xffFFffFFu at `lastLoadOffset`
//                            for invocations whose pseudoSubgroupInvocation is below 2. When false
//                            that seeding is skipped (presumably the caller already prepared the
//                            scratch - TODO confirm against callers).
//   value                  - this invocation's operand.
// Returns this invocation's `value` after it has been combined with the scratch entries read below.
// NOTE(review): nbl_glsl_identityFunction is not visible here; presumably it returns its argument unchanged.
uint nbl_glsl_subgroupInclusiveAnd_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// loMask evaluates to 3u (presumably an emulated subgroup size of 4 - TODO confirm).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Placeholder; presumably overwritten by the helper below through an out parameter (helper not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
// Seed the slot at lastLoadOffset with the all-ones identity of bitwise AND.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(0xffFFffFFu); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine step: AND with the entry one slot below this invocation's slot.
value = nbl_glsl_and(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constants of this expansion the bound ((0x1 << 2) >> 1) is 2, so this loop body does not execute.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_and(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then combine with the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_and(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return value; | |
} | |
// Signed-integer overload of the inclusive bitwise-AND scan.
// Converts the operand to uint, forwards to the uint overload with the same
// clearScratchToIdentity flag, and converts the result back to int.
int nbl_glsl_subgroupInclusiveAnd_impl(in bool clearScratchToIdentity, int value)
{
    const uint unsignedOperand = uint(value);
    const uint unsignedScan = nbl_glsl_subgroupInclusiveAnd_impl(clearScratchToIdentity, unsignedOperand);
    return int(unsignedScan);
}
// Float overload of the inclusive bitwise-AND scan.
// Reinterprets the operand's bits as uint, forwards to the uint overload with the
// same clearScratchToIdentity flag, and reinterprets the resulting bits as float.
float nbl_glsl_subgroupInclusiveAnd_impl(in bool clearScratchToIdentity, float value)
{
    const uint operandBits = floatBitsToUint(value);
    const uint scannedBits = nbl_glsl_subgroupInclusiveAnd_impl(clearScratchToIdentity, operandBits);
    return uintBitsToFloat(scannedBits);
}
// Emulated subgroup exclusive bitwise-AND scan for uint operands, staged through
// nbl_glsl_workgroupArithmeticScratchShared.
//   clearScratchToIdentity - when true, this call first writes `value` to the invocation's
//                            scratch slot and writes the identity 0xffFFffFFu at `lastLoadOffset`
//                            for invocations whose pseudoSubgroupInvocation is below 2. When false
//                            that seeding is skipped (presumably the caller already prepared the
//                            scratch - TODO confirm against callers).
//   value                  - this invocation's operand.
// Returns the scratch entry one slot below this invocation's own slot, read after every
// invocation has stored its combined value.
// NOTE(review): nbl_glsl_identityFunction is not visible here; presumably it returns its argument unchanged.
uint nbl_glsl_subgroupExclusiveAnd_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// loMask evaluates to 3u (presumably an emulated subgroup size of 4 - TODO confirm).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Placeholder; presumably overwritten by the helper below through an out parameter (helper not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
// Seed the slot at lastLoadOffset with the all-ones identity of bitwise AND.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(0xffFFffFFu); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine step: AND with the entry one slot below this invocation's slot.
value = nbl_glsl_and(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constants of this expansion the bound ((0x1 << 2) >> 1) is 2, so this loop body does not execute.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_and(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then combine with the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_and(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Publish the combined value, then read the neighbouring slot to form the exclusive result.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
const uint prevItem = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return nbl_glsl_identityFunction(prevItem); | |
; | |
} | |
// Signed-integer overload of the exclusive bitwise-AND scan.
// Converts the operand to uint, forwards to the uint overload with the same
// clearScratchToIdentity flag, and converts the result back to int.
int nbl_glsl_subgroupExclusiveAnd_impl(in bool clearScratchToIdentity, int value)
{
    const uint unsignedOperand = uint(value);
    const uint unsignedScan = nbl_glsl_subgroupExclusiveAnd_impl(clearScratchToIdentity, unsignedOperand);
    return int(unsignedScan);
}
// Float overload of the exclusive bitwise-AND scan.
// Reinterprets the operand's bits as uint, forwards to the uint overload with the
// same clearScratchToIdentity flag, and reinterprets the resulting bits as float.
float nbl_glsl_subgroupExclusiveAnd_impl(in bool clearScratchToIdentity, float value)
{
    const uint operandBits = floatBitsToUint(value);
    const uint scannedBits = nbl_glsl_subgroupExclusiveAnd_impl(clearScratchToIdentity, operandBits);
    return uintBitsToFloat(scannedBits);
}
// Emulated subgroup inclusive bitwise-XOR scan for uint operands, staged through
// nbl_glsl_workgroupArithmeticScratchShared.
//   clearScratchToIdentity - when true, this call first writes `value` to the invocation's
//                            scratch slot and writes the identity 0u at `lastLoadOffset` for
//                            invocations whose pseudoSubgroupInvocation is below 2. When false
//                            that seeding is skipped (presumably the caller already prepared
//                            the scratch - TODO confirm against callers).
//   value                  - this invocation's operand.
// Returns this invocation's `value` after it has been combined with the scratch entries read below.
// NOTE(review): nbl_glsl_identityFunction is not visible here; presumably it returns its argument unchanged.
uint nbl_glsl_subgroupInclusiveXor_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// loMask evaluates to 3u (presumably an emulated subgroup size of 4 - TODO confirm).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Placeholder; presumably overwritten by the helper below through an out parameter (helper not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
// Seed the slot at lastLoadOffset with the zero identity of bitwise XOR.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(0u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine step: XOR with the entry one slot below this invocation's slot.
value = nbl_glsl_xor(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constants of this expansion the bound ((0x1 << 2) >> 1) is 2, so this loop body does not execute.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_xor(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then combine with the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_xor(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return value; | |
} | |
// Signed-integer overload of the inclusive bitwise-XOR scan.
// Converts the operand to uint, forwards to the uint overload with the same
// clearScratchToIdentity flag, and converts the result back to int.
int nbl_glsl_subgroupInclusiveXor_impl(in bool clearScratchToIdentity, int value)
{
    const uint unsignedOperand = uint(value);
    const uint unsignedScan = nbl_glsl_subgroupInclusiveXor_impl(clearScratchToIdentity, unsignedOperand);
    return int(unsignedScan);
}
// Float overload of the inclusive bitwise-XOR scan.
// Reinterprets the operand's bits as uint, forwards to the uint overload with the
// same clearScratchToIdentity flag, and reinterprets the resulting bits as float.
float nbl_glsl_subgroupInclusiveXor_impl(in bool clearScratchToIdentity, float value)
{
    const uint operandBits = floatBitsToUint(value);
    const uint scannedBits = nbl_glsl_subgroupInclusiveXor_impl(clearScratchToIdentity, operandBits);
    return uintBitsToFloat(scannedBits);
}
// Emulated subgroup exclusive bitwise-XOR scan for uint operands, staged through
// nbl_glsl_workgroupArithmeticScratchShared.
//   clearScratchToIdentity - when true, this call first writes `value` to the invocation's
//                            scratch slot and writes the identity 0u at `lastLoadOffset` for
//                            invocations whose pseudoSubgroupInvocation is below 2. When false
//                            that seeding is skipped (presumably the caller already prepared
//                            the scratch - TODO confirm against callers).
//   value                  - this invocation's operand.
// Returns the scratch entry one slot below this invocation's own slot, read after every
// invocation has stored its combined value.
// NOTE(review): nbl_glsl_identityFunction is not visible here; presumably it returns its argument unchanged.
uint nbl_glsl_subgroupExclusiveXor_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// loMask evaluates to 3u (presumably an emulated subgroup size of 4 - TODO confirm).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Placeholder; presumably overwritten by the helper below through an out parameter (helper not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
// Seed the slot at lastLoadOffset with the zero identity of bitwise XOR.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(0u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine step: XOR with the entry one slot below this invocation's slot.
value = nbl_glsl_xor(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constants of this expansion the bound ((0x1 << 2) >> 1) is 2, so this loop body does not execute.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_xor(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then combine with the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_xor(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Publish the combined value, then read the neighbouring slot to form the exclusive result.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
const uint prevItem = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return nbl_glsl_identityFunction(prevItem); | |
; | |
} | |
// Signed-integer overload of the exclusive bitwise-XOR scan.
// Converts the operand to uint, forwards to the uint overload with the same
// clearScratchToIdentity flag, and converts the result back to int.
int nbl_glsl_subgroupExclusiveXor_impl(in bool clearScratchToIdentity, int value)
{
    const uint unsignedOperand = uint(value);
    const uint unsignedScan = nbl_glsl_subgroupExclusiveXor_impl(clearScratchToIdentity, unsignedOperand);
    return int(unsignedScan);
}
// Float overload of the exclusive bitwise-XOR scan.
// Reinterprets the operand's bits as uint, forwards to the uint overload with the
// same clearScratchToIdentity flag, and reinterprets the resulting bits as float.
float nbl_glsl_subgroupExclusiveXor_impl(in bool clearScratchToIdentity, float value)
{
    const uint operandBits = floatBitsToUint(value);
    const uint scannedBits = nbl_glsl_subgroupExclusiveXor_impl(clearScratchToIdentity, operandBits);
    return uintBitsToFloat(scannedBits);
}
// Emulated subgroup inclusive bitwise-OR scan for uint operands, staged through
// nbl_glsl_workgroupArithmeticScratchShared.
//   clearScratchToIdentity - when true, this call first writes `value` to the invocation's
//                            scratch slot and writes the identity 0u at `lastLoadOffset` for
//                            invocations whose pseudoSubgroupInvocation is below 2. When false
//                            that seeding is skipped (presumably the caller already prepared
//                            the scratch - TODO confirm against callers).
//   value                  - this invocation's operand.
// Returns this invocation's `value` after it has been combined with the scratch entries read below.
// NOTE(review): nbl_glsl_identityFunction is not visible here; presumably it returns its argument unchanged.
uint nbl_glsl_subgroupInclusiveOr_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// loMask evaluates to 3u (presumably an emulated subgroup size of 4 - TODO confirm).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Placeholder; presumably overwritten by the helper below through an out parameter (helper not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
// Seed the slot at lastLoadOffset with the zero identity of bitwise OR.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(0u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine step: OR with the entry one slot below this invocation's slot.
value = nbl_glsl_or(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constants of this expansion the bound ((0x1 << 2) >> 1) is 2, so this loop body does not execute.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_or(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then combine with the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_or(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return value; | |
} | |
// Signed-integer overload of the inclusive OR scan.
// Converts `value` to uint, forwards to the uint overload with the same
// `clearScratchToIdentity` flag, and converts the result back to int.
int nbl_glsl_subgroupInclusiveOr_impl(in bool clearScratchToIdentity, int value)
{
    const uint scanned = nbl_glsl_subgroupInclusiveOr_impl(clearScratchToIdentity, uint(value));
    return int(scanned);
}
// Float overload of the inclusive OR scan.
// The OR is applied to the raw bit pattern: `value` is reinterpreted with
// floatBitsToUint, scanned by the uint overload, and the resulting bits are
// reinterpreted back with uintBitsToFloat.
float nbl_glsl_subgroupInclusiveOr_impl(in bool clearScratchToIdentity, float value)
{
    const uint valueBits = floatBitsToUint(value);
    const uint scannedBits = nbl_glsl_subgroupInclusiveOr_impl(clearScratchToIdentity, valueBits);
    return uintBitsToFloat(scannedBits);
}
// Exclusive bitwise-OR scan (nbl_glsl_or) staged through
// nbl_glsl_workgroupArithmeticScratchShared for an emulated ("pseudo") subgroup.
// It first accumulates `value` with the slots at subgroupScanStoreOffset - 1u and
// lastLoadOffset, then stores that result in the invocation's own slot and returns
// what it reads from the slot one below (subgroupScanStoreOffset - 1u).
//   clearScratchToIdentity - when true, this call stores `value` into the invocation's own
//       scratch slot and writes the OR identity (0u) at lastLoadOffset before scanning.
//       When false, neither store happens before the first load.
//       NOTE(review): presumably the caller has already prepared the scratch then - confirm.
//   value - this invocation's operand.
uint nbl_glsl_subgroupExclusiveOr_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// (0x1 << 2) is the expanded size constant (4, presumably the emulated subgroup size), so loMask == 3.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the helper below overwrites it (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
// Only invocations with index < 2 (half of the size constant) write the identity.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(0u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine: distance 1.
value = nbl_glsl_or(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constant above, (0x1 << 2) >> 1 == 2, so this loop (stp starts at 2u) runs zero times in this build.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_or(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then do the final combine with the slot at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_or(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Inclusive -> exclusive: store the accumulated value and fetch the slot one below.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
const uint prevItem = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return nbl_glsl_identityFunction(prevItem); | |
// Stray empty statement left over from macro expansion.
; | |
} | |
// Signed-integer overload of the exclusive OR scan.
// Converts `value` to uint, forwards to the uint overload with the same
// `clearScratchToIdentity` flag, and converts the result back to int.
int nbl_glsl_subgroupExclusiveOr_impl(in bool clearScratchToIdentity, int value)
{
    const uint scanned = nbl_glsl_subgroupExclusiveOr_impl(clearScratchToIdentity, uint(value));
    return int(scanned);
}
// Float overload of the exclusive OR scan.
// The OR is applied to the raw bit pattern: `value` is reinterpreted with
// floatBitsToUint, scanned by the uint overload, and the resulting bits are
// reinterpreted back with uintBitsToFloat.
float nbl_glsl_subgroupExclusiveOr_impl(in bool clearScratchToIdentity, float value)
{
    const uint valueBits = floatBitsToUint(value);
    const uint scannedBits = nbl_glsl_subgroupExclusiveOr_impl(clearScratchToIdentity, valueBits);
    return uintBitsToFloat(scannedBits);
}
// Inclusive addition scan (nbl_glsl_add) on uint, staged through
// nbl_glsl_workgroupArithmeticScratchShared for an emulated ("pseudo") subgroup.
//   clearScratchToIdentity - when true, this call stores `value` into the invocation's own
//       scratch slot and writes the additive identity (0u) at lastLoadOffset before scanning.
//       When false, neither store happens before the first load.
//       NOTE(review): presumably the caller has already prepared the scratch then - confirm.
//   value - this invocation's operand.
// Returns `value` plus the slot at subgroupScanStoreOffset - 1u plus the slot at lastLoadOffset.
uint nbl_glsl_subgroupInclusiveAdd_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// (0x1 << 2) is the expanded size constant (4, presumably the emulated subgroup size), so loMask == 3.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the helper below overwrites it (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
// Only invocations with index < 2 (half of the size constant) write the identity.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(0u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine: distance 1.
value = nbl_glsl_add(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constant above, (0x1 << 2) >> 1 == 2, so this loop (stp starts at 2u) runs zero times in this build.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_add(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then do the final combine with the slot at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_add(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return value; | |
} | |
// Signed-integer overload of the inclusive addition scan.
// Converts `value` to uint, forwards to the uint overload with the same
// `clearScratchToIdentity` flag, and converts the sum back to int.
int nbl_glsl_subgroupInclusiveAdd_impl(in bool clearScratchToIdentity, int value)
{
    const uint scanned = nbl_glsl_subgroupInclusiveAdd_impl(clearScratchToIdentity, uint(value));
    return int(scanned);
}
// Inclusive addition scan (nbl_glsl_add) on float, staged through
// nbl_glsl_workgroupArithmeticScratchShared for an emulated ("pseudo") subgroup.
// Every store goes through floatBitsToUint and every load through uintBitsToFloat,
// so the scratch holds raw float bit patterns.
//   clearScratchToIdentity - when true, this call stores `value` into the invocation's own
//       scratch slot and writes the additive identity (0.0) at lastLoadOffset before scanning.
//       When false, neither store happens before the first load.
//       NOTE(review): presumably the caller has already prepared the scratch then - confirm.
//   value - this invocation's operand.
// Returns `value` plus the slot at subgroupScanStoreOffset - 1u plus the slot at lastLoadOffset.
float nbl_glsl_subgroupInclusiveAdd_impl(in bool clearScratchToIdentity, float value) | |
{ | |
// (0x1 << 2) is the expanded size constant (4, presumably the emulated subgroup size), so loMask == 3.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the helper below overwrites it (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
// Only invocations with index < 2 (half of the size constant) write the identity.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = floatBitsToUint(0.0); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine: distance 1.
value = nbl_glsl_add(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constant above, (0x1 << 2) >> 1 == 2, so this loop (stp starts at 2u) runs zero times in this build.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_add(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then do the final combine with the slot at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_add(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return value; | |
} | |
// Exclusive addition scan (nbl_glsl_add) on uint, staged through
// nbl_glsl_workgroupArithmeticScratchShared for an emulated ("pseudo") subgroup.
// It first accumulates `value` with the slots at subgroupScanStoreOffset - 1u and
// lastLoadOffset, then stores that sum in the invocation's own slot and returns
// what it reads from the slot one below (subgroupScanStoreOffset - 1u).
//   clearScratchToIdentity - when true, this call stores `value` into the invocation's own
//       scratch slot and writes the additive identity (0u) at lastLoadOffset before scanning.
//       When false, neither store happens before the first load.
//       NOTE(review): presumably the caller has already prepared the scratch then - confirm.
//   value - this invocation's operand.
uint nbl_glsl_subgroupExclusiveAdd_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// (0x1 << 2) is the expanded size constant (4, presumably the emulated subgroup size), so loMask == 3.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the helper below overwrites it (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
// Only invocations with index < 2 (half of the size constant) write the identity.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(0u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine: distance 1.
value = nbl_glsl_add(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constant above, (0x1 << 2) >> 1 == 2, so this loop (stp starts at 2u) runs zero times in this build.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_add(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then do the final combine with the slot at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_add(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Inclusive -> exclusive: store the accumulated value and fetch the slot one below.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
const uint prevItem = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return nbl_glsl_identityFunction(prevItem); | |
// Stray empty statement left over from macro expansion.
; | |
} | |
// Signed-integer overload of the exclusive addition scan.
// Converts `value` to uint, forwards to the uint overload with the same
// `clearScratchToIdentity` flag, and converts the sum back to int.
int nbl_glsl_subgroupExclusiveAdd_impl(in bool clearScratchToIdentity, int value)
{
    const uint scanned = nbl_glsl_subgroupExclusiveAdd_impl(clearScratchToIdentity, uint(value));
    return int(scanned);
}
// Exclusive addition scan (nbl_glsl_add) on float, staged through
// nbl_glsl_workgroupArithmeticScratchShared for an emulated ("pseudo") subgroup.
// Every store goes through floatBitsToUint and every load through uintBitsToFloat,
// so the scratch holds raw float bit patterns.
// It first accumulates `value` with the slots at subgroupScanStoreOffset - 1u and
// lastLoadOffset, then stores that sum in the invocation's own slot and returns
// what it reads from the slot one below (subgroupScanStoreOffset - 1u).
//   clearScratchToIdentity - when true, this call stores `value` into the invocation's own
//       scratch slot and writes the additive identity (0.0) at lastLoadOffset before scanning.
//       When false, neither store happens before the first load.
//       NOTE(review): presumably the caller has already prepared the scratch then - confirm.
//   value - this invocation's operand.
float nbl_glsl_subgroupExclusiveAdd_impl(in bool clearScratchToIdentity, float value) | |
{ | |
// (0x1 << 2) is the expanded size constant (4, presumably the emulated subgroup size), so loMask == 3.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the helper below overwrites it (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
// Only invocations with index < 2 (half of the size constant) write the identity.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = floatBitsToUint(0.0); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine: distance 1.
value = nbl_glsl_add(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constant above, (0x1 << 2) >> 1 == 2, so this loop (stp starts at 2u) runs zero times in this build.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_add(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then do the final combine with the slot at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_add(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Inclusive -> exclusive: store the accumulated value and fetch the slot one below.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
const uint prevItem = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return uintBitsToFloat(prevItem); | |
// Stray empty statement left over from macro expansion.
; | |
} | |
// Inclusive multiplication scan (nbl_glsl_mul) on uint, staged through
// nbl_glsl_workgroupArithmeticScratchShared for an emulated ("pseudo") subgroup.
//   clearScratchToIdentity - when true, this call stores `value` into the invocation's own
//       scratch slot and writes the multiplicative identity (1u) at lastLoadOffset before scanning.
//       When false, neither store happens before the first load.
//       NOTE(review): presumably the caller has already prepared the scratch then - confirm.
//   value - this invocation's operand.
// Returns `value` times the slot at subgroupScanStoreOffset - 1u times the slot at lastLoadOffset.
uint nbl_glsl_subgroupInclusiveMul_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// (0x1 << 2) is the expanded size constant (4, presumably the emulated subgroup size), so loMask == 3.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the helper below overwrites it (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
// Only invocations with index < 2 (half of the size constant) write the identity.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(1u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine: distance 1.
value = nbl_glsl_mul(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constant above, (0x1 << 2) >> 1 == 2, so this loop (stp starts at 2u) runs zero times in this build.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_mul(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then do the final combine with the slot at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_mul(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return value; | |
} | |
// Signed-integer overload of the inclusive multiplication scan.
// Converts `value` to uint, forwards to the uint overload with the same
// `clearScratchToIdentity` flag, and converts the product back to int.
int nbl_glsl_subgroupInclusiveMul_impl(in bool clearScratchToIdentity, int value)
{
    const uint scanned = nbl_glsl_subgroupInclusiveMul_impl(clearScratchToIdentity, uint(value));
    return int(scanned);
}
// Inclusive multiplication scan (nbl_glsl_mul) on float, staged through
// nbl_glsl_workgroupArithmeticScratchShared for an emulated ("pseudo") subgroup.
// Every store goes through floatBitsToUint and every load through uintBitsToFloat,
// so the scratch holds raw float bit patterns.
//   clearScratchToIdentity - when true, this call stores `value` into the invocation's own
//       scratch slot and writes the multiplicative identity (1.0) at lastLoadOffset before scanning.
//       When false, neither store happens before the first load.
//       NOTE(review): presumably the caller has already prepared the scratch then - confirm.
//   value - this invocation's operand.
// Returns `value` times the slot at subgroupScanStoreOffset - 1u times the slot at lastLoadOffset.
float nbl_glsl_subgroupInclusiveMul_impl(in bool clearScratchToIdentity, float value) | |
{ | |
// (0x1 << 2) is the expanded size constant (4, presumably the emulated subgroup size), so loMask == 3.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the helper below overwrites it (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
// Only invocations with index < 2 (half of the size constant) write the identity.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = floatBitsToUint(1.0); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine: distance 1.
value = nbl_glsl_mul(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constant above, (0x1 << 2) >> 1 == 2, so this loop (stp starts at 2u) runs zero times in this build.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_mul(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then do the final combine with the slot at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_mul(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return value; | |
} | |
// Exclusive multiplication scan (nbl_glsl_mul) on uint, staged through
// nbl_glsl_workgroupArithmeticScratchShared for an emulated ("pseudo") subgroup.
// It first accumulates `value` with the slots at subgroupScanStoreOffset - 1u and
// lastLoadOffset, then stores that product in the invocation's own slot and returns
// what it reads from the slot one below (subgroupScanStoreOffset - 1u).
//   clearScratchToIdentity - when true, this call stores `value` into the invocation's own
//       scratch slot and writes the multiplicative identity (1u) at lastLoadOffset before scanning.
//       When false, neither store happens before the first load.
//       NOTE(review): presumably the caller has already prepared the scratch then - confirm.
//   value - this invocation's operand.
uint nbl_glsl_subgroupExclusiveMul_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// (0x1 << 2) is the expanded size constant (4, presumably the emulated subgroup size), so loMask == 3.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the helper below overwrites it (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
// Only invocations with index < 2 (half of the size constant) write the identity.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(1u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine: distance 1.
value = nbl_glsl_mul(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constant above, (0x1 << 2) >> 1 == 2, so this loop (stp starts at 2u) runs zero times in this build.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_mul(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then do the final combine with the slot at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_mul(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Inclusive -> exclusive: store the accumulated value and fetch the slot one below.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
const uint prevItem = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return nbl_glsl_identityFunction(prevItem); | |
// Stray empty statement left over from macro expansion.
; | |
} | |
// Signed-integer overload of the exclusive multiplication scan.
// Converts `value` to uint, forwards to the uint overload with the same
// `clearScratchToIdentity` flag, and converts the product back to int.
int nbl_glsl_subgroupExclusiveMul_impl(in bool clearScratchToIdentity, int value)
{
    const uint scanned = nbl_glsl_subgroupExclusiveMul_impl(clearScratchToIdentity, uint(value));
    return int(scanned);
}
// Exclusive multiplication scan (nbl_glsl_mul) on float, staged through
// nbl_glsl_workgroupArithmeticScratchShared for an emulated ("pseudo") subgroup.
// Every store goes through floatBitsToUint and every load through uintBitsToFloat,
// so the scratch holds raw float bit patterns.
// It first accumulates `value` with the slots at subgroupScanStoreOffset - 1u and
// lastLoadOffset, then stores that product in the invocation's own slot and returns
// what it reads from the slot one below (subgroupScanStoreOffset - 1u).
//   clearScratchToIdentity - when true, this call stores `value` into the invocation's own
//       scratch slot and writes the multiplicative identity (1.0) at lastLoadOffset before scanning.
//       When false, neither store happens before the first load.
//       NOTE(review): presumably the caller has already prepared the scratch then - confirm.
//   value - this invocation's operand.
float nbl_glsl_subgroupExclusiveMul_impl(in bool clearScratchToIdentity, float value) | |
{ | |
// (0x1 << 2) is the expanded size constant (4, presumably the emulated subgroup size), so loMask == 3.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the helper below overwrites it (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
// Only invocations with index < 2 (half of the size constant) write the identity.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = floatBitsToUint(1.0); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine: distance 1.
value = nbl_glsl_mul(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constant above, (0x1 << 2) >> 1 == 2, so this loop (stp starts at 2u) runs zero times in this build.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_mul(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then do the final combine with the slot at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = nbl_glsl_mul(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Inclusive -> exclusive: store the accumulated value and fetch the slot one below.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
const uint prevItem = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return uintBitsToFloat(prevItem); | |
// Stray empty statement left over from macro expansion.
; | |
} | |
// Inclusive minimum scan (built-in min) on uint, staged through
// nbl_glsl_workgroupArithmeticScratchShared for an emulated ("pseudo") subgroup.
//   clearScratchToIdentity - when true, this call stores `value` into the invocation's own
//       scratch slot and writes the min identity (4294967295u, the largest uint) at
//       lastLoadOffset before scanning. When false, neither store happens before the first load.
//       NOTE(review): presumably the caller has already prepared the scratch then - confirm.
//   value - this invocation's operand.
// Returns the minimum of `value`, the slot at subgroupScanStoreOffset - 1u and the slot
// at lastLoadOffset.
uint nbl_glsl_subgroupInclusiveMin_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// (0x1 << 2) is the expanded size constant (4, presumably the emulated subgroup size), so loMask == 3.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the helper below overwrites it (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
// Only invocations with index < 2 (half of the size constant) write the identity.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(4294967295u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine: distance 1.
value = min(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constant above, (0x1 << 2) >> 1 == 2, so this loop (stp starts at 2u) runs zero times in this build.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then do the final combine with the slot at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return value; | |
} | |
// Inclusive minimum scan (built-in min) on int, staged through
// nbl_glsl_workgroupArithmeticScratchShared for an emulated ("pseudo") subgroup.
// Unlike the add/mul/bitwise int overloads this one does not forward to the uint overload:
// the comparison is done on int, with uint()/int() conversions around each scratch store/load.
//   clearScratchToIdentity - when true, this call stores `value` into the invocation's own
//       scratch slot and writes the min identity (2147483647, the largest int) at
//       lastLoadOffset before scanning. When false, neither store happens before the first load.
//       NOTE(review): presumably the caller has already prepared the scratch then - confirm.
//   value - this invocation's operand.
// Returns the minimum of `value`, the slot at subgroupScanStoreOffset - 1u and the slot
// at lastLoadOffset.
int nbl_glsl_subgroupInclusiveMin_impl(in bool clearScratchToIdentity, int value) | |
{ | |
// (0x1 << 2) is the expanded size constant (4, presumably the emulated subgroup size), so loMask == 3.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the helper below overwrites it (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
// Only invocations with index < 2 (half of the size constant) write the identity.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = uint(2147483647); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine: distance 1.
value = min(value, int(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constant above, (0x1 << 2) >> 1 == 2, so this loop (stp starts at 2u) runs zero times in this build.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, int(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then do the final combine with the slot at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, int(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return value; | |
} | |
// Inclusive minimum scan (built-in min) on float, staged through
// nbl_glsl_workgroupArithmeticScratchShared for an emulated ("pseudo") subgroup.
// Every store goes through floatBitsToUint and every load through uintBitsToFloat,
// so the scratch holds raw float bit patterns.
//   clearScratchToIdentity - when true, this call stores `value` into the invocation's own
//       scratch slot and writes the min identity (1.f / 0.f) at lastLoadOffset before scanning.
//       When false, neither store happens before the first load.
//       NOTE(review): presumably the caller has already prepared the scratch then - confirm.
//   value - this invocation's operand.
// Returns the minimum of `value`, the slot at subgroupScanStoreOffset - 1u and the slot
// at lastLoadOffset.
float nbl_glsl_subgroupInclusiveMin_impl(in bool clearScratchToIdentity, float value) | |
{ | |
// (0x1 << 2) is the expanded size constant (4, presumably the emulated subgroup size), so loMask == 3.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the helper below overwrites it (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
// Only invocations with index < 2 (half of the size constant) write the identity.
// NOTE(review): (1.f / 0.f) is presumably meant as +infinity; this relies on the
// compiler/driver folding float division by zero to Inf - confirm on the target.
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = floatBitsToUint((1.f / 0.f)); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// First combine: distance 1.
value = min(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// With the constant above, (0x1 << 2) >> 1 == 2, so this loop (stp starts at 2u) runs zero times in this build.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Publish the partial result, then do the final combine with the slot at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return value; | |
} | |
// Emulated subgroup exclusive min-scan for `uint`, staged through
// nbl_glsl_workgroupArithmeticScratchShared.
//   clearScratchToIdentity: when true, this invocation's slot is seeded with `value` and
//                           the slot at lastLoadOffset with 4294967295u (largest uint, the
//                           identity of min).
//   value:                  this invocation's contribution.
// Returns the scratch entry one slot below this invocation's own, read after all
// invocations have stored their running minimum.
// NOTE(review): the offset helpers and nbl_glsl_subgroupBarrier* are defined outside this
// excerpt; their exact layout/synchronization semantics should be confirmed there.
uint nbl_glsl_subgroupExclusiveMin_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// (0x1 << 2) - 1u == 3u
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the call below overwrites it through an
// out/inout parameter - TODO confirm against the helper's declaration.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Optional seeding: own slot gets `value`; invocations with pseudoSubgroupInvocation < 2
// additionally write the min identity at lastLoadOffset.
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(4294967295u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Fold in the entry one slot below this invocation's own.
value = min(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// The bound ((0x1 << 2) >> 1) equals 2, so with stp starting at 2u this loop body never
// executes in this expansion.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Store the running result, then fold in the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Store the final result and fetch the entry one slot below; returning that neighbour
// entry instead of `value` is what distinguishes this from the inclusive variant.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
const uint prevItem = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return nbl_glsl_identityFunction(prevItem); | |
; | |
} | |
// Emulated subgroup exclusive min-scan for `int`, staged through
// nbl_glsl_workgroupArithmeticScratchShared (values are stored as uint(value) and read
// back with int(...)).
//   clearScratchToIdentity: when true, this invocation's slot is seeded with `value` and
//                           the slot at lastLoadOffset with 2147483647 (largest int, the
//                           identity of min).
//   value:                  this invocation's contribution.
// Returns the scratch entry one slot below this invocation's own, converted back to int.
// NOTE(review): the offset helpers and nbl_glsl_subgroupBarrier* are defined outside this
// excerpt; their exact layout/synchronization semantics should be confirmed there.
int nbl_glsl_subgroupExclusiveMin_impl(in bool clearScratchToIdentity, int value) | |
{ | |
// (0x1 << 2) - 1u == 3u
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the call below overwrites it through an
// out/inout parameter - TODO confirm against the helper's declaration.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Optional seeding: own slot gets `value`; invocations with pseudoSubgroupInvocation < 2
// additionally write the min identity at lastLoadOffset.
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = uint(2147483647); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Fold in the entry one slot below this invocation's own.
value = min(value, int(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// The bound ((0x1 << 2) >> 1) equals 2, so with stp starting at 2u this loop body never
// executes in this expansion.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, int(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Store the running result, then fold in the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, int(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Store the final result and fetch the entry one slot below; returning that neighbour
// entry instead of `value` is what distinguishes this from the inclusive variant.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
const uint prevItem = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return int(prevItem); | |
; | |
} | |
// Emulated subgroup exclusive min-scan for `float`, staged through
// nbl_glsl_workgroupArithmeticScratchShared (values are stored via floatBitsToUint and
// read back via uintBitsToFloat).
//   clearScratchToIdentity: when true, this invocation's slot is seeded with `value` and
//                           the slot at lastLoadOffset with (1.f / 0.f) - presumably
//                           +infinity, used as the identity of min.
//   value:                  this invocation's contribution.
// Returns the scratch entry one slot below this invocation's own, reinterpreted as float.
// NOTE(review): the offset helpers and nbl_glsl_subgroupBarrier* are defined outside this
// excerpt; their exact layout/synchronization semantics should be confirmed there.
float nbl_glsl_subgroupExclusiveMin_impl(in bool clearScratchToIdentity, float value) | |
{ | |
// (0x1 << 2) - 1u == 3u
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the call below overwrites it through an
// out/inout parameter - TODO confirm against the helper's declaration.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Optional seeding: own slot gets `value`; invocations with pseudoSubgroupInvocation < 2
// additionally write the min identity at lastLoadOffset.
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = floatBitsToUint((1.f / 0.f)); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Fold in the entry one slot below this invocation's own.
value = min(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// The bound ((0x1 << 2) >> 1) equals 2, so with stp starting at 2u this loop body never
// executes in this expansion.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Store the running result, then fold in the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = min(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Store the final result and fetch the entry one slot below; returning that neighbour
// entry instead of `value` is what distinguishes this from the inclusive variant.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
const uint prevItem = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return uintBitsToFloat(prevItem); | |
; | |
} | |
// Emulated subgroup inclusive max-scan for `uint`, staged through
// nbl_glsl_workgroupArithmeticScratchShared.
//   clearScratchToIdentity: when true, this invocation's slot is seeded with `value` and
//                           the slot at lastLoadOffset with 0u (smallest uint, the
//                           identity of max).
//   value:                  this invocation's contribution.
// Returns `value` after it has been max-combined with the scratch entries read below.
// NOTE(review): the offset helpers and nbl_glsl_subgroupBarrier* are defined outside this
// excerpt; their exact layout/synchronization semantics should be confirmed there.
uint nbl_glsl_subgroupInclusiveMax_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// (0x1 << 2) - 1u == 3u
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the call below overwrites it through an
// out/inout parameter - TODO confirm against the helper's declaration.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Optional seeding: own slot gets `value`; invocations with pseudoSubgroupInvocation < 2
// additionally write the max identity at lastLoadOffset.
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(0u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Fold in the entry one slot below this invocation's own.
value = max(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// The bound ((0x1 << 2) >> 1) equals 2, so with stp starting at 2u this loop body never
// executes in this expansion.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Store the running result, then fold in the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Trailing barrier pair before returning; no further scratch access happens here.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return value; | |
} | |
// Emulated subgroup inclusive max-scan for `int`, staged through
// nbl_glsl_workgroupArithmeticScratchShared (values are stored as uint(value) and read
// back with int(...)).
//   clearScratchToIdentity: when true, this invocation's slot is seeded with `value` and
//                           the slot at lastLoadOffset with -2147483648 (smallest int,
//                           the identity of max).
//   value:                  this invocation's contribution.
// Returns `value` after it has been max-combined with the scratch entries read below.
// NOTE(review): the offset helpers and nbl_glsl_subgroupBarrier* are defined outside this
// excerpt; their exact layout/synchronization semantics should be confirmed there.
int nbl_glsl_subgroupInclusiveMax_impl(in bool clearScratchToIdentity, int value) | |
{ | |
// (0x1 << 2) - 1u == 3u
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the call below overwrites it through an
// out/inout parameter - TODO confirm against the helper's declaration.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Optional seeding: own slot gets `value`; invocations with pseudoSubgroupInvocation < 2
// additionally write the max identity at lastLoadOffset.
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = uint(-2147483648); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Fold in the entry one slot below this invocation's own.
value = max(value, int(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// The bound ((0x1 << 2) >> 1) equals 2, so with stp starting at 2u this loop body never
// executes in this expansion.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, int(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Store the running result, then fold in the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, int(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Trailing barrier pair before returning; no further scratch access happens here.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return value; | |
} | |
// Emulated subgroup inclusive max-scan for `float`, staged through
// nbl_glsl_workgroupArithmeticScratchShared (values are stored via floatBitsToUint and
// read back via uintBitsToFloat).
//   clearScratchToIdentity: when true, this invocation's slot is seeded with `value` and
//                           the slot at lastLoadOffset with -(1.f / 0.f) - presumably
//                           -infinity, used as the identity of max.
//   value:                  this invocation's contribution.
// Returns `value` after it has been max-combined with the scratch entries read below.
// NOTE(review): the offset helpers and nbl_glsl_subgroupBarrier* are defined outside this
// excerpt; their exact layout/synchronization semantics should be confirmed there.
float nbl_glsl_subgroupInclusiveMax_impl(in bool clearScratchToIdentity, float value) | |
{ | |
// (0x1 << 2) - 1u == 3u
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the call below overwrites it through an
// out/inout parameter - TODO confirm against the helper's declaration.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Optional seeding: own slot gets `value`; invocations with pseudoSubgroupInvocation < 2
// additionally write the max identity at lastLoadOffset.
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = floatBitsToUint(-(1.f / 0.f)); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Fold in the entry one slot below this invocation's own.
value = max(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// The bound ((0x1 << 2) >> 1) equals 2, so with stp starting at 2u this loop body never
// executes in this expansion.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Store the running result, then fold in the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Trailing barrier pair before returning; no further scratch access happens here.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return value; | |
} | |
// Emulated subgroup exclusive max-scan for `uint`, staged through
// nbl_glsl_workgroupArithmeticScratchShared.
//   clearScratchToIdentity: when true, this invocation's slot is seeded with `value` and
//                           the slot at lastLoadOffset with 0u (smallest uint, the
//                           identity of max).
//   value:                  this invocation's contribution.
// Returns the scratch entry one slot below this invocation's own, read after all
// invocations have stored their running maximum.
// NOTE(review): the offset helpers and nbl_glsl_subgroupBarrier* are defined outside this
// excerpt; their exact layout/synchronization semantics should be confirmed there.
uint nbl_glsl_subgroupExclusiveMax_impl(in bool clearScratchToIdentity, uint value) | |
{ | |
// (0x1 << 2) - 1u == 3u
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the call below overwrites it through an
// out/inout parameter - TODO confirm against the helper's declaration.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Optional seeding: own slot gets `value`; invocations with pseudoSubgroupInvocation < 2
// additionally write the max identity at lastLoadOffset.
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = nbl_glsl_identityFunction(0u); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Fold in the entry one slot below this invocation's own.
value = max(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// The bound ((0x1 << 2) >> 1) equals 2, so with stp starting at 2u this loop body never
// executes in this expansion.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Store the running result, then fold in the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Store the final result and fetch the entry one slot below; returning that neighbour
// entry instead of `value` is what distinguishes this from the inclusive variant.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
const uint prevItem = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return nbl_glsl_identityFunction(prevItem); | |
; | |
} | |
// Emulated subgroup exclusive max-scan for `int`, staged through
// nbl_glsl_workgroupArithmeticScratchShared (values are stored as uint(value) and read
// back with int(...)).
//   clearScratchToIdentity: when true, this invocation's slot is seeded with `value` and
//                           the slot at lastLoadOffset with -2147483648 (smallest int,
//                           the identity of max).
//   value:                  this invocation's contribution.
// Returns the scratch entry one slot below this invocation's own, converted back to int.
// NOTE(review): the offset helpers and nbl_glsl_subgroupBarrier* are defined outside this
// excerpt; their exact layout/synchronization semantics should be confirmed there.
int nbl_glsl_subgroupExclusiveMax_impl(in bool clearScratchToIdentity, int value) | |
{ | |
// (0x1 << 2) - 1u == 3u
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the call below overwrites it through an
// out/inout parameter - TODO confirm against the helper's declaration.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Optional seeding: own slot gets `value`; invocations with pseudoSubgroupInvocation < 2
// additionally write the max identity at lastLoadOffset.
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = uint(-2147483648); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Fold in the entry one slot below this invocation's own.
value = max(value, int(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// The bound ((0x1 << 2) >> 1) equals 2, so with stp starting at 2u this loop body never
// executes in this expansion.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, int(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Store the running result, then fold in the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, int(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Store the final result and fetch the entry one slot below; returning that neighbour
// entry instead of `value` is what distinguishes this from the inclusive variant.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
const uint prevItem = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return int(prevItem); | |
; | |
} | |
// Emulated subgroup exclusive max-scan for `float`, staged through
// nbl_glsl_workgroupArithmeticScratchShared (values are stored via floatBitsToUint and
// read back via uintBitsToFloat).
//   clearScratchToIdentity: when true, this invocation's slot is seeded with `value` and
//                           the slot at lastLoadOffset with -(1.f / 0.f) - presumably
//                           -infinity, used as the identity of max.
//   value:                  this invocation's contribution.
// Returns the scratch entry one slot below this invocation's own, reinterpreted as float.
// NOTE(review): the offset helpers and nbl_glsl_subgroupBarrier* are defined outside this
// excerpt; their exact layout/synchronization semantics should be confirmed there.
float nbl_glsl_subgroupExclusiveMax_impl(in bool clearScratchToIdentity, float value) | |
{ | |
// (0x1 << 2) - 1u == 3u
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably the call below overwrites it through an
// out/inout parameter - TODO confirm against the helper's declaration.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Optional seeding: own slot gets `value`; invocations with pseudoSubgroupInvocation < 2
// additionally write the max identity at lastLoadOffset.
if (clearScratchToIdentity) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
if (pseudoSubgroupInvocation < ((0x1 << 2) >> 1)) | |
nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset] = floatBitsToUint(-(1.f / 0.f)); | |
} | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
// Fold in the entry one slot below this invocation's own.
value = max(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u])); | |
// The bound ((0x1 << 2) >> 1) equals 2, so with stp starting at 2u this loop body never
// executes in this expansion.
for (uint stp = 2u; stp < ((0x1 << 2) >> 1); stp <<= 1u) | |
{ | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - stp])); | |
} | |
// Store the running result, then fold in the entry at lastLoadOffset.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
value = max(value, uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[lastLoadOffset])); | |
// Store the final result and fetch the entry one slot below; returning that neighbour
// entry instead of `value` is what distinguishes this from the inclusive variant.
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(value); | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
const uint prevItem = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset - 1u]; | |
nbl_glsl_subgroupBarrier(); | |
nbl_glsl_subgroupMemoryBarrierShared(); | |
return uintBitsToFloat(prevItem); | |
; | |
} | |
# 46 "../../../../nbl/builtin/glsl/workgroup/ballot.glsl" 2 | |
// Records each invocation's vote as one bit of a bitfield kept in the first
// (256 + 31) >> 5 == 8 dwords of nbl_glsl_workgroupArithmeticScratchShared.
// A single barrier() separates clearing the dwords from OR-ing in the votes; no barrier
// is issued before the clear or after the votes.
//   value: this invocation's vote; when true, bit (index & 31) of dword (index >> 5) is set.
void nbl_glsl_workgroupBallot_noBarriers(in bool value)
{
    const uint invocation = gl_LocalInvocationIndex;

    // The lowest-indexed invocations each zero one ballot dword.
    if (invocation < ((256 + 31) >> 5))
    {
        nbl_glsl_workgroupArithmeticScratchShared[invocation] = 0u;
    }
    barrier();

    if (value)
    {
        const uint dwordIndex = invocation >> 5;
        const uint bitMask = 1u << (invocation & 31u);
        atomicOr(nbl_glsl_workgroupArithmeticScratchShared[dwordIndex], bitMask);
    }
}
// Barrier-wrapped ballot: calls nbl_glsl_workgroupBallot_noBarriers(value) with a
// barrier() immediately before and immediately after it.
//   value: this invocation's vote.
void nbl_glsl_workgroupBallot(in bool value)
{
    barrier();
    {
        nbl_glsl_workgroupBallot_noBarriers(value);
    }
    barrier();
}
// Tests one bit of the bitfield stored in nbl_glsl_workgroupArithmeticScratchShared.
// No barrier is issued by this function.
//   index: bit position; dword (index >> 5), bit (index & 31) is inspected.
// Returns true when that bit is set.
bool nbl_glsl_workgroupBallotBitExtract_noEndBarriers(in uint index)
{
    const uint ballotDword = nbl_glsl_workgroupArithmeticScratchShared[(index >> 5)];
    const uint bitMask = 1u << (index & 31u);
    return (ballotDword & bitMask) != 0u;
}
// Barrier-wrapped bit extraction: a barrier() is issued before and after the call to
// nbl_glsl_workgroupBallotBitExtract_noEndBarriers(index).
//   index: bit position forwarded to the helper.
// Returns the helper's result.
bool nbl_glsl_workgroupBallotBitExtract(in uint index)
{
    barrier();
    const bool bitIsSet = nbl_glsl_workgroupBallotBitExtract_noEndBarriers(index);
    barrier();

    return bitIsSet;
}
// Looks up the calling invocation's own bit by passing gl_LocalInvocationIndex to
// nbl_glsl_workgroupBallotBitExtract_noEndBarriers. Issues no barrier itself.
bool nbl_glsl_workgroupInverseBallot_noEndBarriers()
{
    const uint ownBit = gl_LocalInvocationIndex;
    return nbl_glsl_workgroupBallotBitExtract_noEndBarriers(ownBit);
}
// Looks up the calling invocation's own bit by passing gl_LocalInvocationIndex to
// nbl_glsl_workgroupBallotBitExtract (the barrier-wrapped variant).
bool nbl_glsl_workgroupInverseBallot()
{
    const uint ownBit = gl_LocalInvocationIndex;
    return nbl_glsl_workgroupBallotBitExtract(ownBit);
}
// Sums the set bits of the ballot dwords held in scratch slots [0, 8) into scratch slot
// (256 + 31) >> 5 == 8 and returns that total.
// Every invocation zeroes the counter slot; after a barrier() the first 8 invocations
// each atomically add the bit count of one dword; a second barrier() precedes the read.
// No barrier is issued before the initial write or after the final read.
uint nbl_glsl_workgroupBallotBitCount_noEndBarriers()
{
    const int counterSlot = (256 + 31) >> 5;

    nbl_glsl_workgroupArithmeticScratchShared[counterSlot] = 0u;
    barrier();

    if (gl_LocalInvocationIndex < counterSlot)
    {
        const uint ballotDword = nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex];
        const uint setBits = bitCount(ballotDword);
        atomicAdd(nbl_glsl_workgroupArithmeticScratchShared[counterSlot], setBits);
    }
    barrier();

    return nbl_glsl_workgroupArithmeticScratchShared[counterSlot];
}
// Barrier-wrapped bit count: a barrier() is issued before and after the call to
// nbl_glsl_workgroupBallotBitCount_noEndBarriers(), whose result is returned.
uint nbl_glsl_workgroupBallotBitCount()
{
    barrier();
    const uint totalSetBits = nbl_glsl_workgroupBallotBitCount_noEndBarriers();
    barrier();

    return totalSetBits;
}
// Broadcast of a `uint` through scratch slot (256 + 31) >> 5 == 8.
// The invocation whose gl_LocalInvocationIndex equals `id` stores `val`; after one
// barrier() every invocation reads the slot back. No barrier precedes the store or
// follows the read.
//   val: value offered by this invocation (only the one from invocation `id` is stored).
//   id:  index of the invocation that performs the store.
uint nbl_glsl_workgroupBroadcast_noBarriers(in uint val, in uint id)
{
    const int broadcastSlot = (256 + 31) >> 5;

    if (gl_LocalInvocationIndex == id)
    {
        nbl_glsl_workgroupArithmeticScratchShared[broadcastSlot] = nbl_glsl_identityFunction(val);
    }
    barrier();

    const uint stored = nbl_glsl_workgroupArithmeticScratchShared[broadcastSlot];
    return nbl_glsl_identityFunction(stored);
}
// Broadcast of a `bool` through scratch slot (256 + 31) >> 5 == 8, stored as uint(val)
// and read back with bool(...).
// The invocation whose gl_LocalInvocationIndex equals `id` stores `val`; after one
// barrier() every invocation reads the slot back. No barrier precedes the store or
// follows the read.
//   val: value offered by this invocation (only the one from invocation `id` is stored).
//   id:  index of the invocation that performs the store.
bool nbl_glsl_workgroupBroadcast_noBarriers(in bool val, in uint id)
{
    const int broadcastSlot = (256 + 31) >> 5;

    if (gl_LocalInvocationIndex == id)
    {
        nbl_glsl_workgroupArithmeticScratchShared[broadcastSlot] = uint(val);
    }
    barrier();

    const uint stored = nbl_glsl_workgroupArithmeticScratchShared[broadcastSlot];
    return bool(stored);
}
// Broadcast of a `float` through scratch slot (256 + 31) >> 5 == 8, stored via
// floatBitsToUint and read back via uintBitsToFloat.
// The invocation whose gl_LocalInvocationIndex equals `id` stores `val`; after one
// barrier() every invocation reads the slot back. No barrier precedes the store or
// follows the read.
//   val: value offered by this invocation (only the one from invocation `id` is stored).
//   id:  index of the invocation that performs the store.
float nbl_glsl_workgroupBroadcast_noBarriers(in float val, in uint id)
{
    const int broadcastSlot = (256 + 31) >> 5;

    if (gl_LocalInvocationIndex == id)
    {
        nbl_glsl_workgroupArithmeticScratchShared[broadcastSlot] = floatBitsToUint(val);
    }
    barrier();

    const uint stored = nbl_glsl_workgroupArithmeticScratchShared[broadcastSlot];
    return uintBitsToFloat(stored);
}
// Broadcast of an `int` through scratch slot (256 + 31) >> 5 == 8, stored as uint(val)
// and read back with int(...).
// The invocation whose gl_LocalInvocationIndex equals `id` stores `val`; after one
// barrier() every invocation reads the slot back. No barrier precedes the store or
// follows the read.
//   val: value offered by this invocation (only the one from invocation `id` is stored).
//   id:  index of the invocation that performs the store.
int nbl_glsl_workgroupBroadcast_noBarriers(in int val, in uint id)
{
    const int broadcastSlot = (256 + 31) >> 5;

    if (gl_LocalInvocationIndex == id)
    {
        nbl_glsl_workgroupArithmeticScratchShared[broadcastSlot] = uint(val);
    }
    barrier();

    const uint stored = nbl_glsl_workgroupArithmeticScratchShared[broadcastSlot];
    return int(stored);
}
# 144 "../../../../nbl/builtin/glsl/workgroup/ballot.glsl" | |
// Barrier-wrapped `uint` broadcast: barrier(), then
// nbl_glsl_workgroupBroadcast_noBarriers(val, id), then barrier().
//   val: value offered by this invocation.
//   id:  index of the invocation whose value is broadcast.
// Returns the helper's result.
uint nbl_glsl_workgroupBroadcast(in uint val, in uint id)
{
    barrier();
    const uint received = nbl_glsl_workgroupBroadcast_noBarriers(val, id);
    barrier();

    return received;
}
// Barrier-wrapped `bool` broadcast: barrier(), then
// nbl_glsl_workgroupBroadcast_noBarriers(val, id), then barrier().
//   val: value offered by this invocation.
//   id:  index of the invocation whose value is broadcast.
// Returns the helper's result.
bool nbl_glsl_workgroupBroadcast(in bool val, in uint id)
{
    barrier();
    const bool received = nbl_glsl_workgroupBroadcast_noBarriers(val, id);
    barrier();

    return received;
}
// Barrier-wrapped `float` broadcast: barrier(), then
// nbl_glsl_workgroupBroadcast_noBarriers(val, id), then barrier().
//   val: value offered by this invocation.
//   id:  index of the invocation whose value is broadcast.
// Returns the helper's result.
float nbl_glsl_workgroupBroadcast(in float val, in uint id)
{
    barrier();
    const float received = nbl_glsl_workgroupBroadcast_noBarriers(val, id);
    barrier();

    return received;
}
// Barrier-wrapped `int` broadcast: barrier(), then
// nbl_glsl_workgroupBroadcast_noBarriers(val, id), then barrier().
//   val: value offered by this invocation.
//   id:  index of the invocation whose value is broadcast.
// Returns the helper's result.
int nbl_glsl_workgroupBroadcast(in int val, in uint id)
{
    barrier();
    const int received = nbl_glsl_workgroupBroadcast_noBarriers(val, id);
    barrier();

    return received;
}
// Broadcast of a `uint` from the invocation for which nbl_glsl_workgroupElect() returns
// true, through scratch slot (256 + 31) >> 5 == 8. One barrier() separates the store
// from the read; no barrier precedes the store or follows the read.
//   val: value offered by this invocation (only the elected one is stored).
uint nbl_glsl_workgroupBroadcastFirst_noBarriers(in uint val)
{
    const int broadcastSlot = (256 + 31) >> 5;

    if (nbl_glsl_workgroupElect())
    {
        nbl_glsl_workgroupArithmeticScratchShared[broadcastSlot] = val;
    }
    barrier();

    return nbl_glsl_workgroupArithmeticScratchShared[broadcastSlot];
}
// Barrier-wrapped variant: barrier(), then
// nbl_glsl_workgroupBroadcastFirst_noBarriers(val), then barrier().
//   val: value offered by this invocation.
// Returns the helper's result.
uint nbl_glsl_workgroupBroadcastFirst(in uint val)
{
    barrier();
    const uint received = nbl_glsl_workgroupBroadcastFirst_noBarriers(val);
    barrier();

    return received;
}
// `bool` overload: forwards to nbl_glsl_workgroupBroadcast with source invocation 0u.
bool nbl_glsl_workgroupBroadcastFirst(in bool val)
{
    const uint sourceInvocation = 0u;
    return nbl_glsl_workgroupBroadcast(val, sourceInvocation);
}
// `float` overload: forwards to nbl_glsl_workgroupBroadcast with source invocation 0u.
float nbl_glsl_workgroupBroadcastFirst(in float val)
{
    const uint sourceInvocation = 0u;
    return nbl_glsl_workgroupBroadcast(val, sourceInvocation);
}
// `int` overload: forwards to nbl_glsl_workgroupBroadcast with source invocation 0u.
int nbl_glsl_workgroupBroadcastFirst(in int val)
{
    const uint sourceInvocation = 0u;
    return nbl_glsl_workgroupBroadcast(val, sourceInvocation);
}
# 256 "../../../../nbl/builtin/glsl/workgroup/ballot.glsl" | |
// Forward declaration so the Inclusive/Exclusive wrappers below can call it; the
// definition appears later in this file, after nbl_glsl_workgroupBallotScanBitCount_impl_impl.
uint nbl_glsl_workgroupBallotScanBitCount_impl(in bool exclusive); | |
// Calls nbl_glsl_workgroupBallotScanBitCount_impl with exclusive == false and returns
// its result.
uint nbl_glsl_workgroupBallotInclusiveBitCount()
{
    const bool exclusive = false;
    return nbl_glsl_workgroupBallotScanBitCount_impl(exclusive);
}
// Calls nbl_glsl_workgroupBallotScanBitCount_impl with exclusive == true and returns
// its result.
uint nbl_glsl_workgroupBallotExclusiveBitCount()
{
    const bool exclusive = true;
    return nbl_glsl_workgroupBallotScanBitCount_impl(exclusive);
}
// Multi-level add-scan over the per-invocation bit counts of the first
// (256 + 31 >> 5) == 8 invocations, using nbl_glsl_workgroupArithmeticScratchShared as
// staging memory. The code is a macro expansion, so several conditions are compile-time
// constants (noted inline).
//   localBitCount: this invocation's count (the caller passes bitCount of its ballot dword).
// Side effect: invocations below lastInvocation write their scan result to
// scratch[gl_LocalInvocationIndex + 1u] just before the final barrier.
// Returns scratch[gl_LocalInvocationIndex] read after that barrier (see the note at the
// return statement).
// NOTE(review): the nbl_glsl_subgroup_* helpers and nbl_glsl_add/nbl_glsl_identityFunction
// are defined outside this excerpt; the scratch layout they imply should be confirmed there.
uint nbl_glsl_workgroupBallotScanBitCount_impl_impl(in uint localBitCount) | |
{ | |
barrier(); | |
// (0x1 << 2) - 1u == 3u
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is a placeholder; presumably overwritten by the call below through an
// out/inout parameter - TODO confirm. It is not read again in this function.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Seed phase: store own count, and write 0u (the add identity) to a slot derived from
// the invocation index.
{ | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(localBitCount); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0u); | |
// 256 < 2 is false, so this extra clearing loop is dead code in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, (256 + 31 >> 5) - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0u); | |
} | |
barrier(); | |
} | |
// lastInvocation == 7u: index of the last invocation that holds a ballot dword.
const uint lastInvocation = (256 + 31 >> 5) - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First-level scan; called with clearScratchToIdentity == false because the scratch was
// seeded above.
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAdd_impl(false, localBitCount)); | |
uint scan = firstLevelScan; | |
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Requires lastInvocationInLevel >= 16; it starts at 7, so this loop is not entered in
// this expansion.
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAdd_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// 7 >= 4 holds, so exactly one further level is processed: participating invocations
// that are last in the level or have pseudoSubgroupInvocation == loMask store their
// scan, lastInvocationInLevel becomes 7 >> 2 == 1, and the remaining participants
// scan those stored values.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAdd_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Down-sweep: add the higher-level results back into the first-level scan.
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// pseudoSubgroupID is smaller than shiftedInvocationIndex, so this uint subtraction
// wraps around; it is only ever used added to scanLoadIndex.
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// findMSB(7u) == 2, so logShift starts at (2 / 2 - 1u) * 2 == 0 and this loop body
// does not execute in this expansion.
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(nbl_glsl_add(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations 4..7 (pseudoSubgroupID != 0) add the higher-level entry just below
// their group's slot to their own first-level result.
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_add(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan))); | |
} | |
} | |
// Constant-true branch (the `else` below is unreachable): results are shifted up by one
// slot so that scratch[i] holds the result of invocation i - 1.
if (true) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
// NOTE(review): any(index != 0u, index <= lastInvocation) is true for every index
// (index 0 satisfies the second test), so the `: 0u` alternative is never selected and
// scratch[gl_LocalInvocationIndex] is always returned. Possibly `all` was intended -
// confirm against the upstream macro before changing.
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 0u; | |
} | |
else | |
return nbl_glsl_identityFunction(firstLevelScan); | |
} | |
// Per-invocation prefix count of set ballot bits.
// The ballot bitfield is read from the first (256 + 31) >> 5 == 8 dwords of
// nbl_glsl_workgroupArithmeticScratchShared. The result is the count contributed by the
// dwords before this invocation's own (taken from scratch after the workgroup scan) plus
// the set bits of its own dword selected by a position mask.
//   exclusive: when true the invocation's own bit is left out of the mask
//              (0x7fffffffu base mask), otherwise it is included (0xffffffffu).
// The ballot dwords are backed up before the scan reuses the scratch area and restored
// afterwards, so the bitfield is intact when this function returns.
uint nbl_glsl_workgroupBallotScanBitCount_impl(in bool exclusive)
{
    const uint _dword = (gl_LocalInvocationIndex >> 5);
    // Own ballot dword, captured before the scan overwrites the scratch area.
    const uint localBitfield = nbl_glsl_workgroupArithmeticScratchShared[_dword];
    uint globalCount;
    {
        // Initialized to 0u: invocations with index >= 8 do not load a dword, yet the value
        // is still passed to bitCount() below. Previously that read an uninitialized
        // variable; an empty bitfield gives those invocations a defined zero contribution.
        uint localBitfieldBackup = 0u;
        if (gl_LocalInvocationIndex < (256 + 31 >> 5))
            localBitfieldBackup = nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex];
        // Return value intentionally unused: the scan leaves its results in scratch, where
        // slot i holds the total of dwords [0, i).
        nbl_glsl_workgroupBallotScanBitCount_impl_impl(bitCount(localBitfieldBackup));
        // Dword 0 has no predecessors, so its prefix is 0.
        globalCount = _dword != 0u ? nbl_glsl_workgroupArithmeticScratchShared[_dword] : 0u;
        barrier();
        // Put the ballot dwords back so the bitfield survives this call.
        if (gl_LocalInvocationIndex < (256 + 31 >> 5))
            nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex] = localBitfieldBackup;
        barrier();
    }
    // Keep bits [0, index & 31] (inclusive) or [0, (index & 31) - 1] (exclusive) of the
    // own dword.
    const uint mask = (exclusive ? 0x7fffffffu : 0xffffffffu) >> (31u - (gl_LocalInvocationIndex & 31u));
    return globalCount + bitCount(localBitfield & mask);
}
# 22 "../../../../nbl/builtin/glsl/workgroup/clustered.glsl" 2 | |
# 22 "../../../../nbl/builtin/glsl/workgroup/arithmetic.glsl" 2 | |
# 53 "../../../../nbl/builtin/glsl/workgroup/arithmetic.glsl" | |
// Workgroup reduction driven by nbl_glsl_subgroupInclusiveAnd_impl over invocations 0..255
// (lastInvocation is fixed at 255), staged through nbl_glsl_workgroupArithmeticScratchShared.
// No barrier() is issued before the first scratch write or after the final broadcast; the
// nbl_glsl_workgroupAnd wrappers in this file add both.
// val: this invocation's operand.
// Returns: nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel) — presumably the
//          AND of every invocation's val; the helpers that establish this are outside this block.
uint nbl_glsl_workgroupAnd_noBarriers(in uint val) | |
{ | |
// loMask == 3: selects the low two bits of an invocation index (groups of 4, matching the >> 2 below).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Seed scratch: own value at the scan slot, and the AND identity 0xffFFffFF at a second index
// derived from the invocation index (presumably padding read by the emulated scan — TODO confirm).
{ | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0xffFFffFFu); | |
// 256 < 2 is constant-false, so this extra clearing loop never runs in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0xffFFffFFu); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAnd_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask (presumably the last lane of a group of 4).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// scanStoreIndex is only consumed inside the constant-false branches below.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass: selected invocations publish their scan to nextStoreIndex, the level shrinks by
// a factor of 4, and the remaining participants rescan what was published.
// Starting from 255 the loop body runs twice (255 -> 63, 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAnd_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last level (15 -> 3 for this workgroup size): the same publish/rescan step, once.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAnd_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand the scan held at index lastInvocationInLevel to the broadcast helper.
return nbl_glsl_identityFunction(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Signed overload: converts val to uint, runs the uint reduction, converts the result back to int.
int nbl_glsl_workgroupAnd_noBarriers(in int val)
{
    const uint operandBits = uint(val);
    const uint reducedBits = nbl_glsl_workgroupAnd_noBarriers(operandBits);
    return int(reducedBits);
}
// Float overload: reinterprets val's bit pattern as uint, runs the uint reduction on it,
// and reinterprets the resulting bits as a float.
float nbl_glsl_workgroupAnd_noBarriers(in float val)
{
    const uint operandBits = floatBitsToUint(val);
    const uint reducedBits = nbl_glsl_workgroupAnd_noBarriers(operandBits);
    return uintBitsToFloat(reducedBits);
}
// Barrier-wrapped nbl_glsl_workgroupAnd_noBarriers (uint): one barrier() before the
// reduction and one after it, then the reduced value is returned.
uint nbl_glsl_workgroupAnd(in uint val)
{
    barrier();
    uint reduced = nbl_glsl_workgroupAnd_noBarriers(val);
    barrier();
    return reduced;
}
// Barrier-wrapped nbl_glsl_workgroupAnd_noBarriers (int): one barrier() before the
// reduction and one after it, then the reduced value is returned.
int nbl_glsl_workgroupAnd(in int val)
{
    barrier();
    int reduced = nbl_glsl_workgroupAnd_noBarriers(val);
    barrier();
    return reduced;
}
// Barrier-wrapped nbl_glsl_workgroupAnd_noBarriers (float): one barrier() before the
// reduction and one after it, then the reduced value is returned.
float nbl_glsl_workgroupAnd(in float val)
{
    barrier();
    float reduced = nbl_glsl_workgroupAnd_noBarriers(val);
    barrier();
    return reduced;
}
// Workgroup reduction driven by nbl_glsl_subgroupInclusiveOr_impl over invocations 0..255
// (lastInvocation is fixed at 255), staged through nbl_glsl_workgroupArithmeticScratchShared.
// No barrier() is issued before the first scratch write or after the final broadcast; the
// nbl_glsl_workgroupOr wrappers in this file add both.
// val: this invocation's operand.
// Returns: nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel) — presumably the
//          OR of every invocation's val; the helpers that establish this are outside this block.
uint nbl_glsl_workgroupOr_noBarriers(in uint val) | |
{ | |
// loMask == 3: selects the low two bits of an invocation index (groups of 4, matching the >> 2 below).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Seed scratch: own value at the scan slot, and the OR identity 0 at a second index
// derived from the invocation index (presumably padding read by the emulated scan — TODO confirm).
{ | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0u); | |
// 256 < 2 is constant-false, so this extra clearing loop never runs in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0u); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveOr_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask (presumably the last lane of a group of 4).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// scanStoreIndex is only consumed inside the constant-false branches below.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass: selected invocations publish their scan to nextStoreIndex, the level shrinks by
// a factor of 4, and the remaining participants rescan what was published.
// Starting from 255 the loop body runs twice (255 -> 63, 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveOr_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last level (15 -> 3 for this workgroup size): the same publish/rescan step, once.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveOr_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand the scan held at index lastInvocationInLevel to the broadcast helper.
return nbl_glsl_identityFunction(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Signed overload: converts val to uint, runs the uint reduction, converts the result back to int.
int nbl_glsl_workgroupOr_noBarriers(in int val)
{
    const uint operandBits = uint(val);
    const uint reducedBits = nbl_glsl_workgroupOr_noBarriers(operandBits);
    return int(reducedBits);
}
// Float overload of the workgroup reduction driven by nbl_glsl_subgroupInclusiveOr_impl over
// invocations 0..255, staged through nbl_glsl_workgroupArithmeticScratchShared.
// Values sit in scratch as raw bits: floatBitsToUint on the way in, uintBitsToFloat before
// each subgroup op and on the returned broadcast.
// No barrier() is issued before the first scratch write or after the final broadcast.
// val: this invocation's operand.
// Returns: uintBitsToFloat of nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel).
// NOTE(review): the float overload of nbl_glsl_workgroupAnd_noBarriers in this file just
// reinterprets the bits and calls the uint overload, whereas this one expands the whole
// reduction — confirm whether the difference is intentional.
float nbl_glsl_workgroupOr_noBarriers(in float val) | |
{ | |
// loMask == 3: selects the low two bits of an invocation index (groups of 4, matching the >> 2 below).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Seed scratch: own value's bits at the scan slot, and the bits of 0.0 at a second index
// derived from the invocation index (presumably padding read by the emulated scan — TODO confirm).
{ | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint(0.0); | |
// 256 < 2 is constant-false, so this extra clearing loop never runs in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint(0.0); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveOr_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask (presumably the last lane of a group of 4).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// scanStoreIndex is only consumed inside the constant-false branches below.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass: selected invocations publish their scan to nextStoreIndex, the level shrinks by
// a factor of 4, and the remaining participants rescan what was published.
// Starting from 255 the loop body runs twice (255 -> 63, 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveOr_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last level (15 -> 3 for this workgroup size): the same publish/rescan step, once.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveOr_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand the scan held at index lastInvocationInLevel to the broadcast helper.
return uintBitsToFloat(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Barrier-wrapped nbl_glsl_workgroupOr_noBarriers (uint): one barrier() before the
// reduction and one after it, then the reduced value is returned.
uint nbl_glsl_workgroupOr(in uint val)
{
    barrier();
    uint reduced = nbl_glsl_workgroupOr_noBarriers(val);
    barrier();
    return reduced;
}
// Barrier-wrapped nbl_glsl_workgroupOr_noBarriers (int): one barrier() before the
// reduction and one after it, then the reduced value is returned.
int nbl_glsl_workgroupOr(in int val)
{
    barrier();
    int reduced = nbl_glsl_workgroupOr_noBarriers(val);
    barrier();
    return reduced;
}
// Barrier-wrapped nbl_glsl_workgroupOr_noBarriers (float): one barrier() before the
// reduction and one after it, then the reduced value is returned.
float nbl_glsl_workgroupOr(in float val)
{
    barrier();
    float reduced = nbl_glsl_workgroupOr_noBarriers(val);
    barrier();
    return reduced;
}
// Workgroup reduction driven by nbl_glsl_subgroupInclusiveXor_impl over invocations 0..255
// (lastInvocation is fixed at 255), staged through nbl_glsl_workgroupArithmeticScratchShared.
// No barrier() is issued before the first scratch write or after the final broadcast; the
// nbl_glsl_workgroupXor wrappers in this file add both.
// val: this invocation's operand.
// Returns: nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel) — presumably the
//          XOR of every invocation's val; the helpers that establish this are outside this block.
uint nbl_glsl_workgroupXor_noBarriers(in uint val) | |
{ | |
// loMask == 3: selects the low two bits of an invocation index (groups of 4, matching the >> 2 below).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Seed scratch: own value at the scan slot, and the XOR identity 0 at a second index
// derived from the invocation index (presumably padding read by the emulated scan — TODO confirm).
{ | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0u); | |
// 256 < 2 is constant-false, so this extra clearing loop never runs in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0u); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveXor_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask (presumably the last lane of a group of 4).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// scanStoreIndex is only consumed inside the constant-false branches below.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass: selected invocations publish their scan to nextStoreIndex, the level shrinks by
// a factor of 4, and the remaining participants rescan what was published.
// Starting from 255 the loop body runs twice (255 -> 63, 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveXor_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last level (15 -> 3 for this workgroup size): the same publish/rescan step, once.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveXor_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand the scan held at index lastInvocationInLevel to the broadcast helper.
return nbl_glsl_identityFunction(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Signed overload: converts val to uint, runs the uint reduction, converts the result back to int.
int nbl_glsl_workgroupXor_noBarriers(in int val)
{
    const uint operandBits = uint(val);
    const uint reducedBits = nbl_glsl_workgroupXor_noBarriers(operandBits);
    return int(reducedBits);
}
// Float overload of the workgroup reduction driven by nbl_glsl_subgroupInclusiveXor_impl over
// invocations 0..255, staged through nbl_glsl_workgroupArithmeticScratchShared.
// Values sit in scratch as raw bits: floatBitsToUint on the way in, uintBitsToFloat before
// each subgroup op and on the returned broadcast.
// No barrier() is issued before the first scratch write or after the final broadcast.
// val: this invocation's operand.
// Returns: uintBitsToFloat of nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel).
// NOTE(review): the float overload of nbl_glsl_workgroupAnd_noBarriers in this file just
// reinterprets the bits and calls the uint overload, whereas this one expands the whole
// reduction — confirm whether the difference is intentional.
float nbl_glsl_workgroupXor_noBarriers(in float val) | |
{ | |
// loMask == 3: selects the low two bits of an invocation index (groups of 4, matching the >> 2 below).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Seed scratch: own value's bits at the scan slot, and the bits of 0.0 at a second index
// derived from the invocation index (presumably padding read by the emulated scan — TODO confirm).
{ | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint(0.0); | |
// 256 < 2 is constant-false, so this extra clearing loop never runs in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint(0.0); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveXor_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask (presumably the last lane of a group of 4).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// scanStoreIndex is only consumed inside the constant-false branches below.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass: selected invocations publish their scan to nextStoreIndex, the level shrinks by
// a factor of 4, and the remaining participants rescan what was published.
// Starting from 255 the loop body runs twice (255 -> 63, 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveXor_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last level (15 -> 3 for this workgroup size): the same publish/rescan step, once.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveXor_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand the scan held at index lastInvocationInLevel to the broadcast helper.
return uintBitsToFloat(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Barrier-wrapped nbl_glsl_workgroupXor_noBarriers (uint): one barrier() before the
// reduction and one after it, then the reduced value is returned.
uint nbl_glsl_workgroupXor(in uint val)
{
    barrier();
    uint reduced = nbl_glsl_workgroupXor_noBarriers(val);
    barrier();
    return reduced;
}
// Barrier-wrapped nbl_glsl_workgroupXor_noBarriers (int): one barrier() before the
// reduction and one after it, then the reduced value is returned.
int nbl_glsl_workgroupXor(in int val)
{
    barrier();
    int reduced = nbl_glsl_workgroupXor_noBarriers(val);
    barrier();
    return reduced;
}
// Barrier-wrapped nbl_glsl_workgroupXor_noBarriers (float): one barrier() before the
// reduction and one after it, then the reduced value is returned.
float nbl_glsl_workgroupXor(in float val)
{
    barrier();
    float reduced = nbl_glsl_workgroupXor_noBarriers(val);
    barrier();
    return reduced;
}
// Workgroup reduction driven by nbl_glsl_subgroupInclusiveAdd_impl over invocations 0..255
// (lastInvocation is fixed at 255), staged through nbl_glsl_workgroupArithmeticScratchShared.
// No barrier() is issued before the first scratch write or after the final broadcast; the
// nbl_glsl_workgroupAdd wrappers in this file add both.
// val: this invocation's operand.
// Returns: nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel) — presumably the
//          sum of every invocation's val; the helpers that establish this are outside this block.
uint nbl_glsl_workgroupAdd_noBarriers(in uint val) | |
{ | |
// loMask == 3: selects the low two bits of an invocation index (groups of 4, matching the >> 2 below).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Seed scratch: own value at the scan slot, and the additive identity 0 at a second index
// derived from the invocation index (presumably padding read by the emulated scan — TODO confirm).
{ | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0u); | |
// 256 < 2 is constant-false, so this extra clearing loop never runs in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0u); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAdd_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask (presumably the last lane of a group of 4).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// scanStoreIndex is only consumed inside the constant-false branches below.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass: selected invocations publish their scan to nextStoreIndex, the level shrinks by
// a factor of 4, and the remaining participants rescan what was published.
// Starting from 255 the loop body runs twice (255 -> 63, 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAdd_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last level (15 -> 3 for this workgroup size): the same publish/rescan step, once.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAdd_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand the scan held at index lastInvocationInLevel to the broadcast helper.
return nbl_glsl_identityFunction(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Signed overload: converts val to uint, runs the uint reduction, converts the result back to int.
int nbl_glsl_workgroupAdd_noBarriers(in int val)
{
    const uint operandBits = uint(val);
    const uint reducedBits = nbl_glsl_workgroupAdd_noBarriers(operandBits);
    return int(reducedBits);
}
// Float overload of the workgroup reduction driven by nbl_glsl_subgroupInclusiveAdd_impl over
// invocations 0..255, staged through nbl_glsl_workgroupArithmeticScratchShared.
// Values sit in scratch as raw bits: floatBitsToUint on the way in, uintBitsToFloat before
// each subgroup op and on the returned broadcast.
// No barrier() is issued before the first scratch write or after the final broadcast; the
// nbl_glsl_workgroupAdd wrappers in this file add both.
// val: this invocation's operand.
// Returns: uintBitsToFloat of nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)
//          — presumably the sum of every invocation's val; the helpers are outside this block.
float nbl_glsl_workgroupAdd_noBarriers(in float val) | |
{ | |
// loMask == 3: selects the low two bits of an invocation index (groups of 4, matching the >> 2 below).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Seed scratch: own value's bits at the scan slot, and the bits of 0.0 at a second index
// derived from the invocation index (presumably padding read by the emulated scan — TODO confirm).
{ | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint(0.0); | |
// 256 < 2 is constant-false, so this extra clearing loop never runs in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint(0.0); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveAdd_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask (presumably the last lane of a group of 4).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// scanStoreIndex is only consumed inside the constant-false branches below.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass: selected invocations publish their scan to nextStoreIndex, the level shrinks by
// a factor of 4, and the remaining participants rescan what was published.
// Starting from 255 the loop body runs twice (255 -> 63, 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveAdd_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last level (15 -> 3 for this workgroup size): the same publish/rescan step, once.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveAdd_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand the scan held at index lastInvocationInLevel to the broadcast helper.
return uintBitsToFloat(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Barrier-wrapped nbl_glsl_workgroupAdd_noBarriers (uint): one barrier() before the
// reduction and one after it, then the reduced value is returned.
uint nbl_glsl_workgroupAdd(in uint val)
{
    barrier();
    uint reduced = nbl_glsl_workgroupAdd_noBarriers(val);
    barrier();
    return reduced;
}
// Barrier-wrapped nbl_glsl_workgroupAdd_noBarriers (int): one barrier() before the
// reduction and one after it, then the reduced value is returned.
int nbl_glsl_workgroupAdd(in int val)
{
    barrier();
    int reduced = nbl_glsl_workgroupAdd_noBarriers(val);
    barrier();
    return reduced;
}
// Barrier-wrapped nbl_glsl_workgroupAdd_noBarriers (float): one barrier() before the
// reduction and one after it, then the reduced value is returned.
float nbl_glsl_workgroupAdd(in float val)
{
    barrier();
    float reduced = nbl_glsl_workgroupAdd_noBarriers(val);
    barrier();
    return reduced;
}
// Workgroup reduction driven by nbl_glsl_subgroupInclusiveMul_impl over invocations 0..255
// (lastInvocation is fixed at 255), staged through nbl_glsl_workgroupArithmeticScratchShared.
// No barrier() is issued before the first scratch write or after the final broadcast.
// val: this invocation's operand.
// Returns: nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel) — presumably the
//          product of every invocation's val; the helpers that establish this are outside this block.
uint nbl_glsl_workgroupMul_noBarriers(in uint val) | |
{ | |
// loMask == 3: selects the low two bits of an invocation index (groups of 4, matching the >> 2 below).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
// Seed scratch: own value at the scan slot, and the multiplicative identity 1 at a second index
// derived from the invocation index (presumably padding read by the emulated scan — TODO confirm).
{ | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(1u); | |
// 256 < 2 is constant-false, so this extra clearing loop never runs in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(1u); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMul_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask (presumably the last lane of a group of 4).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// scanStoreIndex is only consumed inside the constant-false branches below.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass: selected invocations publish their scan to nextStoreIndex, the level shrinks by
// a factor of 4, and the remaining participants rescan what was published.
// Starting from 255 the loop body runs twice (255 -> 63, 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMul_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last level (15 -> 3 for this workgroup size): the same publish/rescan step, once.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMul_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand the scan held at index lastInvocationInLevel to the broadcast helper.
return nbl_glsl_identityFunction(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// int overload of the workgroup product: converts the input to uint, defers to the
// uint overload, and converts the result back to int.
int nbl_glsl_workgroupMul_noBarriers(in int val)
{
    const uint asUnsigned = uint(val);
    const uint product = nbl_glsl_workgroupMul_noBarriers(asUnsigned);
    return int(product);
}
// Workgroup-wide product of `val` (float overload), hard-wired to 256 invocations (lastInvocation = 255).
// Works level by level through nbl_glsl_workgroupArithmeticScratchShared: every level runs
// nbl_glsl_subgroupInclusiveMul_impl, publishes partial results and shrinks the active range by 4x.
// Floats are kept in the uint scratch array as raw bits (floatBitsToUint / uintBitsToFloat).
// The body still calls barrier() between its phases; nbl_glsl_workgroupMul is the wrapper that
// additionally puts a barrier() before and after this call.
// Returns the `scan` value broadcast from invocation `lastInvocationInLevel`.
float nbl_glsl_workgroupMul_noBarriers(in float val) | |
{ | |
// loMask == 3: selects the low two bits of the invocation index.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Sentinel initial value; passed to the helper below (what the helper does with it is not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed the scratch: this invocation's value goes to its scan slot ...
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val); | |
const uint halfMask = loMask >> 1u; | |
// ... and the multiplicative identity (1.0) goes to a second slot derived from the invocation index.
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint(1.0); | |
// 256 < 2 is never true, so this extra clearing loop is dead code in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint(1.0); | |
} | |
// Scratch must be fully seeded before any invocation starts scanning.
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First level: scan of the raw input (helper defined elsewhere; the meaning of its `false` argument is not visible here).
uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveMul_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask -- presumably the last lane of an emulated subgroup (confirm against the helper).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
// Slot where this invocation publishes its partial result for the next level.
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// Only consumed under `if (false)` below, so effectively unused in this reduction.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass collapses one level; starting from 255 the loop body runs twice (255 -> 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// The level's last invocation and every possibleProp invocation publish their partial result.
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrink the active range by 4x and recompute who takes part in the next level.
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
// Reload from this invocation's scan slot and scan again.
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMul_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// One more level if the remaining range is still at least 4 wide (15 >= 4 here, leaving lastInvocationInLevel == 3).
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMul_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand every invocation the scan value held by invocation lastInvocationInLevel, reinterpreted as float.
return uintBitsToFloat(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Barrier-guarded workgroup product (uint overload): one barrier() before the
// reduction, one after it, then the reduced value is returned.
uint nbl_glsl_workgroupMul(in uint val)
{
    barrier();
    const uint product = nbl_glsl_workgroupMul_noBarriers(val);
    barrier();
    return product;
}
// Barrier-guarded workgroup product (int overload): one barrier() before the
// reduction, one after it, then the reduced value is returned.
int nbl_glsl_workgroupMul(in int val)
{
    barrier();
    const int product = nbl_glsl_workgroupMul_noBarriers(val);
    barrier();
    return product;
}
// Barrier-guarded workgroup product (float overload): one barrier() before the
// reduction, one after it, then the reduced value is returned.
float nbl_glsl_workgroupMul(in float val)
{
    barrier();
    const float product = nbl_glsl_workgroupMul_noBarriers(val);
    barrier();
    return product;
}
// Workgroup-wide minimum of `val` (uint overload), hard-wired to 256 invocations (lastInvocation = 255).
// Works level by level through nbl_glsl_workgroupArithmeticScratchShared: every level runs
// nbl_glsl_subgroupInclusiveMin_impl, publishes partial results and shrinks the active range by 4x.
// Values pass through nbl_glsl_identityFunction on their way into and out of the uint scratch array.
// The body still calls barrier() between its phases; nbl_glsl_workgroupMin is the wrapper that
// additionally puts a barrier() before and after this call.
// Returns the `scan` value broadcast from invocation `lastInvocationInLevel`.
uint nbl_glsl_workgroupMin_noBarriers(in uint val) | |
{ | |
// loMask == 3: selects the low two bits of the invocation index.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Sentinel initial value; passed to the helper below (what the helper does with it is not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed the scratch: this invocation's value goes to its scan slot ...
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
// ... and the identity for min (4294967295u, the largest uint) goes to a second slot derived from the invocation index.
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(4294967295u); | |
// 256 < 2 is never true, so this extra clearing loop is dead code in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(4294967295u); | |
} | |
// Scratch must be fully seeded before any invocation starts scanning.
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First level: scan of the raw input (helper defined elsewhere; the meaning of its `false` argument is not visible here).
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMin_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask -- presumably the last lane of an emulated subgroup (confirm against the helper).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
// Slot where this invocation publishes its partial result for the next level.
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// Only consumed under `if (false)` below, so effectively unused in this reduction.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass collapses one level; starting from 255 the loop body runs twice (255 -> 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// The level's last invocation and every possibleProp invocation publish their partial result.
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrink the active range by 4x and recompute who takes part in the next level.
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
// Reload from this invocation's scan slot and scan again.
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMin_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// One more level if the remaining range is still at least 4 wide (15 >= 4 here, leaving lastInvocationInLevel == 3).
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMin_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand every invocation the scan value held by invocation lastInvocationInLevel.
return nbl_glsl_identityFunction(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Workgroup-wide minimum of `val` (int overload), hard-wired to 256 invocations (lastInvocation = 255).
// Works level by level through nbl_glsl_workgroupArithmeticScratchShared: every level runs
// nbl_glsl_subgroupInclusiveMin_impl, publishes partial results and shrinks the active range by 4x.
// Ints are converted with uint(...) when written to the uint scratch array and with int(...) when read back.
// The body still calls barrier() between its phases; nbl_glsl_workgroupMin is the wrapper that
// additionally puts a barrier() before and after this call.
// Returns the `scan` value broadcast from invocation `lastInvocationInLevel`.
int nbl_glsl_workgroupMin_noBarriers(in int val) | |
{ | |
// loMask == 3: selects the low two bits of the invocation index.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Sentinel initial value; passed to the helper below (what the helper does with it is not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed the scratch: this invocation's value goes to its scan slot ...
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(val); | |
const uint halfMask = loMask >> 1u; | |
// ... and the identity for signed min (2147483647, the largest int) goes to a second slot derived from the invocation index.
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = uint(2147483647); | |
// 256 < 2 is never true, so this extra clearing loop is dead code in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = uint(2147483647); | |
} | |
// Scratch must be fully seeded before any invocation starts scanning.
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First level: scan of the raw input (helper defined elsewhere; the meaning of its `false` argument is not visible here).
uint firstLevelScan = uint(nbl_glsl_subgroupInclusiveMin_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask -- presumably the last lane of an emulated subgroup (confirm against the helper).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
// Slot where this invocation publishes its partial result for the next level.
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// Only consumed under `if (false)` below, so effectively unused in this reduction.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass collapses one level; starting from 255 the loop body runs twice (255 -> 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// The level's last invocation and every possibleProp invocation publish their partial result.
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrink the active range by 4x and recompute who takes part in the next level.
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
// Reload from this invocation's scan slot, view it as int again, and scan.
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = uint(nbl_glsl_subgroupInclusiveMin_impl(false, int(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// One more level if the remaining range is still at least 4 wide (15 >= 4 here, leaving lastInvocationInLevel == 3).
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = uint(nbl_glsl_subgroupInclusiveMin_impl(false, int(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand every invocation the scan value held by invocation lastInvocationInLevel, converted back to int.
return int(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Workgroup-wide minimum of `val` (float overload), hard-wired to 256 invocations (lastInvocation = 255).
// Works level by level through nbl_glsl_workgroupArithmeticScratchShared: every level runs
// nbl_glsl_subgroupInclusiveMin_impl, publishes partial results and shrinks the active range by 4x.
// Floats are kept in the uint scratch array as raw bits (floatBitsToUint / uintBitsToFloat).
// The body still calls barrier() between its phases; nbl_glsl_workgroupMin is the wrapper that
// additionally puts a barrier() before and after this call.
// Returns the `scan` value broadcast from invocation `lastInvocationInLevel`.
float nbl_glsl_workgroupMin_noBarriers(in float val) | |
{ | |
// loMask == 3: selects the low two bits of the invocation index.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Sentinel initial value; passed to the helper below (what the helper does with it is not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed the scratch: this invocation's value goes to its scan slot ...
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val); | |
const uint halfMask = loMask >> 1u; | |
// ... and the identity for min, written as (1.f / 0.f) and intended as +infinity, goes to a second slot derived from the invocation index.
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint((1.f / 0.f)); | |
// 256 < 2 is never true, so this extra clearing loop is dead code in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint((1.f / 0.f)); | |
} | |
// Scratch must be fully seeded before any invocation starts scanning.
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First level: scan of the raw input (helper defined elsewhere; the meaning of its `false` argument is not visible here).
uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveMin_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask -- presumably the last lane of an emulated subgroup (confirm against the helper).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
// Slot where this invocation publishes its partial result for the next level.
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// Only consumed under `if (false)` below, so effectively unused in this reduction.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass collapses one level; starting from 255 the loop body runs twice (255 -> 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// The level's last invocation and every possibleProp invocation publish their partial result.
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrink the active range by 4x and recompute who takes part in the next level.
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
// Reload from this invocation's scan slot and scan again.
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMin_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// One more level if the remaining range is still at least 4 wide (15 >= 4 here, leaving lastInvocationInLevel == 3).
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMin_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand every invocation the scan value held by invocation lastInvocationInLevel, reinterpreted as float.
return uintBitsToFloat(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Barrier-guarded workgroup minimum (uint overload): one barrier() before the
// reduction, one after it, then the reduced value is returned.
uint nbl_glsl_workgroupMin(in uint val)
{
    barrier();
    const uint smallest = nbl_glsl_workgroupMin_noBarriers(val);
    barrier();
    return smallest;
}
// Barrier-guarded workgroup minimum (int overload): one barrier() before the
// reduction, one after it, then the reduced value is returned.
int nbl_glsl_workgroupMin(in int val)
{
    barrier();
    const int smallest = nbl_glsl_workgroupMin_noBarriers(val);
    barrier();
    return smallest;
}
// Barrier-guarded workgroup minimum (float overload): one barrier() before the
// reduction, one after it, then the reduced value is returned.
float nbl_glsl_workgroupMin(in float val)
{
    barrier();
    const float smallest = nbl_glsl_workgroupMin_noBarriers(val);
    barrier();
    return smallest;
}
// Workgroup-wide maximum of `val` (uint overload), hard-wired to 256 invocations (lastInvocation = 255).
// Works level by level through nbl_glsl_workgroupArithmeticScratchShared: every level runs
// nbl_glsl_subgroupInclusiveMax_impl, publishes partial results and shrinks the active range by 4x.
// Values pass through nbl_glsl_identityFunction on their way into and out of the uint scratch array.
// The body still calls barrier() between its phases; nbl_glsl_workgroupMax is the wrapper that
// additionally puts a barrier() before and after this call.
// Returns the `scan` value broadcast from invocation `lastInvocationInLevel`.
uint nbl_glsl_workgroupMax_noBarriers(in uint val) | |
{ | |
// loMask == 3: selects the low two bits of the invocation index.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Sentinel initial value; passed to the helper below (what the helper does with it is not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed the scratch: this invocation's value goes to its scan slot ...
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
// ... and the identity for unsigned max (0u) goes to a second slot derived from the invocation index.
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0u); | |
// 256 < 2 is never true, so this extra clearing loop is dead code in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0u); | |
} | |
// Scratch must be fully seeded before any invocation starts scanning.
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First level: scan of the raw input (helper defined elsewhere; the meaning of its `false` argument is not visible here).
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMax_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask -- presumably the last lane of an emulated subgroup (confirm against the helper).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
// Slot where this invocation publishes its partial result for the next level.
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// Only consumed under `if (false)` below, so effectively unused in this reduction.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass collapses one level; starting from 255 the loop body runs twice (255 -> 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// The level's last invocation and every possibleProp invocation publish their partial result.
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrink the active range by 4x and recompute who takes part in the next level.
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
// Reload from this invocation's scan slot and scan again.
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMax_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// One more level if the remaining range is still at least 4 wide (15 >= 4 here, leaving lastInvocationInLevel == 3).
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMax_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand every invocation the scan value held by invocation lastInvocationInLevel.
return nbl_glsl_identityFunction(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Workgroup-wide maximum of `val` (int overload), hard-wired to 256 invocations (lastInvocation = 255).
// Works level by level through nbl_glsl_workgroupArithmeticScratchShared: every level runs
// nbl_glsl_subgroupInclusiveMax_impl, publishes partial results and shrinks the active range by 4x.
// Ints are converted with uint(...) when written to the uint scratch array and with int(...) when read back.
// The body still calls barrier() between its phases; nbl_glsl_workgroupMax is the wrapper that
// additionally puts a barrier() before and after this call.
// Returns the `scan` value broadcast from invocation `lastInvocationInLevel`.
int nbl_glsl_workgroupMax_noBarriers(in int val) | |
{ | |
// loMask == 3: selects the low two bits of the invocation index.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Sentinel initial value; passed to the helper below (what the helper does with it is not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed the scratch: this invocation's value goes to its scan slot ...
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(val); | |
const uint halfMask = loMask >> 1u; | |
// ... and the identity for signed max (-2147483648, intended as the smallest int) goes to a second slot derived from the invocation index.
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = uint(-2147483648); | |
// 256 < 2 is never true, so this extra clearing loop is dead code in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = uint(-2147483648); | |
} | |
// Scratch must be fully seeded before any invocation starts scanning.
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First level: scan of the raw input (helper defined elsewhere; the meaning of its `false` argument is not visible here).
uint firstLevelScan = uint(nbl_glsl_subgroupInclusiveMax_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask -- presumably the last lane of an emulated subgroup (confirm against the helper).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
// Slot where this invocation publishes its partial result for the next level.
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// Only consumed under `if (false)` below, so effectively unused in this reduction.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass collapses one level; starting from 255 the loop body runs twice (255 -> 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// The level's last invocation and every possibleProp invocation publish their partial result.
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrink the active range by 4x and recompute who takes part in the next level.
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
// Reload from this invocation's scan slot, view it as int again, and scan.
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = uint(nbl_glsl_subgroupInclusiveMax_impl(false, int(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// One more level if the remaining range is still at least 4 wide (15 >= 4 here, leaving lastInvocationInLevel == 3).
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = uint(nbl_glsl_subgroupInclusiveMax_impl(false, int(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand every invocation the scan value held by invocation lastInvocationInLevel, converted back to int.
return int(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Workgroup-wide maximum of `val` (float overload), hard-wired to 256 invocations (lastInvocation = 255).
// Works level by level through nbl_glsl_workgroupArithmeticScratchShared: every level runs
// nbl_glsl_subgroupInclusiveMax_impl, publishes partial results and shrinks the active range by 4x.
// Floats are kept in the uint scratch array as raw bits (floatBitsToUint / uintBitsToFloat).
// The body still calls barrier() between its phases; nbl_glsl_workgroupMax is the wrapper that
// additionally puts a barrier() before and after this call.
// Returns the `scan` value broadcast from invocation `lastInvocationInLevel`.
float nbl_glsl_workgroupMax_noBarriers(in float val) | |
{ | |
// loMask == 3: selects the low two bits of the invocation index.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Sentinel initial value; passed to the helper below (what the helper does with it is not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed the scratch: this invocation's value goes to its scan slot ...
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val); | |
const uint halfMask = loMask >> 1u; | |
// ... and the identity for max, written as -(1.f / 0.f) and intended as -infinity, goes to a second slot derived from the invocation index.
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint(-(1.f / 0.f)); | |
// 256 < 2 is never true, so this extra clearing loop is dead code in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint(-(1.f / 0.f)); | |
} | |
// Scratch must be fully seeded before any invocation starts scanning.
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First level: scan of the raw input (helper defined elsewhere; the meaning of its `false` argument is not visible here).
uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveMax_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask -- presumably the last lane of an emulated subgroup (confirm against the helper).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
// Slot where this invocation publishes its partial result for the next level.
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// Only consumed under `if (false)` below, so effectively unused in this reduction.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Each pass collapses one level; starting from 255 the loop body runs twice (255 -> 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// The level's last invocation and every possibleProp invocation publish their partial result.
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrink the active range by 4x and recompute who takes part in the next level.
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
// Reload from this invocation's scan slot and scan again.
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMax_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (false) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// One more level if the remaining range is still at least 4 wide (15 >= 4 here, leaving lastInvocationInLevel == 3).
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMax_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (false) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
}; | |
barrier(); | |
// Hand every invocation the scan value held by invocation lastInvocationInLevel, reinterpreted as float.
return uintBitsToFloat(nbl_glsl_workgroupBroadcast_noBarriers(scan, lastInvocationInLevel)); | |
} | |
// Barrier-guarded workgroup maximum (uint overload): one barrier() before the
// reduction, one after it, then the reduced value is returned.
uint nbl_glsl_workgroupMax(in uint val)
{
    barrier();
    const uint largest = nbl_glsl_workgroupMax_noBarriers(val);
    barrier();
    return largest;
}
// Barrier-guarded workgroup maximum (int overload): one barrier() before the
// reduction, one after it, then the reduced value is returned.
int nbl_glsl_workgroupMax(in int val)
{
    barrier();
    const int largest = nbl_glsl_workgroupMax_noBarriers(val);
    barrier();
    return largest;
}
// Barrier-guarded workgroup maximum (float overload): one barrier() before the
// reduction, one after it, then the reduced value is returned.
float nbl_glsl_workgroupMax(in float val)
{
    barrier();
    const float largest = nbl_glsl_workgroupMax_noBarriers(val);
    barrier();
    return largest;
}
# 186 "../../../../nbl/builtin/glsl/workgroup/arithmetic.glsl" | |
// Workgroup-wide inclusive bitwise-AND scan of `val` (uint overload), hard-wired to 256 invocations.
// Phase 1 (up-sweep): like the reductions in this file, each level runs nbl_glsl_subgroupInclusiveAnd_impl
// on the partial results of the previous level, but here every level's scan is also kept in
// nbl_glsl_workgroupArithmeticScratchShared at scanStoreIndex.
// Phase 2 (down-sweep): the stored levels are combined with nbl_glsl_and from the top level downwards,
// and the resulting prefix of the preceding groups is folded into each invocation's first-level scan.
// The body still calls barrier() between its phases; nbl_glsl_workgroupInclusiveAnd is the wrapper that
// calls barrier() before invoking this function.
// Returns this invocation's inclusive scan value.
uint nbl_glsl_workgroupInclusiveAnd_noBarriers(in uint val) | |
{ | |
// loMask == 3: selects the low two bits of the invocation index.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Sentinel initial value; passed to the helper below (what the helper does with it is not visible here).
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed the scratch: this invocation's value goes to its scan slot ...
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
// ... and the identity for AND (all bits set) goes to a second slot derived from the invocation index.
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0xffFFffFFu); | |
// 256 < 2 is never true, so this extra clearing loop is dead code in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0xffFFffFFu); | |
} | |
// Scratch must be fully seeded before any invocation starts scanning.
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First level: scan of the raw input (helper defined elsewhere; the meaning of its `false` argument is not visible here).
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAnd_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when pseudoSubgroupInvocation equals loMask -- presumably the last lane of an emulated subgroup (confirm against the helper).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
// Slot where this invocation publishes its partial result for the next level.
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
// Slot where this invocation saves its scan of each higher level for the down-sweep.
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Up-sweep: each pass collapses one level; starting from 255 the loop body runs twice (255 -> 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// The level's last invocation and every possibleProp invocation publish their partial result.
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrink the active range by 4x and recompute who takes part in the next level.
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAnd_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
// Keep this level's scan for the down-sweep.
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Move on to the storage region of the next level (one slot per invocation of the level just stored).
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// One more level if the remaining range is still at least 4 wide (15 >= 4 here, leaving lastInvocationInLevel == 3).
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAnd_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Down-sweep: only needed when the workgroup spans more than one group of 4 (255 >= 4 here).
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// Unsigned difference (wraps around); added to scanLoadIndex below to reach the slot indexed by pseudoSubgroupID.
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// With lastInvocation == 255, findMSB gives 7, so logShift takes the values 4 and then 2.
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
// Combine the higher level's value into this level's stored scan, in place.
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(nbl_glsl_and(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations outside the first group of 4 fold in the value stored one slot before their group's entry.
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_and(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan))); | |
} | |
} | |
// The `if (false)` branch (shift through scratch to produce an exclusive result) is never taken in this expansion.
if (false) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 0xffFFffFFu; | |
} | |
else | |
return nbl_glsl_identityFunction(firstLevelScan); | |
; | |
} | |
// int overload of nbl_glsl_workgroupInclusiveAnd_noBarriers.
// Converts `val` to uint, forwards it to the uint overload and converts the result back to int.
int nbl_glsl_workgroupInclusiveAnd_noBarriers(in int val)
{
    const uint unsignedInput = uint(val);
    const uint unsignedScan = nbl_glsl_workgroupInclusiveAnd_noBarriers(unsignedInput);
    return int(unsignedScan);
}
// float overload of nbl_glsl_workgroupInclusiveAnd_noBarriers.
// The bits of `val` are reinterpreted as a uint (floatBitsToUint), passed to the uint overload,
// and the resulting bits are reinterpreted back as a float (uintBitsToFloat).
float nbl_glsl_workgroupInclusiveAnd_noBarriers(in float val)
{
    const uint rawBits = floatBitsToUint(val);
    const uint scannedBits = nbl_glsl_workgroupInclusiveAnd_noBarriers(rawBits);
    return uintBitsToFloat(scannedBits);
}
// Barrier-bracketed uint entry point: issues barrier(), calls the uint _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
uint nbl_glsl_workgroupInclusiveAnd(in uint val)
{
    uint scanned;

    barrier();
    scanned = nbl_glsl_workgroupInclusiveAnd_noBarriers(val);
    barrier();

    return scanned;
}
// Barrier-bracketed int entry point: issues barrier(), calls the int _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
int nbl_glsl_workgroupInclusiveAnd(in int val)
{
    int scanned;

    barrier();
    scanned = nbl_glsl_workgroupInclusiveAnd_noBarriers(val);
    barrier();

    return scanned;
}
// Barrier-bracketed float entry point: issues barrier(), calls the float _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
float nbl_glsl_workgroupInclusiveAnd(in float val)
{
    float scanned;

    barrier();
    scanned = nbl_glsl_workgroupInclusiveAnd_noBarriers(val);
    barrier();

    return scanned;
}
// Exclusive bitwise-AND scan of `val` across the workgroup, indexed by gl_LocalInvocationIndex.
//
// The item count is hard-coded to 256 (lastInvocation = 255) and the work is organised in groups of
// (0x1 << 2) = 4 invocations through the nbl_glsl_subgroup* helpers, which are defined outside this
// block. Intermediate values travel through nbl_glsl_workgroupArithmeticScratchShared.
//
// Parameters:
//   val - this invocation's contribution.
// Returns:
//   the AND of the contributions of all invocations with a lower index; invocation 0 (which has no
//   predecessor) receives the AND identity 0xffFFffFFu.
//
// The function issues barrier() internally; it does not issue one on entry or after its last read of
// the scratch array.
uint nbl_glsl_workgroupExclusiveAnd_noBarriers(in uint val)
{
    // loMask = 3 selects an invocation's position inside its group of 4.
    const uint loMask = (0x1 << 2) - 1u;
    const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex);
    const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex);
    const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation);
    // 0xdeadbeefu is only a placeholder initial value.
    // NOTE(review): presumably the helper below writes lastLoadOffset (out parameter) - confirm.
    uint lastLoadOffset = 0xdeadbeefu;
    const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset);
    {
        // Seed the scratch array: own value at the store slot, AND identity at a second slot
        // derived from the invocation index.
        nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val);
        const uint halfMask = loMask >> 1u;
        nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0xffFFffFFu);
        // Constant comparison (256 < 2) is false, so this extra clearing loop never runs in this expansion.
        if (256 < ((0x1 << 2) >> 1))
        {
            const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1);
            for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256)
                nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0xffFFffFFu);
        }
        barrier();
    }
    const uint lastInvocation = 256 - 1u;
    uint lastInvocationInLevel = lastInvocation;
    // First level: inclusive scan via the subgroup helper.
    uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAnd_impl(false, val));
    uint scan = firstLevelScan;
    // True for the invocation occupying the last position (3) of its group of 4.
    const bool possibleProp = pseudoSubgroupInvocation == loMask;
    // Index of the group of 4 this invocation belongs to.
    const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2;
    const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID);
    uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u;
    bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel;
    // Reduction passes while the level still has at least 16 elements.
    // With lastInvocation = 255 this runs twice (255 -> 63 -> 15).
    while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2))
    {
        barrier();
        if (participate)
        {
            // The last element of each group of 4 (and the last element of the level) publishes its scan value.
            if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp)))
                nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan;
        }
        barrier();
        // The next level has a quarter of the elements.
        participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2);
        if (participate)
        {
            const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset];
            scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAnd_impl(false, nbl_glsl_identityFunction(prevLevelScan)));
            if (true)
                nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan;
        }
        // Advance past the slots used by this level (done by every invocation, participating or not).
        if (true)
            scanStoreIndex += lastInvocationInLevel + 1u;
    }
    // One more pass of the same shape (15 -> 3 here), but scanStoreIndex is not advanced afterwards.
    if (lastInvocationInLevel >= (0x1 << 2))
    {
        barrier();
        if (participate)
        {
            if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp)))
                nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan;
        }
        barrier();
        participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2);
        if (participate)
        {
            const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset];
            scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAnd_impl(false, nbl_glsl_identityFunction(prevLevelScan)));
            if (true)
                nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan;
        }
    }
    barrier();
    // Walk back down the levels, folding each higher level's stored value into the level below it.
    if (lastInvocation >= (0x1 << 2))
    {
        uint scanLoadIndex = scanStoreIndex + (0x1 << 2);
        const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2);
        // (index >> 2) is always smaller than (index + 4), so this unsigned subtraction wraps;
        // it is only ever used as an addend to scanLoadIndex, where the wrap cancels out.
        const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex;
        // For lastInvocation = 255: findMSB = 7, so logShift starts at 4 and takes the values 4, 2.
        for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2)
        {
            lastInvocationInLevel = lastInvocation >> logShift;
            barrier();
            const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u);
            if (shiftedInvocationIndex <= lastInvocationInLevel)
                nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(nbl_glsl_and(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex])));
            scanLoadIndex = currentLevelIndex;
        }
        barrier();
        // Every group of 4 except the first folds in the value stored for the groups before it.
        if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u)
        {
            const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u];
            firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_and(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan)));
        }
    }
    // Exclusive tail: shift the inclusive results one slot to the right through the scratch array.
    if (true)
    {
        if (gl_LocalInvocationIndex < lastInvocation)
            nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan;
        barrier();
        // all(), not any(): invocation 0 has no predecessor and must get the identity. With any() the
        // second component (index <= lastInvocation) is true for invocation 0, which made the identity
        // branch unreachable and returned whatever scratch slot 0 happened to contain.
        return all(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 0xffFFffFFu;
    }
    else
        return nbl_glsl_identityFunction(firstLevelScan);
    ;
}
// int overload of nbl_glsl_workgroupExclusiveAnd_noBarriers.
// Converts `val` to uint, forwards it to the uint overload and converts the result back to int.
int nbl_glsl_workgroupExclusiveAnd_noBarriers(in int val)
{
    const uint unsignedInput = uint(val);
    const uint unsignedScan = nbl_glsl_workgroupExclusiveAnd_noBarriers(unsignedInput);
    return int(unsignedScan);
}
// float overload of nbl_glsl_workgroupExclusiveAnd_noBarriers.
// The bits of `val` are reinterpreted as a uint (floatBitsToUint), passed to the uint overload,
// and the resulting bits are reinterpreted back as a float (uintBitsToFloat).
float nbl_glsl_workgroupExclusiveAnd_noBarriers(in float val)
{
    const uint rawBits = floatBitsToUint(val);
    const uint scannedBits = nbl_glsl_workgroupExclusiveAnd_noBarriers(rawBits);
    return uintBitsToFloat(scannedBits);
}
// Barrier-bracketed uint entry point: issues barrier(), calls the uint _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
uint nbl_glsl_workgroupExclusiveAnd(in uint val)
{
    uint scanned;

    barrier();
    scanned = nbl_glsl_workgroupExclusiveAnd_noBarriers(val);
    barrier();

    return scanned;
}
// Barrier-bracketed int entry point: issues barrier(), calls the int _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
int nbl_glsl_workgroupExclusiveAnd(in int val)
{
    int scanned;

    barrier();
    scanned = nbl_glsl_workgroupExclusiveAnd_noBarriers(val);
    barrier();

    return scanned;
}
// Barrier-bracketed float entry point: issues barrier(), calls the float _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
float nbl_glsl_workgroupExclusiveAnd(in float val)
{
    float scanned;

    barrier();
    scanned = nbl_glsl_workgroupExclusiveAnd_noBarriers(val);
    barrier();

    return scanned;
}
// Inclusive bitwise-OR scan of `val` across the workgroup, indexed by gl_LocalInvocationIndex.
//
// The item count is hard-coded to 256 (lastInvocation = 255) and the work is organised in groups of
// (0x1 << 2) = 4 invocations through the nbl_glsl_subgroup* helpers, which are defined outside this
// block. Intermediate values travel through nbl_glsl_workgroupArithmeticScratchShared.
//
// Parameters:
//   val - this invocation's contribution.
// Returns:
//   firstLevelScan after the higher-level values have been folded in; intended to be the OR of the
//   contributions of invocations 0 .. gl_LocalInvocationIndex (correctness also depends on the
//   helpers, which are not visible here).
//
// The function issues barrier() internally; it does not issue one on entry or after its last read of
// the scratch array.
uint nbl_glsl_workgroupInclusiveOr_noBarriers(in uint val) | |
{ | |
// loMask = 3 selects an invocation's position inside its group of 4.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is only a placeholder initial value.
// NOTE(review): presumably the helper below writes lastLoadOffset (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed the scratch array: own value at the store slot, OR identity (0u) at a second slot
// derived from the invocation index.
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0u); | |
// Constant comparison (256 < 2) is false, so this extra clearing loop never runs in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0u); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First level: inclusive scan via the subgroup helper.
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveOr_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True for the invocation occupying the last position (3) of its group of 4.
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
// Index of the group of 4 this invocation belongs to.
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Reduction passes while the level still has at least 16 elements.
// With lastInvocation = 255 this runs twice (255 -> 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// The last element of each group of 4 (and the last element of the level) publishes its scan value.
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// The next level has a quarter of the elements.
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveOr_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Advance past the slots used by this level (done by every invocation, participating or not).
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// One more pass of the same shape (15 -> 3 here), but scanStoreIndex is not advanced afterwards.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveOr_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Walk back down the levels, folding each higher level's stored value into the level below it.
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// (index >> 2) is always smaller than (index + 4), so this unsigned subtraction wraps;
// it is only ever used as an addend to scanLoadIndex, where the wrap cancels out.
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// For lastInvocation = 255: findMSB = 7, so logShift starts at 4 and takes the values 4, 2.
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(nbl_glsl_or(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Every group of 4 except the first folds in the value stored for the groups before it.
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_or(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan))); | |
} | |
} | |
// The exclusive-scan tail is compiled out here (constant false); the inclusive result is returned directly.
if (false) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 0u; | |
} | |
else | |
return nbl_glsl_identityFunction(firstLevelScan); | |
; | |
} | |
// int overload of nbl_glsl_workgroupInclusiveOr_noBarriers.
// Converts `val` to uint, forwards it to the uint overload and converts the result back to int.
int nbl_glsl_workgroupInclusiveOr_noBarriers(in int val)
{
    const uint unsignedInput = uint(val);
    const uint unsignedScan = nbl_glsl_workgroupInclusiveOr_noBarriers(unsignedInput);
    return int(unsignedScan);
}
// float overload of nbl_glsl_workgroupInclusiveOr_noBarriers.
// The bits of `val` are reinterpreted as a uint (floatBitsToUint), passed to the uint overload,
// and the resulting bits are reinterpreted back as a float (uintBitsToFloat).
float nbl_glsl_workgroupInclusiveOr_noBarriers(in float val)
{
    const uint rawBits = floatBitsToUint(val);
    const uint scannedBits = nbl_glsl_workgroupInclusiveOr_noBarriers(rawBits);
    return uintBitsToFloat(scannedBits);
}
// Barrier-bracketed uint entry point: issues barrier(), calls the uint _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
uint nbl_glsl_workgroupInclusiveOr(in uint val)
{
    uint scanned;

    barrier();
    scanned = nbl_glsl_workgroupInclusiveOr_noBarriers(val);
    barrier();

    return scanned;
}
// Barrier-bracketed int entry point: issues barrier(), calls the int _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
int nbl_glsl_workgroupInclusiveOr(in int val)
{
    int scanned;

    barrier();
    scanned = nbl_glsl_workgroupInclusiveOr_noBarriers(val);
    barrier();

    return scanned;
}
// Barrier-bracketed float entry point: issues barrier(), calls the float _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
float nbl_glsl_workgroupInclusiveOr(in float val)
{
    float scanned;

    barrier();
    scanned = nbl_glsl_workgroupInclusiveOr_noBarriers(val);
    barrier();

    return scanned;
}
// Exclusive bitwise-OR scan of `val` across the workgroup, indexed by gl_LocalInvocationIndex.
//
// The item count is hard-coded to 256 (lastInvocation = 255) and the work is organised in groups of
// (0x1 << 2) = 4 invocations through the nbl_glsl_subgroup* helpers, which are defined outside this
// block. Intermediate values travel through nbl_glsl_workgroupArithmeticScratchShared.
//
// Parameters:
//   val - this invocation's contribution.
// Returns:
//   the OR of the contributions of all invocations with a lower index; invocation 0 (which has no
//   predecessor) receives the OR identity 0u.
//
// The function issues barrier() internally; it does not issue one on entry or after its last read of
// the scratch array.
uint nbl_glsl_workgroupExclusiveOr_noBarriers(in uint val)
{
    // loMask = 3 selects an invocation's position inside its group of 4.
    const uint loMask = (0x1 << 2) - 1u;
    const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex);
    const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex);
    const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation);
    // 0xdeadbeefu is only a placeholder initial value.
    // NOTE(review): presumably the helper below writes lastLoadOffset (out parameter) - confirm.
    uint lastLoadOffset = 0xdeadbeefu;
    const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset);
    {
        // Seed the scratch array: own value at the store slot, OR identity (0u) at a second slot
        // derived from the invocation index.
        nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val);
        const uint halfMask = loMask >> 1u;
        nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0u);
        // Constant comparison (256 < 2) is false, so this extra clearing loop never runs in this expansion.
        if (256 < ((0x1 << 2) >> 1))
        {
            const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1);
            for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256)
                nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0u);
        }
        barrier();
    }
    const uint lastInvocation = 256 - 1u;
    uint lastInvocationInLevel = lastInvocation;
    // First level: inclusive scan via the subgroup helper.
    uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveOr_impl(false, val));
    uint scan = firstLevelScan;
    // True for the invocation occupying the last position (3) of its group of 4.
    const bool possibleProp = pseudoSubgroupInvocation == loMask;
    // Index of the group of 4 this invocation belongs to.
    const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2;
    const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID);
    uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u;
    bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel;
    // Reduction passes while the level still has at least 16 elements.
    // With lastInvocation = 255 this runs twice (255 -> 63 -> 15).
    while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2))
    {
        barrier();
        if (participate)
        {
            // The last element of each group of 4 (and the last element of the level) publishes its scan value.
            if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp)))
                nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan;
        }
        barrier();
        // The next level has a quarter of the elements.
        participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2);
        if (participate)
        {
            const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset];
            scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveOr_impl(false, nbl_glsl_identityFunction(prevLevelScan)));
            if (true)
                nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan;
        }
        // Advance past the slots used by this level (done by every invocation, participating or not).
        if (true)
            scanStoreIndex += lastInvocationInLevel + 1u;
    }
    // One more pass of the same shape (15 -> 3 here), but scanStoreIndex is not advanced afterwards.
    if (lastInvocationInLevel >= (0x1 << 2))
    {
        barrier();
        if (participate)
        {
            if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp)))
                nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan;
        }
        barrier();
        participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2);
        if (participate)
        {
            const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset];
            scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveOr_impl(false, nbl_glsl_identityFunction(prevLevelScan)));
            if (true)
                nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan;
        }
    }
    barrier();
    // Walk back down the levels, folding each higher level's stored value into the level below it.
    if (lastInvocation >= (0x1 << 2))
    {
        uint scanLoadIndex = scanStoreIndex + (0x1 << 2);
        const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2);
        // (index >> 2) is always smaller than (index + 4), so this unsigned subtraction wraps;
        // it is only ever used as an addend to scanLoadIndex, where the wrap cancels out.
        const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex;
        // For lastInvocation = 255: findMSB = 7, so logShift starts at 4 and takes the values 4, 2.
        for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2)
        {
            lastInvocationInLevel = lastInvocation >> logShift;
            barrier();
            const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u);
            if (shiftedInvocationIndex <= lastInvocationInLevel)
                nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(nbl_glsl_or(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex])));
            scanLoadIndex = currentLevelIndex;
        }
        barrier();
        // Every group of 4 except the first folds in the value stored for the groups before it.
        if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u)
        {
            const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u];
            firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_or(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan)));
        }
    }
    // Exclusive tail: shift the inclusive results one slot to the right through the scratch array.
    if (true)
    {
        if (gl_LocalInvocationIndex < lastInvocation)
            nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan;
        barrier();
        // all(), not any(): invocation 0 has no predecessor and must get the identity. With any() the
        // second component (index <= lastInvocation) is true for invocation 0, which made the identity
        // branch unreachable and returned whatever scratch slot 0 happened to contain.
        return all(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 0u;
    }
    else
        return nbl_glsl_identityFunction(firstLevelScan);
    ;
}
// int overload of nbl_glsl_workgroupExclusiveOr_noBarriers.
// Converts `val` to uint, forwards it to the uint overload and converts the result back to int.
int nbl_glsl_workgroupExclusiveOr_noBarriers(in int val)
{
    const uint unsignedInput = uint(val);
    const uint unsignedScan = nbl_glsl_workgroupExclusiveOr_noBarriers(unsignedInput);
    return int(unsignedScan);
}
// float overload of nbl_glsl_workgroupExclusiveOr_noBarriers.
// The bits of `val` are reinterpreted as a uint (floatBitsToUint), passed to the uint overload,
// and the resulting bits are reinterpreted back as a float (uintBitsToFloat).
float nbl_glsl_workgroupExclusiveOr_noBarriers(in float val)
{
    const uint rawBits = floatBitsToUint(val);
    const uint scannedBits = nbl_glsl_workgroupExclusiveOr_noBarriers(rawBits);
    return uintBitsToFloat(scannedBits);
}
// Barrier-bracketed uint entry point: issues barrier(), calls the uint _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
uint nbl_glsl_workgroupExclusiveOr(in uint val)
{
    uint scanned;

    barrier();
    scanned = nbl_glsl_workgroupExclusiveOr_noBarriers(val);
    barrier();

    return scanned;
}
// Barrier-bracketed int entry point: issues barrier(), calls the int _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
int nbl_glsl_workgroupExclusiveOr(in int val)
{
    int scanned;

    barrier();
    scanned = nbl_glsl_workgroupExclusiveOr_noBarriers(val);
    barrier();

    return scanned;
}
// Barrier-bracketed float entry point: issues barrier(), calls the float _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
float nbl_glsl_workgroupExclusiveOr(in float val)
{
    float scanned;

    barrier();
    scanned = nbl_glsl_workgroupExclusiveOr_noBarriers(val);
    barrier();

    return scanned;
}
// Inclusive bitwise-XOR scan of `val` across the workgroup, indexed by gl_LocalInvocationIndex.
//
// The item count is hard-coded to 256 (lastInvocation = 255) and the work is organised in groups of
// (0x1 << 2) = 4 invocations through the nbl_glsl_subgroup* helpers, which are defined outside this
// block. Intermediate values travel through nbl_glsl_workgroupArithmeticScratchShared.
//
// Parameters:
//   val - this invocation's contribution.
// Returns:
//   firstLevelScan after the higher-level values have been folded in; intended to be the XOR of the
//   contributions of invocations 0 .. gl_LocalInvocationIndex (correctness also depends on the
//   helpers, which are not visible here).
//
// The function issues barrier() internally; it does not issue one on entry or after its last read of
// the scratch array.
uint nbl_glsl_workgroupInclusiveXor_noBarriers(in uint val) | |
{ | |
// loMask = 3 selects an invocation's position inside its group of 4.
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// 0xdeadbeefu is only a placeholder initial value.
// NOTE(review): presumably the helper below writes lastLoadOffset (out parameter) - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed the scratch array: own value at the store slot, XOR identity (0u) at a second slot
// derived from the invocation index.
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0u); | |
// Constant comparison (256 < 2) is false, so this extra clearing loop never runs in this expansion.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0u); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First level: inclusive scan via the subgroup helper.
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveXor_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True for the invocation occupying the last position (3) of its group of 4.
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
// Index of the group of 4 this invocation belongs to.
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Reduction passes while the level still has at least 16 elements.
// With lastInvocation = 255 this runs twice (255 -> 63 -> 15).
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// The last element of each group of 4 (and the last element of the level) publishes its scan value.
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// The next level has a quarter of the elements.
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveXor_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Advance past the slots used by this level (done by every invocation, participating or not).
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// One more pass of the same shape (15 -> 3 here), but scanStoreIndex is not advanced afterwards.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveXor_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Walk back down the levels, folding each higher level's stored value into the level below it.
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// (index >> 2) is always smaller than (index + 4), so this unsigned subtraction wraps;
// it is only ever used as an addend to scanLoadIndex, where the wrap cancels out.
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// For lastInvocation = 255: findMSB = 7, so logShift starts at 4 and takes the values 4, 2.
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(nbl_glsl_xor(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Every group of 4 except the first folds in the value stored for the groups before it.
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_xor(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan))); | |
} | |
} | |
// The exclusive-scan tail is compiled out here (constant false); the inclusive result is returned directly.
if (false) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 0u; | |
} | |
else | |
return nbl_glsl_identityFunction(firstLevelScan); | |
; | |
} | |
// int overload of nbl_glsl_workgroupInclusiveXor_noBarriers.
// Converts `val` to uint, forwards it to the uint overload and converts the result back to int.
int nbl_glsl_workgroupInclusiveXor_noBarriers(in int val)
{
    const uint unsignedInput = uint(val);
    const uint unsignedScan = nbl_glsl_workgroupInclusiveXor_noBarriers(unsignedInput);
    return int(unsignedScan);
}
// float overload of nbl_glsl_workgroupInclusiveXor_noBarriers.
// The bits of `val` are reinterpreted as a uint (floatBitsToUint), passed to the uint overload,
// and the resulting bits are reinterpreted back as a float (uintBitsToFloat).
float nbl_glsl_workgroupInclusiveXor_noBarriers(in float val)
{
    const uint rawBits = floatBitsToUint(val);
    const uint scannedBits = nbl_glsl_workgroupInclusiveXor_noBarriers(rawBits);
    return uintBitsToFloat(scannedBits);
}
// Barrier-bracketed uint entry point: issues barrier(), calls the uint _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
uint nbl_glsl_workgroupInclusiveXor(in uint val)
{
    uint scanned;

    barrier();
    scanned = nbl_glsl_workgroupInclusiveXor_noBarriers(val);
    barrier();

    return scanned;
}
// Barrier-bracketed int entry point: issues barrier(), calls the int _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
int nbl_glsl_workgroupInclusiveXor(in int val)
{
    int scanned;

    barrier();
    scanned = nbl_glsl_workgroupInclusiveXor_noBarriers(val);
    barrier();

    return scanned;
}
// Barrier-bracketed float entry point: issues barrier(), calls the float _noBarriers overload,
// issues barrier() again and returns the call's result unchanged.
float nbl_glsl_workgroupInclusiveXor(in float val)
{
    float scanned;

    barrier();
    scanned = nbl_glsl_workgroupInclusiveXor_noBarriers(val);
    barrier();

    return scanned;
}
// Exclusive bitwise-XOR scan of `val` across the workgroup, indexed by gl_LocalInvocationIndex.
//
// The item count is hard-coded to 256 (lastInvocation = 255) and the work is organised in groups of
// (0x1 << 2) = 4 invocations through the nbl_glsl_subgroup* helpers, which are defined outside this
// block. Intermediate values travel through nbl_glsl_workgroupArithmeticScratchShared.
//
// Parameters:
//   val - this invocation's contribution.
// Returns:
//   the XOR of the contributions of all invocations with a lower index; invocation 0 (which has no
//   predecessor) receives the XOR identity 0u.
//
// The function issues barrier() internally; it does not issue one on entry or after its last read of
// the scratch array.
uint nbl_glsl_workgroupExclusiveXor_noBarriers(in uint val)
{
    // loMask = 3 selects an invocation's position inside its group of 4.
    const uint loMask = (0x1 << 2) - 1u;
    const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex);
    const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex);
    const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation);
    // 0xdeadbeefu is only a placeholder initial value.
    // NOTE(review): presumably the helper below writes lastLoadOffset (out parameter) - confirm.
    uint lastLoadOffset = 0xdeadbeefu;
    const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset);
    {
        // Seed the scratch array: own value at the store slot, XOR identity (0u) at a second slot
        // derived from the invocation index.
        nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val);
        const uint halfMask = loMask >> 1u;
        nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0u);
        // Constant comparison (256 < 2) is false, so this extra clearing loop never runs in this expansion.
        if (256 < ((0x1 << 2) >> 1))
        {
            const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1);
            for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256)
                nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0u);
        }
        barrier();
    }
    const uint lastInvocation = 256 - 1u;
    uint lastInvocationInLevel = lastInvocation;
    // First level: inclusive scan via the subgroup helper.
    uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveXor_impl(false, val));
    uint scan = firstLevelScan;
    // True for the invocation occupying the last position (3) of its group of 4.
    const bool possibleProp = pseudoSubgroupInvocation == loMask;
    // Index of the group of 4 this invocation belongs to.
    const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2;
    const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID);
    uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u;
    bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel;
    // Reduction passes while the level still has at least 16 elements.
    // With lastInvocation = 255 this runs twice (255 -> 63 -> 15).
    while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2))
    {
        barrier();
        if (participate)
        {
            // The last element of each group of 4 (and the last element of the level) publishes its scan value.
            if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp)))
                nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan;
        }
        barrier();
        // The next level has a quarter of the elements.
        participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2);
        if (participate)
        {
            const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset];
            scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveXor_impl(false, nbl_glsl_identityFunction(prevLevelScan)));
            if (true)
                nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan;
        }
        // Advance past the slots used by this level (done by every invocation, participating or not).
        if (true)
            scanStoreIndex += lastInvocationInLevel + 1u;
    }
    // One more pass of the same shape (15 -> 3 here), but scanStoreIndex is not advanced afterwards.
    if (lastInvocationInLevel >= (0x1 << 2))
    {
        barrier();
        if (participate)
        {
            if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp)))
                nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan;
        }
        barrier();
        participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2);
        if (participate)
        {
            const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset];
            scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveXor_impl(false, nbl_glsl_identityFunction(prevLevelScan)));
            if (true)
                nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan;
        }
    }
    barrier();
    // Walk back down the levels, folding each higher level's stored value into the level below it.
    if (lastInvocation >= (0x1 << 2))
    {
        uint scanLoadIndex = scanStoreIndex + (0x1 << 2);
        const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2);
        // (index >> 2) is always smaller than (index + 4), so this unsigned subtraction wraps;
        // it is only ever used as an addend to scanLoadIndex, where the wrap cancels out.
        const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex;
        // For lastInvocation = 255: findMSB = 7, so logShift starts at 4 and takes the values 4, 2.
        for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2)
        {
            lastInvocationInLevel = lastInvocation >> logShift;
            barrier();
            const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u);
            if (shiftedInvocationIndex <= lastInvocationInLevel)
                nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(nbl_glsl_xor(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex])));
            scanLoadIndex = currentLevelIndex;
        }
        barrier();
        // Every group of 4 except the first folds in the value stored for the groups before it.
        if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u)
        {
            const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u];
            firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_xor(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan)));
        }
    }
    // Exclusive tail: shift the inclusive results one slot to the right through the scratch array.
    if (true)
    {
        if (gl_LocalInvocationIndex < lastInvocation)
            nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan;
        barrier();
        // all(), not any(): invocation 0 has no predecessor and must get the identity. With any() the
        // second component (index <= lastInvocation) is true for invocation 0, which made the identity
        // branch unreachable and returned whatever scratch slot 0 happened to contain.
        return all(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 0u;
    }
    else
        return nbl_glsl_identityFunction(firstLevelScan);
    ;
}
// int overload: converts `val` to uint, forwards it to the uint overload of this function, | |
// and converts the result back to int. | |
int nbl_glsl_workgroupExclusiveXor_noBarriers(in int val) | |
{ | |
    const uint asUnsigned = uint(val); | |
    const uint scanned = nbl_glsl_workgroupExclusiveXor_noBarriers(asUnsigned); | |
    return int(scanned); | |
} | |
// float overload: feeds the raw bit pattern of `val` (floatBitsToUint) to the uint overload | |
// and reinterprets the returned bits as a float (uintBitsToFloat). | |
float nbl_glsl_workgroupExclusiveXor_noBarriers(in float val) | |
{ | |
    const uint rawBits = floatBitsToUint(val); | |
    const uint scannedBits = nbl_glsl_workgroupExclusiveXor_noBarriers(rawBits); | |
    return uintBitsToFloat(scannedBits); | |
} | |
// Barrier-wrapped entry point (uint): issues barrier() before and after the call to | |
// nbl_glsl_workgroupExclusiveXor_noBarriers and returns that call's result. | |
uint nbl_glsl_workgroupExclusiveXor(in uint val) | |
{ | |
    uint scanned; | |
    barrier(); | |
    scanned = nbl_glsl_workgroupExclusiveXor_noBarriers(val); | |
    barrier(); | |
    return scanned; | |
} | |
// Barrier-wrapped entry point (int): issues barrier() before and after the call to | |
// nbl_glsl_workgroupExclusiveXor_noBarriers and returns that call's result. | |
int nbl_glsl_workgroupExclusiveXor(in int val) | |
{ | |
    int scanned; | |
    barrier(); | |
    scanned = nbl_glsl_workgroupExclusiveXor_noBarriers(val); | |
    barrier(); | |
    return scanned; | |
} | |
// Barrier-wrapped entry point (float): issues barrier() before and after the call to | |
// nbl_glsl_workgroupExclusiveXor_noBarriers and returns that call's result. | |
float nbl_glsl_workgroupExclusiveXor(in float val) | |
{ | |
    float scanned; | |
    barrier(); | |
    scanned = nbl_glsl_workgroupExclusiveXor_noBarriers(val); | |
    barrier(); | |
    return scanned; | |
} | |
// Workgroup inclusive-add of `val`, uint overload. Judging by the name and structure this is an | |
// inclusive prefix sum over the workgroup; the subgroup helpers it calls are defined outside | |
// this chunk, so their exact semantics are not visible here. | |
// Phases, all operating on nbl_glsl_workgroupArithmeticScratchShared: | |
//   1. store `val` and the add identity (0u) into the scratch array, then run the subgroup | |
//      inclusive op on `val` (firstLevelScan); | |
//   2. level by level, selected invocations publish their scan to the scratch array and the | |
//      published values are scanned again; lastInvocationInLevel is shifted right by 2 per level; | |
//   3. walk back through the stored levels, adding higher-level entries into lower-level ones, | |
//      and finally add one higher-level entry into firstLevelScan. | |
// Returns firstLevelScan for this invocation. | |
// Despite the "_noBarriers" suffix the body calls barrier() internally; there is simply none as | |
// the first or last statement (presumably what the suffix refers to). | |
uint nbl_glsl_workgroupInclusiveAdd_noBarriers(in uint val) | |
{ | |
// loMask == 3u, i.e. the low two bits of an invocation index. | |
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Sentinel value handed to the helper below; how the helper uses it is not visible in this chunk. | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Publish this invocation's input at its own scratch slot. | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
// halfMask == 1u. | |
const uint halfMask = loMask >> 1u; | |
// Write the add identity (0u) at index ((i & ~1u) << 2u) | (i & 1u) for this invocation index i. | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0u); | |
// 256 < 2 is constant-false, so this extra clearing loop never runs in this expansion. | |
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0u); | |
} | |
barrier(); | |
} | |
// lastInvocation == 255u. | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAdd_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when the helper-computed pseudoSubgroupInvocation equals loMask (presumably the last | |
// lane of its group of four -- confirm against the helper). | |
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
// Consecutive runs of four invocation indices share one pseudoSubgroupID. | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// lastInvocationInLevel goes 255 -> 63 -> 15 here, so this loop body runs twice. | |
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// Only the level's last invocation, or one flagged possibleProp, publishes its scan. | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrinks the level (>>= 2) and recomputes who participates in one statement. | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
// Re-scan the values published above and keep the result for the later down-pass. | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAdd_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Advance past the entries of the level just stored (executed by every invocation). | |
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// 15 >= 4, so one more level (15 -> 3) is processed; unlike the loop body, scanStoreIndex is | |
// not advanced afterwards. | |
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAdd_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Constant-true (255 >= 4): walk back through the stored levels. | |
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// (index >> 2) is always smaller than (index + 4), so this unsigned subtraction wraps around. | |
// The value is only ever added to scanLoadIndex, relying on modular uint arithmetic. | |
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// logShift starts at (findMSB(255) / 2 - 1) * 2 == 4 and steps 4 -> 2, so the body runs twice | |
// (lastInvocationInLevel == 15, then 63). | |
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
// Add the entry at scanLoadIndex + currentToHighLevel into this level's entry, in place. | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(nbl_glsl_add(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations with pseudoSubgroupID == 0 keep firstLevelScan unchanged; all others add the | |
// entry one slot before scanLoadIndex + currentToHighLevel. | |
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_add(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan))); | |
} | |
} | |
// Constant-false: the shift-by-one tail below is dead code in this inclusive expansion. | |
if (false) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 0u; | |
} | |
else | |
return nbl_glsl_identityFunction(firstLevelScan); | |
; | |
} | |
// int overload: converts `val` to uint, forwards it to the uint overload of this function, | |
// and converts the result back to int. | |
int nbl_glsl_workgroupInclusiveAdd_noBarriers(in int val) | |
{ | |
    const uint unsignedInput = uint(val); | |
    const uint unsignedResult = nbl_glsl_workgroupInclusiveAdd_noBarriers(unsignedInput); | |
    return int(unsignedResult); | |
} | |
// Workgroup inclusive-add of `val`, float overload. Judging by the name and structure this is an | |
// inclusive prefix sum over the workgroup; the subgroup helpers it calls are defined outside | |
// this chunk, so their exact semantics are not visible here. | |
// Values are kept in nbl_glsl_workgroupArithmeticScratchShared as uint bit patterns | |
// (floatBitsToUint on store, uintBitsToFloat before every arithmetic use). | |
// Phases: | |
//   1. store `val` and the add identity (0.0) into the scratch array, then run the subgroup | |
//      inclusive op on `val` (firstLevelScan); | |
//   2. level by level, selected invocations publish their scan to the scratch array and the | |
//      published values are scanned again; lastInvocationInLevel is shifted right by 2 per level; | |
//   3. walk back through the stored levels, adding higher-level entries into lower-level ones, | |
//      and finally add one higher-level entry into firstLevelScan. | |
// Returns firstLevelScan for this invocation, reinterpreted as float. | |
// Despite the "_noBarriers" suffix the body calls barrier() internally; there is simply none as | |
// the first or last statement (presumably what the suffix refers to). | |
float nbl_glsl_workgroupInclusiveAdd_noBarriers(in float val) | |
{ | |
// loMask == 3u, i.e. the low two bits of an invocation index. | |
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Sentinel value handed to the helper below; how the helper uses it is not visible in this chunk. | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Publish the bit pattern of this invocation's input at its own scratch slot. | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val); | |
// halfMask == 1u. | |
const uint halfMask = loMask >> 1u; | |
// Write the add identity (bits of 0.0) at index ((i & ~1u) << 2u) | (i & 1u) for this invocation index i. | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint(0.0); | |
// 256 < 2 is constant-false, so this extra clearing loop never runs in this expansion. | |
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint(0.0); | |
} | |
barrier(); | |
} | |
// lastInvocation == 255u. | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// firstLevelScan and scan hold float bit patterns in uint variables. | |
uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveAdd_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when the helper-computed pseudoSubgroupInvocation equals loMask (presumably the last | |
// lane of its group of four -- confirm against the helper). | |
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
// Consecutive runs of four invocation indices share one pseudoSubgroupID. | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// lastInvocationInLevel goes 255 -> 63 -> 15 here, so this loop body runs twice. | |
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// Only the level's last invocation, or one flagged possibleProp, publishes its scan. | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrinks the level (>>= 2) and recomputes who participates in one statement. | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
// Re-scan the values published above and keep the result for the later down-pass. | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveAdd_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Advance past the entries of the level just stored (executed by every invocation). | |
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// 15 >= 4, so one more level (15 -> 3) is processed; unlike the loop body, scanStoreIndex is | |
// not advanced afterwards. | |
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveAdd_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Constant-true (255 >= 4): walk back through the stored levels. | |
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// (index >> 2) is always smaller than (index + 4), so this unsigned subtraction wraps around. | |
// The value is only ever added to scanLoadIndex, relying on modular uint arithmetic. | |
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// logShift starts at (findMSB(255) / 2 - 1) * 2 == 4 and steps 4 -> 2, so the body runs twice | |
// (lastInvocationInLevel == 15, then 63). | |
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
// Add the entry at scanLoadIndex + currentToHighLevel into this level's entry, in place. | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = floatBitsToUint(nbl_glsl_add(uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations with pseudoSubgroupID == 0 keep firstLevelScan unchanged; all others add the | |
// entry one slot before scanLoadIndex + currentToHighLevel. | |
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = floatBitsToUint(nbl_glsl_add(uintBitsToFloat(higherLevelExclusive), uintBitsToFloat(firstLevelScan))); | |
} | |
} | |
// Constant-false: the shift-by-one tail below is dead code in this inclusive expansion. | |
if (false) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 0.0; | |
} | |
else | |
return uintBitsToFloat(firstLevelScan); | |
; | |
} | |
// Barrier-wrapped entry point (uint): issues barrier() before and after the call to | |
// nbl_glsl_workgroupInclusiveAdd_noBarriers and returns that call's result. | |
uint nbl_glsl_workgroupInclusiveAdd(in uint val) | |
{ | |
    uint total; | |
    barrier(); | |
    total = nbl_glsl_workgroupInclusiveAdd_noBarriers(val); | |
    barrier(); | |
    return total; | |
} | |
// Barrier-wrapped entry point (int): issues barrier() before and after the call to | |
// nbl_glsl_workgroupInclusiveAdd_noBarriers and returns that call's result. | |
int nbl_glsl_workgroupInclusiveAdd(in int val) | |
{ | |
    int total; | |
    barrier(); | |
    total = nbl_glsl_workgroupInclusiveAdd_noBarriers(val); | |
    barrier(); | |
    return total; | |
} | |
// Barrier-wrapped entry point (float): issues barrier() before and after the call to | |
// nbl_glsl_workgroupInclusiveAdd_noBarriers and returns that call's result. | |
float nbl_glsl_workgroupInclusiveAdd(in float val) | |
{ | |
    float total; | |
    barrier(); | |
    total = nbl_glsl_workgroupInclusiveAdd_noBarriers(val); | |
    barrier(); | |
    return total; | |
} | |
// Workgroup exclusive-add of `val`, uint overload. Judging by the name and structure this is an | |
// exclusive prefix sum over the workgroup; the subgroup helpers it calls are defined outside | |
// this chunk, so their exact semantics are not visible here. | |
// Phases, all operating on nbl_glsl_workgroupArithmeticScratchShared: | |
//   1. store `val` and the add identity (0u) into the scratch array, then run the subgroup | |
//      inclusive op on `val` (firstLevelScan); | |
//   2. level by level, selected invocations publish their scan to the scratch array and the | |
//      published values are scanned again; lastInvocationInLevel is shifted right by 2 per level; | |
//   3. walk back through the stored levels, adding higher-level entries into lower-level ones, | |
//      and add one higher-level entry into firstLevelScan; | |
//   4. shift by one: each invocation stores firstLevelScan at scratch[index + 1] and returns | |
//      what it reads back from scratch[index]. | |
// Despite the "_noBarriers" suffix the body calls barrier() internally; there is simply none as | |
// the first statement or after the final scratch read (presumably what the suffix refers to). | |
uint nbl_glsl_workgroupExclusiveAdd_noBarriers(in uint val) | |
{ | |
// loMask == 3u, i.e. the low two bits of an invocation index. | |
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Sentinel value handed to the helper below; how the helper uses it is not visible in this chunk. | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Publish this invocation's input at its own scratch slot. | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
// halfMask == 1u. | |
const uint halfMask = loMask >> 1u; | |
// Write the add identity (0u) at index ((i & ~1u) << 2u) | (i & 1u) for this invocation index i. | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0u); | |
// 256 < 2 is constant-false, so this extra clearing loop never runs in this expansion. | |
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0u); | |
} | |
barrier(); | |
} | |
// lastInvocation == 255u. | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// The first level uses the *inclusive* subgroup op; exclusivity comes from the final shift. | |
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAdd_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when the helper-computed pseudoSubgroupInvocation equals loMask (presumably the last | |
// lane of its group of four -- confirm against the helper). | |
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
// Consecutive runs of four invocation indices share one pseudoSubgroupID. | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// lastInvocationInLevel goes 255 -> 63 -> 15 here, so this loop body runs twice. | |
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// Only the level's last invocation, or one flagged possibleProp, publishes its scan. | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrinks the level (>>= 2) and recomputes who participates in one statement. | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
// Re-scan the values published above and keep the result for the later down-pass. | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAdd_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Advance past the entries of the level just stored (executed by every invocation). | |
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// 15 >= 4, so one more level (15 -> 3) is processed; unlike the loop body, scanStoreIndex is | |
// not advanced afterwards. | |
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveAdd_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Constant-true (255 >= 4): walk back through the stored levels. | |
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// (index >> 2) is always smaller than (index + 4), so this unsigned subtraction wraps around. | |
// The value is only ever added to scanLoadIndex, relying on modular uint arithmetic. | |
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// logShift starts at (findMSB(255) / 2 - 1) * 2 == 4 and steps 4 -> 2, so the body runs twice | |
// (lastInvocationInLevel == 15, then 63). | |
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
// Add the entry at scanLoadIndex + currentToHighLevel into this level's entry, in place. | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(nbl_glsl_add(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations with pseudoSubgroupID == 0 keep firstLevelScan unchanged; all others add the | |
// entry one slot before scanLoadIndex + currentToHighLevel. | |
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_add(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan))); | |
} | |
} | |
// Constant-true: this is the exclusive expansion, so the shift-by-one tail is the live path. | |
if (true) | |
{ | |
// Every invocation except the last hands its inclusive result to its right-hand neighbour's slot. | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
// any() is true when either component is true. With lastInvocation == 255 and local_size_x = 256 | |
// the second component always holds, so the 0u fallback is never selected and invocation 0 | |
// returns scratch[0]. | |
// NOTE(review): confirm scratch[0] still holds the identity at this point, or whether all() | |
// was intended -- the helpers that decide the scratch layout are outside this chunk. | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 0u; | |
} | |
else | |
return nbl_glsl_identityFunction(firstLevelScan); | |
; | |
} | |
// int overload: converts `val` to uint, forwards it to the uint overload of this function, | |
// and converts the result back to int. | |
int nbl_glsl_workgroupExclusiveAdd_noBarriers(in int val) | |
{ | |
    const uint unsignedInput = uint(val); | |
    const uint unsignedResult = nbl_glsl_workgroupExclusiveAdd_noBarriers(unsignedInput); | |
    return int(unsignedResult); | |
} | |
// Workgroup exclusive-add of `val`, float overload. Judging by the name and structure this is an | |
// exclusive prefix sum over the workgroup; the subgroup helpers it calls are defined outside | |
// this chunk, so their exact semantics are not visible here. | |
// Values are kept in nbl_glsl_workgroupArithmeticScratchShared as uint bit patterns | |
// (floatBitsToUint on store, uintBitsToFloat before every arithmetic use). | |
// Phases: | |
//   1. store `val` and the add identity (0.0) into the scratch array, then run the subgroup | |
//      inclusive op on `val` (firstLevelScan); | |
//   2. level by level, selected invocations publish their scan to the scratch array and the | |
//      published values are scanned again; lastInvocationInLevel is shifted right by 2 per level; | |
//   3. walk back through the stored levels, adding higher-level entries into lower-level ones, | |
//      and add one higher-level entry into firstLevelScan; | |
//   4. shift by one: each invocation stores firstLevelScan at scratch[index + 1] and returns | |
//      what it reads back from scratch[index]. | |
// Despite the "_noBarriers" suffix the body calls barrier() internally; there is simply none as | |
// the first statement or after the final scratch read (presumably what the suffix refers to). | |
float nbl_glsl_workgroupExclusiveAdd_noBarriers(in float val) | |
{ | |
// loMask == 3u, i.e. the low two bits of an invocation index. | |
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Sentinel value handed to the helper below; how the helper uses it is not visible in this chunk. | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Publish the bit pattern of this invocation's input at its own scratch slot. | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val); | |
// halfMask == 1u. | |
const uint halfMask = loMask >> 1u; | |
// Write the add identity (bits of 0.0) at index ((i & ~1u) << 2u) | (i & 1u) for this invocation index i. | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint(0.0); | |
// 256 < 2 is constant-false, so this extra clearing loop never runs in this expansion. | |
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint(0.0); | |
} | |
barrier(); | |
} | |
// lastInvocation == 255u. | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// The first level uses the *inclusive* subgroup op; exclusivity comes from the final shift. | |
// firstLevelScan and scan hold float bit patterns in uint variables. | |
uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveAdd_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when the helper-computed pseudoSubgroupInvocation equals loMask (presumably the last | |
// lane of its group of four -- confirm against the helper). | |
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
// Consecutive runs of four invocation indices share one pseudoSubgroupID. | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// lastInvocationInLevel goes 255 -> 63 -> 15 here, so this loop body runs twice. | |
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// Only the level's last invocation, or one flagged possibleProp, publishes its scan. | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrinks the level (>>= 2) and recomputes who participates in one statement. | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
// Re-scan the values published above and keep the result for the later down-pass. | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveAdd_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Advance past the entries of the level just stored (executed by every invocation). | |
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// 15 >= 4, so one more level (15 -> 3) is processed; unlike the loop body, scanStoreIndex is | |
// not advanced afterwards. | |
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveAdd_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Constant-true (255 >= 4): walk back through the stored levels. | |
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// (index >> 2) is always smaller than (index + 4), so this unsigned subtraction wraps around. | |
// The value is only ever added to scanLoadIndex, relying on modular uint arithmetic. | |
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// logShift starts at (findMSB(255) / 2 - 1) * 2 == 4 and steps 4 -> 2, so the body runs twice | |
// (lastInvocationInLevel == 15, then 63). | |
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
// Add the entry at scanLoadIndex + currentToHighLevel into this level's entry, in place. | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = floatBitsToUint(nbl_glsl_add(uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations with pseudoSubgroupID == 0 keep firstLevelScan unchanged; all others add the | |
// entry one slot before scanLoadIndex + currentToHighLevel. | |
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = floatBitsToUint(nbl_glsl_add(uintBitsToFloat(higherLevelExclusive), uintBitsToFloat(firstLevelScan))); | |
} | |
} | |
// Constant-true: this is the exclusive expansion, so the shift-by-one tail is the live path. | |
if (true) | |
{ | |
// Every invocation except the last hands its inclusive result to its right-hand neighbour's slot. | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
// any() is true when either component is true. With lastInvocation == 255 and local_size_x = 256 | |
// the second component always holds, so the 0.0 fallback is never selected and invocation 0 | |
// returns scratch[0] reinterpreted as float. | |
// NOTE(review): confirm scratch[0] still holds the identity at this point, or whether all() | |
// was intended -- the helpers that decide the scratch layout are outside this chunk. | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 0.0; | |
} | |
else | |
return uintBitsToFloat(firstLevelScan); | |
; | |
} | |
// Barrier-wrapped entry point (uint): issues barrier() before and after the call to | |
// nbl_glsl_workgroupExclusiveAdd_noBarriers and returns that call's result. | |
uint nbl_glsl_workgroupExclusiveAdd(in uint val) | |
{ | |
    uint prefix; | |
    barrier(); | |
    prefix = nbl_glsl_workgroupExclusiveAdd_noBarriers(val); | |
    barrier(); | |
    return prefix; | |
} | |
// Barrier-wrapped entry point (int): issues barrier() before and after the call to | |
// nbl_glsl_workgroupExclusiveAdd_noBarriers and returns that call's result. | |
int nbl_glsl_workgroupExclusiveAdd(in int val) | |
{ | |
    int prefix; | |
    barrier(); | |
    prefix = nbl_glsl_workgroupExclusiveAdd_noBarriers(val); | |
    barrier(); | |
    return prefix; | |
} | |
// Barrier-wrapped entry point (float): issues barrier() before and after the call to | |
// nbl_glsl_workgroupExclusiveAdd_noBarriers and returns that call's result. | |
float nbl_glsl_workgroupExclusiveAdd(in float val) | |
{ | |
    float prefix; | |
    barrier(); | |
    prefix = nbl_glsl_workgroupExclusiveAdd_noBarriers(val); | |
    barrier(); | |
    return prefix; | |
} | |
// Workgroup inclusive-multiply of `val`, uint overload. Judging by the name and structure this | |
// is an inclusive prefix product over the workgroup; the subgroup helpers it calls are defined | |
// outside this chunk, so their exact semantics are not visible here. | |
// Phases, all operating on nbl_glsl_workgroupArithmeticScratchShared: | |
//   1. store `val` and the multiply identity (1u) into the scratch array, then run the subgroup | |
//      inclusive op on `val` (firstLevelScan); | |
//   2. level by level, selected invocations publish their scan to the scratch array and the | |
//      published values are scanned again; lastInvocationInLevel is shifted right by 2 per level; | |
//   3. walk back through the stored levels, multiplying higher-level entries into lower-level | |
//      ones, and finally multiply one higher-level entry into firstLevelScan. | |
// Returns firstLevelScan for this invocation. | |
// Despite the "_noBarriers" suffix the body calls barrier() internally; there is simply none as | |
// the first or last statement (presumably what the suffix refers to). | |
uint nbl_glsl_workgroupInclusiveMul_noBarriers(in uint val) | |
{ | |
// loMask == 3u, i.e. the low two bits of an invocation index. | |
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Sentinel value handed to the helper below; how the helper uses it is not visible in this chunk. | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Publish this invocation's input at its own scratch slot. | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
// halfMask == 1u. | |
const uint halfMask = loMask >> 1u; | |
// Write the multiply identity (1u) at index ((i & ~1u) << 2u) | (i & 1u) for this invocation index i. | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(1u); | |
// 256 < 2 is constant-false, so this extra clearing loop never runs in this expansion. | |
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(1u); | |
} | |
barrier(); | |
} | |
// lastInvocation == 255u. | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMul_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when the helper-computed pseudoSubgroupInvocation equals loMask (presumably the last | |
// lane of its group of four -- confirm against the helper). | |
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
// Consecutive runs of four invocation indices share one pseudoSubgroupID. | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// lastInvocationInLevel goes 255 -> 63 -> 15 here, so this loop body runs twice. | |
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// Only the level's last invocation, or one flagged possibleProp, publishes its scan. | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrinks the level (>>= 2) and recomputes who participates in one statement. | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
// Re-scan the values published above and keep the result for the later down-pass. | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMul_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Advance past the entries of the level just stored (executed by every invocation). | |
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// 15 >= 4, so one more level (15 -> 3) is processed; unlike the loop body, scanStoreIndex is | |
// not advanced afterwards. | |
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMul_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Constant-true (255 >= 4): walk back through the stored levels. | |
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// (index >> 2) is always smaller than (index + 4), so this unsigned subtraction wraps around. | |
// The value is only ever added to scanLoadIndex, relying on modular uint arithmetic. | |
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// logShift starts at (findMSB(255) / 2 - 1) * 2 == 4 and steps 4 -> 2, so the body runs twice | |
// (lastInvocationInLevel == 15, then 63). | |
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
// Multiply the entry at scanLoadIndex + currentToHighLevel into this level's entry, in place. | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(nbl_glsl_mul(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations with pseudoSubgroupID == 0 keep firstLevelScan unchanged; all others multiply in | |
// the entry one slot before scanLoadIndex + currentToHighLevel. | |
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_mul(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan))); | |
} | |
} | |
// Constant-false: the shift-by-one tail below is dead code in this inclusive expansion. | |
if (false) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 1u; | |
} | |
else | |
return nbl_glsl_identityFunction(firstLevelScan); | |
; | |
} | |
// int overload: converts `val` to uint, forwards it to the uint overload of this function, | |
// and converts the result back to int. | |
int nbl_glsl_workgroupInclusiveMul_noBarriers(in int val) | |
{ | |
    const uint unsignedInput = uint(val); | |
    const uint unsignedProduct = nbl_glsl_workgroupInclusiveMul_noBarriers(unsignedInput); | |
    return int(unsignedProduct); | |
} | |
float nbl_glsl_workgroupInclusiveMul_noBarriers(in float val) | |
{ | |
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint(1.0); | |
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint(1.0); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveMul_impl(false, val)); | |
uint scan = firstLevelScan; | |
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMul_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMul_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = floatBitsToUint(nbl_glsl_mul(uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = floatBitsToUint(nbl_glsl_mul(uintBitsToFloat(higherLevelExclusive), uintBitsToFloat(firstLevelScan))); | |
} | |
} | |
if (false) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 1.0; | |
} | |
else | |
return uintBitsToFloat(firstLevelScan); | |
; | |
} | |
// Barrier-wrapped entry point for the uint overload of
// nbl_glsl_workgroupInclusiveMul_noBarriers: one barrier() before the call,
// one after it, and the callee's result is returned unchanged.
uint nbl_glsl_workgroupInclusiveMul(in uint val)
{
    barrier();
    const uint scanned = nbl_glsl_workgroupInclusiveMul_noBarriers(val);
    barrier();
    return scanned;
}
// Barrier-wrapped entry point for the int overload of
// nbl_glsl_workgroupInclusiveMul_noBarriers: one barrier() before the call,
// one after it, and the callee's result is returned unchanged.
int nbl_glsl_workgroupInclusiveMul(in int val)
{
    barrier();
    const int scanned = nbl_glsl_workgroupInclusiveMul_noBarriers(val);
    barrier();
    return scanned;
}
// Barrier-wrapped entry point for the float overload of
// nbl_glsl_workgroupInclusiveMul_noBarriers: one barrier() before the call,
// one after it, and the callee's result is returned unchanged.
float nbl_glsl_workgroupInclusiveMul(in float val)
{
    barrier();
    const float scanned = nbl_glsl_workgroupInclusiveMul_noBarriers(val);
    barrier();
    return scanned;
}
// Exclusive multiplicative scan of `val` over the 256-invocation workgroup
// (uint overload). The function issues no barrier before its first scratch
// write and none after its final scratch read; the wrapper
// nbl_glsl_workgroupExclusiveMul adds those.
//
// Steps visible in this function:
//   1. Each invocation seeds nbl_glsl_workgroupArithmeticScratchShared with its
//      operand and with the multiplicative identity (1u).
//   2. A level loop repeatedly publishes selected per-invocation scans,
//      shrinks the level by a factor of 4 and re-scans the published values.
//   3. A propagation loop folds stored level results together with
//      nbl_glsl_mul, after which every invocation outside pseudo-subgroup 0
//      folds one stored value into its own first-level scan.
//   4. The inclusive results are shifted by one slot through the scratch array
//      to make the scan exclusive.
//
// Returns 1u for invocation 0, otherwise the inclusive result that invocation
// (index - 1) stored in step 4.
uint nbl_glsl_workgroupExclusiveMul_noBarriers(in uint val)
{
    // loMask = 3: pseudo-subgroups span 4 invocations (pseudoSubgroupID = index >> 2 below).
    const uint loMask = (0x1 << 2) - 1u;
    const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex);
    const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex);
    const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation);
    // NOTE(review): lastLoadOffset is only handed to the helper below and never
    // read again in this function; presumably the helper fills it in - confirm.
    uint lastLoadOffset = 0xdeadbeefu;
    const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset);
    {
        // Seed the scratch array: the operand at this invocation's scan slot and
        // the identity 1u at a slot derived from the invocation index.
        nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val);
        const uint halfMask = loMask >> 1u;
        nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(1u);
        // 256 < 2 is constant false, so this extra clearing loop never runs here.
        if (256 < ((0x1 << 2) >> 1))
        {
            const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1);
            for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256)
                nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(1u);
        }
        barrier();
    }
    // Index of the last invocation taking part (workgroup size is 256).
    const uint lastInvocation = 256 - 1u;
    uint lastInvocationInLevel = lastInvocation;
    // First-level result: nbl_glsl_subgroupInclusiveMul_impl applied to the operand.
    uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMul_impl(false, val));
    uint scan = firstLevelScan;
    // True when this invocation's pseudo-subgroup lane equals loMask (3).
    const bool possibleProp = pseudoSubgroupInvocation == loMask;
    const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2;
    const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID);
    uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u;
    bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel;
    // Level loop, entered while lastInvocationInLevel >= 16; starting from 255
    // the body runs for lastInvocationInLevel = 255 and 63.
    while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2))
    {
        barrier();
        if (participate)
        {
            // The level's last invocation, or any invocation with possibleProp
            // set, publishes its scan value.
            if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp)))
                nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan;
        }
        barrier();
        // Shrink the level by a factor of 4 and recompute who still participates.
        participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2);
        if (participate)
        {
            const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset];
            scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMul_impl(false, nbl_glsl_identityFunction(prevLevelScan)));
            if (true)
                nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan;
        }
        // Move the store cursor past the entries written for this level.
        if (true)
            scanStoreIndex += lastInvocationInLevel + 1u;
    }
    // Final level (4 <= lastInvocationInLevel < 16): same steps, but the store
    // cursor is not advanced afterwards.
    if (lastInvocationInLevel >= (0x1 << 2))
    {
        barrier();
        if (participate)
        {
            if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp)))
                nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan;
        }
        barrier();
        participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2);
        if (participate)
        {
            const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset];
            scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMul_impl(false, nbl_glsl_identityFunction(prevLevelScan)));
            if (true)
                nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan;
        }
    }
    barrier();
    // 255 >= 4, so the propagation section always runs for this workgroup size.
    if (lastInvocation >= (0x1 << 2))
    {
        uint scanLoadIndex = scanStoreIndex + (0x1 << 2);
        const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2);
        // Unsigned subtraction wraps (pseudoSubgroupID < shiftedInvocationIndex);
        // the value is only ever used as an additive offset below.
        const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex;
        // logShift starts at (findMSB(255) / 2 - 1) * 2 = 4 and steps down by 2,
        // so the body runs for logShift = 4 and 2.
        for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2)
        {
            lastInvocationInLevel = lastInvocation >> logShift;
            barrier();
            const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u);
            // Fold the value at scanLoadIndex + currentToHighLevel into this
            // level's stored entry.
            if (shiftedInvocationIndex <= lastInvocationInLevel)
                nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(nbl_glsl_mul(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex])));
            scanLoadIndex = currentLevelIndex;
        }
        barrier();
        // Invocations outside pseudo-subgroup 0 fold one stored value into their
        // own first-level scan.
        if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u)
        {
            const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u];
            firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_mul(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan)));
        }
    }
    if (true)
    {
        // Shift the inclusive results up by one slot to obtain the exclusive scan.
        if (gl_LocalInvocationIndex < lastInvocation)
            nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan;
        barrier();
        // all(), not any(): (index != 0) || (index <= lastInvocation) holds for
        // every invocation, which would make the identity arm unreachable and
        // let invocation 0 return the unshifted contents of slot 0. Requiring
        // both conditions yields the identity for invocation 0 (and for any
        // invocation past lastInvocation).
        return all(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 1u;
    }
    else
        return nbl_glsl_identityFunction(firstLevelScan);
    ;
}
// int overload: converts the operand to uint, runs the uint overload of
// nbl_glsl_workgroupExclusiveMul_noBarriers on it, and converts the result
// back to int.
int nbl_glsl_workgroupExclusiveMul_noBarriers(in int val)
{
    const uint operandAsUint = uint(val);
    const uint scannedAsUint = nbl_glsl_workgroupExclusiveMul_noBarriers(operandAsUint);
    return int(scannedAsUint);
}
// Exclusive multiplicative scan of `val` over the 256-invocation workgroup
// (float overload). Values travel through the uint scratch array as raw bits
// (floatBitsToUint / uintBitsToFloat). The function issues no barrier before
// its first scratch write and none after its final scratch read; the wrapper
// nbl_glsl_workgroupExclusiveMul adds those.
//
// Steps visible in this function:
//   1. Each invocation seeds nbl_glsl_workgroupArithmeticScratchShared with its
//      operand and with the multiplicative identity (1.0).
//   2. A level loop repeatedly publishes selected per-invocation scans,
//      shrinks the level by a factor of 4 and re-scans the published values.
//   3. A propagation loop folds stored level results together with
//      nbl_glsl_mul, after which every invocation outside pseudo-subgroup 0
//      folds one stored value into its own first-level scan.
//   4. The inclusive results are shifted by one slot through the scratch array
//      to make the scan exclusive.
//
// Returns 1.0 for invocation 0, otherwise the inclusive result that invocation
// (index - 1) stored in step 4.
float nbl_glsl_workgroupExclusiveMul_noBarriers(in float val)
{
    // loMask = 3: pseudo-subgroups span 4 invocations (pseudoSubgroupID = index >> 2 below).
    const uint loMask = (0x1 << 2) - 1u;
    const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex);
    const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex);
    const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation);
    // NOTE(review): lastLoadOffset is only handed to the helper below and never
    // read again in this function; presumably the helper fills it in - confirm.
    uint lastLoadOffset = 0xdeadbeefu;
    const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset);
    {
        // Seed the scratch array: the operand's bits at this invocation's scan
        // slot and the bits of 1.0 at a slot derived from the invocation index.
        nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val);
        const uint halfMask = loMask >> 1u;
        nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint(1.0);
        // 256 < 2 is constant false, so this extra clearing loop never runs here.
        if (256 < ((0x1 << 2) >> 1))
        {
            const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1);
            for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256)
                nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint(1.0);
        }
        barrier();
    }
    // Index of the last invocation taking part (workgroup size is 256).
    const uint lastInvocation = 256 - 1u;
    uint lastInvocationInLevel = lastInvocation;
    // First-level result: nbl_glsl_subgroupInclusiveMul_impl applied to the operand.
    uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveMul_impl(false, val));
    uint scan = firstLevelScan;
    // True when this invocation's pseudo-subgroup lane equals loMask (3).
    const bool possibleProp = pseudoSubgroupInvocation == loMask;
    const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2;
    const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID);
    uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u;
    bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel;
    // Level loop, entered while lastInvocationInLevel >= 16; starting from 255
    // the body runs for lastInvocationInLevel = 255 and 63.
    while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2))
    {
        barrier();
        if (participate)
        {
            // The level's last invocation, or any invocation with possibleProp
            // set, publishes its scan value.
            if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp)))
                nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan;
        }
        barrier();
        // Shrink the level by a factor of 4 and recompute who still participates.
        participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2);
        if (participate)
        {
            const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset];
            scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMul_impl(false, uintBitsToFloat(prevLevelScan)));
            if (true)
                nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan;
        }
        // Move the store cursor past the entries written for this level.
        if (true)
            scanStoreIndex += lastInvocationInLevel + 1u;
    }
    // Final level (4 <= lastInvocationInLevel < 16): same steps, but the store
    // cursor is not advanced afterwards.
    if (lastInvocationInLevel >= (0x1 << 2))
    {
        barrier();
        if (participate)
        {
            if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp)))
                nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan;
        }
        barrier();
        participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2);
        if (participate)
        {
            const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset];
            scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMul_impl(false, uintBitsToFloat(prevLevelScan)));
            if (true)
                nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan;
        }
    }
    barrier();
    // 255 >= 4, so the propagation section always runs for this workgroup size.
    if (lastInvocation >= (0x1 << 2))
    {
        uint scanLoadIndex = scanStoreIndex + (0x1 << 2);
        const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2);
        // Unsigned subtraction wraps (pseudoSubgroupID < shiftedInvocationIndex);
        // the value is only ever used as an additive offset below.
        const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex;
        // logShift starts at (findMSB(255) / 2 - 1) * 2 = 4 and steps down by 2,
        // so the body runs for logShift = 4 and 2.
        for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2)
        {
            lastInvocationInLevel = lastInvocation >> logShift;
            barrier();
            const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u);
            // Fold the value at scanLoadIndex + currentToHighLevel into this
            // level's stored entry.
            if (shiftedInvocationIndex <= lastInvocationInLevel)
                nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = floatBitsToUint(nbl_glsl_mul(uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex])));
            scanLoadIndex = currentLevelIndex;
        }
        barrier();
        // Invocations outside pseudo-subgroup 0 fold one stored value into their
        // own first-level scan.
        if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u)
        {
            const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u];
            firstLevelScan = floatBitsToUint(nbl_glsl_mul(uintBitsToFloat(higherLevelExclusive), uintBitsToFloat(firstLevelScan)));
        }
    }
    if (true)
    {
        // Shift the inclusive results up by one slot to obtain the exclusive scan.
        if (gl_LocalInvocationIndex < lastInvocation)
            nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan;
        barrier();
        // all(), not any(): (index != 0) || (index <= lastInvocation) holds for
        // every invocation, which would make the identity arm unreachable and
        // let invocation 0 return the unshifted contents of slot 0. Requiring
        // both conditions yields the identity for invocation 0 (and for any
        // invocation past lastInvocation).
        return all(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 1.0;
    }
    else
        return uintBitsToFloat(firstLevelScan);
    ;
}
// Barrier-wrapped entry point for the uint overload of
// nbl_glsl_workgroupExclusiveMul_noBarriers: one barrier() before the call,
// one after it, and the callee's result is returned unchanged.
uint nbl_glsl_workgroupExclusiveMul(in uint val)
{
    barrier();
    const uint scanned = nbl_glsl_workgroupExclusiveMul_noBarriers(val);
    barrier();
    return scanned;
}
// Barrier-wrapped entry point for the int overload of
// nbl_glsl_workgroupExclusiveMul_noBarriers: one barrier() before the call,
// one after it, and the callee's result is returned unchanged.
int nbl_glsl_workgroupExclusiveMul(in int val)
{
    barrier();
    const int scanned = nbl_glsl_workgroupExclusiveMul_noBarriers(val);
    barrier();
    return scanned;
}
// Barrier-wrapped entry point for the float overload of
// nbl_glsl_workgroupExclusiveMul_noBarriers: one barrier() before the call,
// one after it, and the callee's result is returned unchanged.
float nbl_glsl_workgroupExclusiveMul(in float val)
{
    barrier();
    const float scanned = nbl_glsl_workgroupExclusiveMul_noBarriers(val);
    barrier();
    return scanned;
}
// Inclusive minimum scan of `val` over the 256-invocation workgroup (uint
// overload). Issues no barrier before its first scratch write and none after
// its last scratch read; nbl_glsl_workgroupInclusiveMin wraps it in barriers.
// Returns this invocation's first-level scan after the propagation section has
// folded a stored value into it (invocations of pseudo-subgroup 0 skip that fold).
uint nbl_glsl_workgroupInclusiveMin_noBarriers(in uint val) | |
{ | |
// loMask = 3: pseudo-subgroups span 4 invocations (pseudoSubgroupID = index >> 2 below).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// NOTE(review): lastLoadOffset is only handed to the helper below and never
// read again in this function; presumably the helper fills it in - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed the scratch array: the operand at this invocation's scan slot and the
// min identity (4294967295u) at a slot derived from the invocation index.
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(4294967295u); | |
// 256 < 2 is constant false, so this extra clearing loop never runs here.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(4294967295u); | |
} | |
barrier(); | |
} | |
// Index of the last invocation taking part (workgroup size is 256).
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First-level result: nbl_glsl_subgroupInclusiveMin_impl applied to the operand.
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMin_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when this invocation's pseudo-subgroup lane equals loMask (3).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Level loop, entered while lastInvocationInLevel >= 16; starting from 255 the
// body runs for lastInvocationInLevel = 255 and 63.
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// The level's last invocation, or any invocation with possibleProp set,
// publishes its scan value.
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrink the level by a factor of 4 and recompute who still participates.
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMin_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Move the store cursor past the entries written for this level.
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Final level (4 <= lastInvocationInLevel < 16): same steps, but the store
// cursor is not advanced afterwards.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMin_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// 255 >= 4, so the propagation section always runs for this workgroup size.
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// Unsigned subtraction wraps (pseudoSubgroupID < shiftedInvocationIndex); the
// value is only ever used as an additive offset below.
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// logShift starts at (findMSB(255) / 2 - 1) * 2 = 4 and steps down by 2, so the
// body runs for logShift = 4 and 2.
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
// Fold (min) the value at scanLoadIndex + currentToHighLevel into this level's
// stored entry.
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(min(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations outside pseudo-subgroup 0 fold one stored value into their own
// first-level scan.
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = nbl_glsl_identityFunction(min(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan))); | |
} | |
} | |
// Exclusive-scan epilogue: constant false in this inclusive variant, so only
// the else arm below ever executes.
if (false) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 4294967295u; | |
} | |
else | |
return nbl_glsl_identityFunction(firstLevelScan); | |
; | |
} | |
// Inclusive minimum scan of `val` over the 256-invocation workgroup (int
// overload). Values are kept in the uint scratch array via uint()/int()
// conversions and compared as int. Issues no barrier before its first scratch
// write and none after its last scratch read; nbl_glsl_workgroupInclusiveMin
// wraps it in barriers.
// Returns this invocation's first-level scan after the propagation section has
// folded a stored value into it (invocations of pseudo-subgroup 0 skip that fold).
int nbl_glsl_workgroupInclusiveMin_noBarriers(in int val) | |
{ | |
// loMask = 3: pseudo-subgroups span 4 invocations (pseudoSubgroupID = index >> 2 below).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// NOTE(review): lastLoadOffset is only handed to the helper below and never
// read again in this function; presumably the helper fills it in - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed the scratch array: the operand at this invocation's scan slot and the
// signed-min identity (2147483647) at a slot derived from the invocation index.
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = uint(2147483647); | |
// 256 < 2 is constant false, so this extra clearing loop never runs here.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = uint(2147483647); | |
} | |
barrier(); | |
} | |
// Index of the last invocation taking part (workgroup size is 256).
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First-level result: nbl_glsl_subgroupInclusiveMin_impl applied to the operand.
uint firstLevelScan = uint(nbl_glsl_subgroupInclusiveMin_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when this invocation's pseudo-subgroup lane equals loMask (3).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Level loop, entered while lastInvocationInLevel >= 16; starting from 255 the
// body runs for lastInvocationInLevel = 255 and 63.
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// The level's last invocation, or any invocation with possibleProp set,
// publishes its scan value.
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrink the level by a factor of 4 and recompute who still participates.
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = uint(nbl_glsl_subgroupInclusiveMin_impl(false, int(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Move the store cursor past the entries written for this level.
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Final level (4 <= lastInvocationInLevel < 16): same steps, but the store
// cursor is not advanced afterwards.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = uint(nbl_glsl_subgroupInclusiveMin_impl(false, int(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// 255 >= 4, so the propagation section always runs for this workgroup size.
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// Unsigned subtraction wraps (pseudoSubgroupID < shiftedInvocationIndex); the
// value is only ever used as an additive offset below.
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// logShift starts at (findMSB(255) / 2 - 1) * 2 = 4 and steps down by 2, so the
// body runs for logShift = 4 and 2.
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
// Fold (signed min) the value at scanLoadIndex + currentToHighLevel into this
// level's stored entry.
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = uint(min(int(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), int(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations outside pseudo-subgroup 0 fold one stored value into their own
// first-level scan.
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = uint(min(int(higherLevelExclusive), int(firstLevelScan))); | |
} | |
} | |
// Exclusive-scan epilogue: constant false in this inclusive variant, so only
// the else arm below ever executes.
if (false) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? int(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 2147483647; | |
} | |
else | |
return int(firstLevelScan); | |
; | |
} | |
// Inclusive minimum scan of `val` over the 256-invocation workgroup (float
// overload). Values travel through the uint scratch array as raw bits
// (floatBitsToUint / uintBitsToFloat). Issues no barrier before its first
// scratch write and none after its last scratch read;
// nbl_glsl_workgroupInclusiveMin wraps it in barriers.
// Returns this invocation's first-level scan after the propagation section has
// folded a stored value into it (invocations of pseudo-subgroup 0 skip that fold).
float nbl_glsl_workgroupInclusiveMin_noBarriers(in float val) | |
{ | |
// loMask = 3: pseudo-subgroups span 4 invocations (pseudoSubgroupID = index >> 2 below).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// NOTE(review): lastLoadOffset is only handed to the helper below and never
// read again in this function; presumably the helper fills it in - confirm.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed the scratch array: the operand's bits at this invocation's scan slot and
// the bits of (1.f / 0.f) - presumably +infinity, the min identity; confirm the
// compiler folds it that way - at a slot derived from the invocation index.
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint((1.f / 0.f)); | |
// 256 < 2 is constant false, so this extra clearing loop never runs here.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint((1.f / 0.f)); | |
} | |
barrier(); | |
} | |
// Index of the last invocation taking part (workgroup size is 256).
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// First-level result: nbl_glsl_subgroupInclusiveMin_impl applied to the operand.
uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveMin_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when this invocation's pseudo-subgroup lane equals loMask (3).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Level loop, entered while lastInvocationInLevel >= 16; starting from 255 the
// body runs for lastInvocationInLevel = 255 and 63.
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
// The level's last invocation, or any invocation with possibleProp set,
// publishes its scan value.
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
// Shrink the level by a factor of 4 and recompute who still participates.
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMin_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Move the store cursor past the entries written for this level.
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Final level (4 <= lastInvocationInLevel < 16): same steps, but the store
// cursor is not advanced afterwards.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMin_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// 255 >= 4, so the propagation section always runs for this workgroup size.
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// Unsigned subtraction wraps (pseudoSubgroupID < shiftedInvocationIndex); the
// value is only ever used as an additive offset below.
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// logShift starts at (findMSB(255) / 2 - 1) * 2 = 4 and steps down by 2, so the
// body runs for logShift = 4 and 2.
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
// Fold (float min) the value at scanLoadIndex + currentToHighLevel into this
// level's stored entry.
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = floatBitsToUint(min(uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations outside pseudo-subgroup 0 fold one stored value into their own
// first-level scan.
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = floatBitsToUint(min(uintBitsToFloat(higherLevelExclusive), uintBitsToFloat(firstLevelScan))); | |
} | |
} | |
// Exclusive-scan epilogue: constant false in this inclusive variant, so only
// the else arm below ever executes.
if (false) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : (1.f / 0.f); | |
} | |
else | |
return uintBitsToFloat(firstLevelScan); | |
; | |
} | |
// Barrier-wrapped entry point for the uint overload of
// nbl_glsl_workgroupInclusiveMin_noBarriers: one barrier() before the call,
// one after it, and the callee's result is returned unchanged.
uint nbl_glsl_workgroupInclusiveMin(in uint val)
{
    barrier();
    const uint scanned = nbl_glsl_workgroupInclusiveMin_noBarriers(val);
    barrier();
    return scanned;
}
// Barrier-wrapped entry point for the int overload of
// nbl_glsl_workgroupInclusiveMin_noBarriers: one barrier() before the call,
// one after it, and the callee's result is returned unchanged.
int nbl_glsl_workgroupInclusiveMin(in int val)
{
    barrier();
    const int scanned = nbl_glsl_workgroupInclusiveMin_noBarriers(val);
    barrier();
    return scanned;
}
// Barrier-wrapped entry point for the float overload of
// nbl_glsl_workgroupInclusiveMin_noBarriers: one barrier() before the call,
// one after it, and the callee's result is returned unchanged.
float nbl_glsl_workgroupInclusiveMin(in float val)
{
    barrier();
    const float scanned = nbl_glsl_workgroupInclusiveMin_noBarriers(val);
    barrier();
    return scanned;
}
// Workgroup exclusive min scan of `val` (uint overload); the caller is expected to provide the surrounding barrier() calls | |
// (nbl_glsl_workgroupExclusiveMin does exactly that). | |
// Works in three stages over nbl_glsl_workgroupArithmeticScratchShared (declared outside this chunk): | |
// an up-sweep that rescans partial results level by level, a down-sweep that folds higher levels back with min(), | |
// and a final shift-by-one through scratch that turns the inclusive result into an exclusive one. | |
// The min identity used for padding and for the fallback return value is 4294967295u. | |
// NOTE(review): the nbl_glsl_subgroup_* index helpers are defined elsewhere; remarks about scratch layout are assumptions to confirm. | |
uint nbl_glsl_workgroupExclusiveMin_noBarriers(in uint val) | |
{ | |
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed scratch: this invocation's value goes to its store slot and one further slot is filled with the identity. | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(4294967295u); | |
// 256 < 2 is always false, so this extra clearing loop never runs in this expansion. | |
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(4294967295u); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// Level-0 scan produced by nbl_glsl_subgroupInclusiveMin_impl. | |
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMin_impl(false, val)); | |
uint scan = firstLevelScan; | |
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Up-sweep: with lastInvocation = 255 the loop body runs for levels 255 -> 63 and 63 -> 15. | |
// Each pass: selected invocations (possibleProp, or the last one in the level) store `scan` to nextStoreIndex, | |
// then the invocations still participating rescan the stored values and record the result at scanStoreIndex. | |
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMin_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last up-sweep level (15 -> 3 here): same steps as the loop body, but scanStoreIndex is not advanced afterwards. | |
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMin_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Down-sweep: fold each higher level's stored results into the level below using min(). | |
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// pseudoSubgroupID < shiftedInvocationIndex, so this unsigned difference wraps; it is only used as an additive index offset. | |
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// findMSB(255) = 7, so logShift starts at 4 and the body runs for logShift = 4 and 2. | |
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(min(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations with pseudoSubgroupID != 0 combine a higher-level value from scratch into their level-0 scan. | |
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = nbl_glsl_identityFunction(min(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan))); | |
} | |
} | |
// Exclusive variant: every invocation but the last writes its result one slot up, then each reads its own slot. | |
if (true) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
// NOTE(review): with local_size_x = 256, gl_LocalInvocationIndex <= lastInvocation always holds, so any() is always true | |
// and invocation 0 reads scratch[0] instead of taking the identity fallback. Confirm scratch[0] still holds the identity | |
// at this point, or whether all() was intended. | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 4294967295u; | |
} | |
else | |
return nbl_glsl_identityFunction(firstLevelScan); | |
; | |
} | |
// Workgroup exclusive min scan of `val` (int overload); the caller is expected to provide the surrounding barrier() calls | |
// (nbl_glsl_workgroupExclusiveMin does exactly that). | |
// Values travel through the uint scratch array nbl_glsl_workgroupArithmeticScratchShared (declared outside this chunk) | |
// via uint(...) on store and int(...) before each min(), so comparisons are done on signed values. | |
// Stages: up-sweep (rescan partial results level by level), down-sweep (fold higher levels back with min()), | |
// then a shift-by-one through scratch that makes the result exclusive. The min identity used is 2147483647. | |
// NOTE(review): the nbl_glsl_subgroup_* index helpers are defined elsewhere; remarks about scratch layout are assumptions to confirm. | |
int nbl_glsl_workgroupExclusiveMin_noBarriers(in int val) | |
{ | |
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed scratch: this invocation's value goes to its store slot and one further slot is filled with the identity. | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = uint(2147483647); | |
// 256 < 2 is always false, so this extra clearing loop never runs in this expansion. | |
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = uint(2147483647); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// Level-0 scan produced by nbl_glsl_subgroupInclusiveMin_impl. | |
uint firstLevelScan = uint(nbl_glsl_subgroupInclusiveMin_impl(false, val)); | |
uint scan = firstLevelScan; | |
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Up-sweep: with lastInvocation = 255 the loop body runs for levels 255 -> 63 and 63 -> 15. | |
// Each pass: selected invocations (possibleProp, or the last one in the level) store `scan` to nextStoreIndex, | |
// then the invocations still participating rescan the stored values and record the result at scanStoreIndex. | |
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = uint(nbl_glsl_subgroupInclusiveMin_impl(false, int(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last up-sweep level (15 -> 3 here): same steps as the loop body, but scanStoreIndex is not advanced afterwards. | |
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = uint(nbl_glsl_subgroupInclusiveMin_impl(false, int(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Down-sweep: fold each higher level's stored results into the level below using signed min(). | |
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// pseudoSubgroupID < shiftedInvocationIndex, so this unsigned difference wraps; it is only used as an additive index offset. | |
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// findMSB(255) = 7, so logShift starts at 4 and the body runs for logShift = 4 and 2. | |
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = uint(min(int(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), int(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations with pseudoSubgroupID != 0 combine a higher-level value from scratch into their level-0 scan. | |
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = uint(min(int(higherLevelExclusive), int(firstLevelScan))); | |
} | |
} | |
// Exclusive variant: every invocation but the last writes its result one slot up, then each reads its own slot. | |
if (true) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
// NOTE(review): with local_size_x = 256, gl_LocalInvocationIndex <= lastInvocation always holds, so any() is always true | |
// and invocation 0 reads scratch[0] instead of taking the identity fallback. Confirm scratch[0] still holds the identity | |
// at this point, or whether all() was intended. | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? int(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 2147483647; | |
} | |
else | |
return int(firstLevelScan); | |
; | |
} | |
// Workgroup exclusive min scan of `val` (float overload); the caller is expected to provide the surrounding barrier() calls | |
// (nbl_glsl_workgroupExclusiveMin does exactly that). | |
// Values travel through the uint scratch array nbl_glsl_workgroupArithmeticScratchShared (declared outside this chunk) | |
// as raw bits: floatBitsToUint on store, uintBitsToFloat before each min(). | |
// Stages: up-sweep (rescan partial results level by level), down-sweep (fold higher levels back with min()), | |
// then a shift-by-one through scratch that makes the result exclusive. The min identity used is (1.f / 0.f). | |
// NOTE(review): the nbl_glsl_subgroup_* index helpers are defined elsewhere; remarks about scratch layout are assumptions to confirm. | |
float nbl_glsl_workgroupExclusiveMin_noBarriers(in float val) | |
{ | |
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed scratch: this invocation's value goes to its store slot and one further slot is filled with the identity. | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint((1.f / 0.f)); | |
// 256 < 2 is always false, so this extra clearing loop never runs in this expansion. | |
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint((1.f / 0.f)); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// Level-0 scan produced by nbl_glsl_subgroupInclusiveMin_impl, kept as raw float bits. | |
uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveMin_impl(false, val)); | |
uint scan = firstLevelScan; | |
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Up-sweep: with lastInvocation = 255 the loop body runs for levels 255 -> 63 and 63 -> 15. | |
// Each pass: selected invocations (possibleProp, or the last one in the level) store `scan` to nextStoreIndex, | |
// then the invocations still participating rescan the stored values and record the result at scanStoreIndex. | |
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMin_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last up-sweep level (15 -> 3 here): same steps as the loop body, but scanStoreIndex is not advanced afterwards. | |
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMin_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Down-sweep: fold each higher level's stored results into the level below using float min(). | |
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// pseudoSubgroupID < shiftedInvocationIndex, so this unsigned difference wraps; it is only used as an additive index offset. | |
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// findMSB(255) = 7, so logShift starts at 4 and the body runs for logShift = 4 and 2. | |
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = floatBitsToUint(min(uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations with pseudoSubgroupID != 0 combine a higher-level value from scratch into their level-0 scan. | |
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = floatBitsToUint(min(uintBitsToFloat(higherLevelExclusive), uintBitsToFloat(firstLevelScan))); | |
} | |
} | |
// Exclusive variant: every invocation but the last writes its result one slot up, then each reads its own slot. | |
if (true) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
// NOTE(review): with local_size_x = 256, gl_LocalInvocationIndex <= lastInvocation always holds, so any() is always true | |
// and invocation 0 reads scratch[0] instead of taking the identity fallback. Confirm scratch[0] still holds the identity | |
// at this point, or whether all() was intended. | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : (1.f / 0.f); | |
} | |
else | |
return uintBitsToFloat(firstLevelScan); | |
; | |
} | |
// Barrier-guarded exclusive min (uint overload): issues barrier() before and after the _noBarriers variant and returns its result. | |
uint nbl_glsl_workgroupExclusiveMin(in uint val) | |
{ | |
barrier(); | |
const uint scanned = nbl_glsl_workgroupExclusiveMin_noBarriers(val); | |
barrier(); | |
return scanned; | |
} | |
// Barrier-guarded exclusive min (int overload): issues barrier() before and after the _noBarriers variant and returns its result. | |
int nbl_glsl_workgroupExclusiveMin(in int val) | |
{ | |
barrier(); | |
const int scanned = nbl_glsl_workgroupExclusiveMin_noBarriers(val); | |
barrier(); | |
return scanned; | |
} | |
// Barrier-guarded exclusive min (float overload): issues barrier() before and after the _noBarriers variant and returns its result. | |
float nbl_glsl_workgroupExclusiveMin(in float val) | |
{ | |
barrier(); | |
const float scanned = nbl_glsl_workgroupExclusiveMin_noBarriers(val); | |
barrier(); | |
return scanned; | |
} | |
// Workgroup inclusive max scan of `val` (uint overload); the caller is expected to provide the surrounding barrier() calls | |
// (nbl_glsl_workgroupInclusiveMax does exactly that). | |
// Works over nbl_glsl_workgroupArithmeticScratchShared (declared outside this chunk) in two stages: | |
// an up-sweep that rescans partial results level by level, and a down-sweep that folds higher levels back with max(). | |
// The max identity used for padding is 0u. Returns the combined level-0 scan for this invocation. | |
// NOTE(review): the nbl_glsl_subgroup_* index helpers are defined elsewhere; remarks about scratch layout are assumptions to confirm. | |
uint nbl_glsl_workgroupInclusiveMax_noBarriers(in uint val) | |
{ | |
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed scratch: this invocation's value goes to its store slot and one further slot is filled with the identity. | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0u); | |
// 256 < 2 is always false, so this extra clearing loop never runs in this expansion. | |
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0u); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// Level-0 scan produced by nbl_glsl_subgroupInclusiveMax_impl. | |
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMax_impl(false, val)); | |
uint scan = firstLevelScan; | |
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Up-sweep: with lastInvocation = 255 the loop body runs for levels 255 -> 63 and 63 -> 15. | |
// Each pass: selected invocations (possibleProp, or the last one in the level) store `scan` to nextStoreIndex, | |
// then the invocations still participating rescan the stored values and record the result at scanStoreIndex. | |
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMax_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last up-sweep level (15 -> 3 here): same steps as the loop body, but scanStoreIndex is not advanced afterwards. | |
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMax_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Down-sweep: fold each higher level's stored results into the level below using max(). | |
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// pseudoSubgroupID < shiftedInvocationIndex, so this unsigned difference wraps; it is only used as an additive index offset. | |
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// findMSB(255) = 7, so logShift starts at 4 and the body runs for logShift = 4 and 2. | |
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(max(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations with pseudoSubgroupID != 0 combine a higher-level value from scratch into their level-0 scan. | |
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = nbl_glsl_identityFunction(max(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan))); | |
} | |
} | |
// Inclusive variant: the shift-by-one branch is disabled by if (false), so the else branch returns the scan directly. | |
if (false) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 0u; | |
} | |
else | |
return nbl_glsl_identityFunction(firstLevelScan); | |
; | |
} | |
// Workgroup inclusive max scan of `val` (int overload); the caller is expected to provide the surrounding barrier() calls | |
// (nbl_glsl_workgroupInclusiveMax does exactly that). | |
// Values travel through the uint scratch array nbl_glsl_workgroupArithmeticScratchShared (declared outside this chunk) | |
// via uint(...) on store and int(...) before each max(), so comparisons are done on signed values. | |
// Stages: up-sweep (rescan partial results level by level), then down-sweep (fold higher levels back with max()). | |
// The max identity used for padding is -2147483648. Returns the combined level-0 scan for this invocation. | |
// NOTE(review): the nbl_glsl_subgroup_* index helpers are defined elsewhere; remarks about scratch layout are assumptions to confirm. | |
int nbl_glsl_workgroupInclusiveMax_noBarriers(in int val) | |
{ | |
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed scratch: this invocation's value goes to its store slot and one further slot is filled with the identity. | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = uint(-2147483648); | |
// 256 < 2 is always false, so this extra clearing loop never runs in this expansion. | |
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = uint(-2147483648); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// Level-0 scan produced by nbl_glsl_subgroupInclusiveMax_impl. | |
uint firstLevelScan = uint(nbl_glsl_subgroupInclusiveMax_impl(false, val)); | |
uint scan = firstLevelScan; | |
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Up-sweep: with lastInvocation = 255 the loop body runs for levels 255 -> 63 and 63 -> 15. | |
// Each pass: selected invocations (possibleProp, or the last one in the level) store `scan` to nextStoreIndex, | |
// then the invocations still participating rescan the stored values and record the result at scanStoreIndex. | |
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = uint(nbl_glsl_subgroupInclusiveMax_impl(false, int(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last up-sweep level (15 -> 3 here): same steps as the loop body, but scanStoreIndex is not advanced afterwards. | |
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = uint(nbl_glsl_subgroupInclusiveMax_impl(false, int(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Down-sweep: fold each higher level's stored results into the level below using signed max(). | |
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// pseudoSubgroupID < shiftedInvocationIndex, so this unsigned difference wraps; it is only used as an additive index offset. | |
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// findMSB(255) = 7, so logShift starts at 4 and the body runs for logShift = 4 and 2. | |
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = uint(max(int(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), int(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations with pseudoSubgroupID != 0 combine a higher-level value from scratch into their level-0 scan. | |
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = uint(max(int(higherLevelExclusive), int(firstLevelScan))); | |
} | |
} | |
// Inclusive variant: the shift-by-one branch is disabled by if (false), so the else branch returns the scan directly. | |
if (false) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? int(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : -2147483648; | |
} | |
else | |
return int(firstLevelScan); | |
; | |
} | |
// Workgroup inclusive max scan of `val` (float overload); the caller is expected to provide the surrounding barrier() calls | |
// (nbl_glsl_workgroupInclusiveMax does exactly that). | |
// Values travel through the uint scratch array nbl_glsl_workgroupArithmeticScratchShared (declared outside this chunk) | |
// as raw bits: floatBitsToUint on store, uintBitsToFloat before each max(). | |
// Stages: up-sweep (rescan partial results level by level), then down-sweep (fold higher levels back with max()). | |
// The max identity used for padding is -(1.f / 0.f). Returns the combined level-0 scan for this invocation. | |
// NOTE(review): the nbl_glsl_subgroup_* index helpers are defined elsewhere; remarks about scratch layout are assumptions to confirm. | |
float nbl_glsl_workgroupInclusiveMax_noBarriers(in float val) | |
{ | |
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Seed scratch: this invocation's value goes to its store slot and one further slot is filled with the identity. | |
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val); | |
const uint halfMask = loMask >> 1u; | |
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint(-(1.f / 0.f)); | |
// 256 < 2 is always false, so this extra clearing loop never runs in this expansion. | |
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint(-(1.f / 0.f)); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// Level-0 scan produced by nbl_glsl_subgroupInclusiveMax_impl, kept as raw float bits. | |
uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveMax_impl(false, val)); | |
uint scan = firstLevelScan; | |
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Up-sweep: with lastInvocation = 255 the loop body runs for levels 255 -> 63 and 63 -> 15. | |
// Each pass: selected invocations (possibleProp, or the last one in the level) store `scan` to nextStoreIndex, | |
// then the invocations still participating rescan the stored values and record the result at scanStoreIndex. | |
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMax_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last up-sweep level (15 -> 3 here): same steps as the loop body, but scanStoreIndex is not advanced afterwards. | |
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMax_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Down-sweep: fold each higher level's stored results into the level below using float max(). | |
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// pseudoSubgroupID < shiftedInvocationIndex, so this unsigned difference wraps; it is only used as an additive index offset. | |
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// findMSB(255) = 7, so logShift starts at 4 and the body runs for logShift = 4 and 2. | |
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = floatBitsToUint(max(uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Invocations with pseudoSubgroupID != 0 combine a higher-level value from scratch into their level-0 scan. | |
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = floatBitsToUint(max(uintBitsToFloat(higherLevelExclusive), uintBitsToFloat(firstLevelScan))); | |
} | |
} | |
// Inclusive variant: the shift-by-one branch is disabled by if (false), so the else branch returns the scan directly. | |
if (false) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : -(1.f / 0.f); | |
} | |
else | |
return uintBitsToFloat(firstLevelScan); | |
; | |
} | |
// Barrier-guarded inclusive max (uint overload): issues barrier() before and after the _noBarriers variant and returns its result. | |
uint nbl_glsl_workgroupInclusiveMax(in uint val) | |
{ | |
barrier(); | |
const uint scanned = nbl_glsl_workgroupInclusiveMax_noBarriers(val); | |
barrier(); | |
return scanned; | |
} | |
// Barrier-guarded inclusive max (int overload): issues barrier() before and after the _noBarriers variant and returns its result. | |
int nbl_glsl_workgroupInclusiveMax(in int val) | |
{ | |
barrier(); | |
const int scanned = nbl_glsl_workgroupInclusiveMax_noBarriers(val); | |
barrier(); | |
return scanned; | |
} | |
// Barrier-guarded inclusive max (float overload): issues barrier() before and after the _noBarriers variant and returns its result. | |
float nbl_glsl_workgroupInclusiveMax(in float val) | |
{ | |
barrier(); | |
const float scanned = nbl_glsl_workgroupInclusiveMax_noBarriers(val); | |
barrier(); | |
return scanned; | |
} | |
// Workgroup exclusive max-scan of `val`, uint overload, specialised for a
// 256-invocation workgroup and groups of four invocations ("pseudo subgroups").
// Works through nbl_glsl_workgroupArithmeticScratchShared and calls barrier()
// internally, so the whole workgroup has to reach it in uniform control flow.
// No barrier is issued on entry or after the final scratch read; the
// nbl_glsl_workgroupExclusiveMax wrappers add those.
// Returns the value read back from scratch slot [gl_LocalInvocationIndex], i.e.
// the combined scan result stored by the previous invocation (see the note at
// the return statement for invocation 0).
uint nbl_glsl_workgroupExclusiveMax_noBarriers(in uint val) | |
{ | |
// loMask == 3: lane mask for groups of four (pseudoSubgroupID below is index >> 2).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Handed to the helper below and never read again in this function.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Publish this invocation's input in its scratch slot.
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = nbl_glsl_identityFunction(val); | |
const uint halfMask = loMask >> 1u; | |
// Write the max identity (0u) to a second slot derived from the invocation index.
// Presumably padding read by the emulated subgroup scan -- TODO confirm in the subgroup helpers.
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = nbl_glsl_identityFunction(0u); | |
// 256 < 2 is constant-false: this extra clearing loop never runs for this workgroup size.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = nbl_glsl_identityFunction(0u); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// Level 0: inclusive max inside each group of four (helper defined elsewhere).
uint firstLevelScan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMax_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when the helper-reported lane index equals loMask (3).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Upsweep. Per pass: flagged lanes (and the level's last invocation) hand their
// partial result to the next level, the level shrinks by 4x, and the remaining
// invocations scan the handed-over values. Starting at 255 the loop body runs
// for 255 and 63 and leaves lastInvocationInLevel at 15.
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMax_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Move the per-level store cursor past the level just written.
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last upsweep pass (15 -> 3): same steps as the loop body, but the store cursor is not advanced.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = nbl_glsl_identityFunction(nbl_glsl_subgroupInclusiveMax_impl(false, nbl_glsl_identityFunction(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Downsweep: fold the coarser levels back into the finer ones.
// lastInvocation is the constant 255, so this branch is always taken.
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// Unsigned subtraction that wraps (index >> 2 is always below index + 4);
// it behaves as a negative offset once added to scanLoadIndex.
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// logShift starts at (findMSB(255) / 2 - 1) * 2 == 4 and takes the values 4, 2.
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = nbl_glsl_identityFunction(max(nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Every group of four except the first merges the value loaded from the coarser level into its level-0 scan.
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = nbl_glsl_identityFunction(max(nbl_glsl_identityFunction(higherLevelExclusive), nbl_glsl_identityFunction(firstLevelScan))); | |
} | |
} | |
// Constant-true: exclusive variant. Each invocation stores its result one slot up and reads its own slot back.
if (true) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
// NOTE(review): with 256 invocations the second bvec2 component is always true, so any() never
// selects the 0u fallback and invocation 0 returns scratch[0]. Slot 0 was set to the identity by
// invocation 0 at the top; whether any later store can hit slot 0 depends on the offset helpers -- confirm.
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? nbl_glsl_identityFunction(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : 0u; | |
} | |
else | |
return nbl_glsl_identityFunction(firstLevelScan); | |
; | |
} | |
// Workgroup exclusive max-scan of `val`, int overload, specialised for a
// 256-invocation workgroup and groups of four invocations ("pseudo subgroups").
// Values travel through the uint scratch array nbl_glsl_workgroupArithmeticScratchShared
// via uint()/int() conversions. barrier() is called internally, so the whole
// workgroup has to reach this function in uniform control flow; entry/exit
// barriers are left to the nbl_glsl_workgroupExclusiveMax wrappers.
// Returns the value read back from scratch slot [gl_LocalInvocationIndex], i.e.
// the combined scan result stored by the previous invocation.
int nbl_glsl_workgroupExclusiveMax_noBarriers(in int val) | |
{ | |
// loMask == 3: lane mask for groups of four (pseudoSubgroupID below is index >> 2).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Handed to the helper below and never read again in this function.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Publish this invocation's input in its scratch slot.
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = uint(val); | |
const uint halfMask = loMask >> 1u; | |
// Write the max identity (most negative int) to a second slot derived from the invocation index.
// Presumably padding read by the emulated subgroup scan -- TODO confirm in the subgroup helpers.
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = uint(-2147483648); | |
// 256 < 2 is constant-false: this extra clearing loop never runs for this workgroup size.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = uint(-2147483648); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// Level 0: inclusive max inside each group of four (helper defined elsewhere).
uint firstLevelScan = uint(nbl_glsl_subgroupInclusiveMax_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when the helper-reported lane index equals loMask (3).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Upsweep. Per pass: flagged lanes (and the level's last invocation) hand their
// partial result to the next level, the level shrinks by 4x, and the remaining
// invocations scan the handed-over values. Starting at 255 the loop body runs
// for 255 and 63 and leaves lastInvocationInLevel at 15.
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = uint(nbl_glsl_subgroupInclusiveMax_impl(false, int(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Move the per-level store cursor past the level just written.
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last upsweep pass (15 -> 3): same steps as the loop body, but the store cursor is not advanced.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = uint(nbl_glsl_subgroupInclusiveMax_impl(false, int(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Downsweep: fold the coarser levels back into the finer ones.
// lastInvocation is the constant 255, so this branch is always taken.
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// Unsigned subtraction that wraps (index >> 2 is always below index + 4);
// it behaves as a negative offset once added to scanLoadIndex.
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// logShift starts at (findMSB(255) / 2 - 1) * 2 == 4 and takes the values 4, 2.
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = uint(max(int(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), int(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Every group of four except the first merges the value loaded from the coarser level into its level-0 scan.
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = uint(max(int(higherLevelExclusive), int(firstLevelScan))); | |
} | |
} | |
// Constant-true: exclusive variant. Each invocation stores its result one slot up and reads its own slot back.
if (true) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
// NOTE(review): with 256 invocations the second bvec2 component is always true, so any() never
// selects the identity fallback and invocation 0 returns scratch[0]. Slot 0 was set to the identity by
// invocation 0 at the top; whether any later store can hit slot 0 depends on the offset helpers -- confirm.
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? int(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : -2147483648; | |
} | |
else | |
return int(firstLevelScan); | |
; | |
} | |
// Workgroup exclusive max-scan of `val`, float overload, specialised for a
// 256-invocation workgroup and groups of four invocations ("pseudo subgroups").
// Values travel through the uint scratch array nbl_glsl_workgroupArithmeticScratchShared
// as raw bits (floatBitsToUint / uintBitsToFloat). barrier() is called internally,
// so the whole workgroup has to reach this function in uniform control flow;
// entry/exit barriers are left to the nbl_glsl_workgroupExclusiveMax wrappers.
// Returns the value read back from scratch slot [gl_LocalInvocationIndex], i.e.
// the combined scan result stored by the previous invocation.
float nbl_glsl_workgroupExclusiveMax_noBarriers(in float val) | |
{ | |
// loMask == 3: lane mask for groups of four (pseudoSubgroupID below is index >> 2).
const uint loMask = (0x1 << 2) - 1u; | |
const uint pseudoSubgroupElectedInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, gl_LocalInvocationIndex); | |
const uint pseudoSubgroupInvocation = nbl_glsl_subgroup_impl_pseudoSubgroupInvocation(loMask, gl_LocalInvocationIndex); | |
const uint subgroupMemoryStart = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStart(pseudoSubgroupElectedInvocation); | |
// Handed to the helper below and never read again in this function.
uint lastLoadOffset = 0xdeadbeefu; | |
const uint subgroupScanStoreOffset = nbl_glsl_subgroup_impl_getSubgroupEmulationMemoryStoreOffset(subgroupMemoryStart, pseudoSubgroupInvocation, lastLoadOffset); | |
{ | |
// Publish this invocation's input (as raw bits) in its scratch slot.
nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset] = floatBitsToUint(val); | |
const uint halfMask = loMask >> 1u; | |
// Write the max identity (-(1.f / 0.f), i.e. negative infinity) to a second slot derived from the invocation index.
// Presumably padding read by the emulated subgroup scan -- TODO confirm in the subgroup helpers.
nbl_glsl_workgroupArithmeticScratchShared[((((gl_LocalInvocationIndex) & (~halfMask)) << 2u) | ((gl_LocalInvocationIndex)&halfMask))] = floatBitsToUint(-(1.f / 0.f)); | |
// 256 < 2 is constant-false: this extra clearing loop never runs for this workgroup size.
if (256 < ((0x1 << 2) >> 1)) | |
{ | |
const uint maxItemsToClear = (nbl_glsl_subgroup_impl_pseudoSubgroupElectedInvocation(loMask, 256 - 1u) >> 1u) + ((0x1 << 2) >> 1); | |
for (uint ix = gl_LocalInvocationIndex + 256; ix < maxItemsToClear; ix += 256) | |
nbl_glsl_workgroupArithmeticScratchShared[((((ix) & (~halfMask)) << 2u) | ((ix)&halfMask))] = floatBitsToUint(-(1.f / 0.f)); | |
} | |
barrier(); | |
} | |
const uint lastInvocation = 256 - 1u; | |
uint lastInvocationInLevel = lastInvocation; | |
// Level 0: inclusive max inside each group of four (helper defined elsewhere).
uint firstLevelScan = floatBitsToUint(nbl_glsl_subgroupInclusiveMax_impl(false, val)); | |
uint scan = firstLevelScan; | |
// True when the helper-reported lane index equals loMask (3).
const bool possibleProp = pseudoSubgroupInvocation == loMask; | |
const uint pseudoSubgroupID = gl_LocalInvocationIndex >> 2; | |
const uint nextStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, pseudoSubgroupID); | |
uint scanStoreIndex = nbl_glsl_subgroup_getSubgroupEmulationMemoryStoreOffset(loMask, lastInvocation) + gl_LocalInvocationIndex + 1u; | |
bool participate = gl_LocalInvocationIndex <= lastInvocationInLevel; | |
// Upsweep. Per pass: flagged lanes (and the level's last invocation) hand their
// partial result to the next level, the level shrinks by 4x, and the remaining
// invocations scan the handed-over values. Starting at 255 the loop body runs
// for 255 and 63 and leaves lastInvocationInLevel at 15.
while (lastInvocationInLevel >= (0x1 << 2) * (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMax_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
// Move the per-level store cursor past the level just written.
if (true) | |
scanStoreIndex += lastInvocationInLevel + 1u; | |
} | |
// Last upsweep pass (15 -> 3): same steps as the loop body, but the store cursor is not advanced.
if (lastInvocationInLevel >= (0x1 << 2)) | |
{ | |
barrier(); | |
if (participate) | |
{ | |
if (any(bvec2(gl_LocalInvocationIndex == lastInvocationInLevel, possibleProp))) | |
nbl_glsl_workgroupArithmeticScratchShared[nextStoreIndex] = scan; | |
} | |
barrier(); | |
participate = gl_LocalInvocationIndex <= (lastInvocationInLevel >>= 2); | |
if (participate) | |
{ | |
const uint prevLevelScan = nbl_glsl_workgroupArithmeticScratchShared[subgroupScanStoreOffset]; | |
scan = floatBitsToUint(nbl_glsl_subgroupInclusiveMax_impl(false, uintBitsToFloat(prevLevelScan))); | |
if (true) | |
nbl_glsl_workgroupArithmeticScratchShared[scanStoreIndex] = scan; | |
} | |
} | |
barrier(); | |
// Downsweep: fold the coarser levels back into the finer ones.
// lastInvocation is the constant 255, so this branch is always taken.
if (lastInvocation >= (0x1 << 2)) | |
{ | |
uint scanLoadIndex = scanStoreIndex + (0x1 << 2); | |
const uint shiftedInvocationIndex = gl_LocalInvocationIndex + (0x1 << 2); | |
// Unsigned subtraction that wraps (index >> 2 is always below index + 4);
// it behaves as a negative offset once added to scanLoadIndex.
const uint currentToHighLevel = pseudoSubgroupID - shiftedInvocationIndex; | |
// logShift starts at (findMSB(255) / 2 - 1) * 2 == 4 and takes the values 4, 2.
for (uint logShift = (findMSB(lastInvocation) / 2 - 1u) * 2; logShift > 0u; logShift -= 2) | |
{ | |
lastInvocationInLevel = lastInvocation >> logShift; | |
barrier(); | |
const uint currentLevelIndex = scanLoadIndex - (lastInvocationInLevel + 1u); | |
if (shiftedInvocationIndex <= lastInvocationInLevel) | |
nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex] = floatBitsToUint(max(uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel]), uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[currentLevelIndex]))); | |
scanLoadIndex = currentLevelIndex; | |
} | |
barrier(); | |
// Every group of four except the first merges the value loaded from the coarser level into its level-0 scan.
if (gl_LocalInvocationIndex <= lastInvocation && pseudoSubgroupID != 0u) | |
{ | |
const uint higherLevelExclusive = nbl_glsl_workgroupArithmeticScratchShared[scanLoadIndex + currentToHighLevel - 1u]; | |
firstLevelScan = floatBitsToUint(max(uintBitsToFloat(higherLevelExclusive), uintBitsToFloat(firstLevelScan))); | |
} | |
} | |
// Constant-true: exclusive variant. Each invocation stores its result one slot up and reads its own slot back.
if (true) | |
{ | |
if (gl_LocalInvocationIndex < lastInvocation) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex + 1u] = firstLevelScan; | |
barrier(); | |
// NOTE(review): with 256 invocations the second bvec2 component is always true, so any() never
// selects the identity fallback and invocation 0 returns scratch[0]. Slot 0 was set to the identity by
// invocation 0 at the top; whether any later store can hit slot 0 depends on the offset helpers -- confirm.
return any(bvec2(gl_LocalInvocationIndex != 0u, gl_LocalInvocationIndex <= lastInvocation)) ? uintBitsToFloat(nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex]) : -(1.f / 0.f); | |
} | |
else | |
return uintBitsToFloat(firstLevelScan); | |
; | |
} | |
// Workgroup exclusive max-scan, uint overload, with guarding barriers.
// Calls the _noBarriers implementation between two barrier() calls, so it must
// be reached by the whole workgroup.
uint nbl_glsl_workgroupExclusiveMax(in uint val)
{
    barrier();
    uint scanned = nbl_glsl_workgroupExclusiveMax_noBarriers(val);
    barrier();
    return scanned;
}
// Workgroup exclusive max-scan, int overload, with guarding barriers.
// Calls the _noBarriers implementation between two barrier() calls, so it must
// be reached by the whole workgroup.
int nbl_glsl_workgroupExclusiveMax(in int val)
{
    barrier();
    int scanned = nbl_glsl_workgroupExclusiveMax_noBarriers(val);
    barrier();
    return scanned;
}
// Workgroup exclusive max-scan, float overload, with guarding barriers.
// Calls the _noBarriers implementation between two barrier() calls, so it must
// be reached by the whole workgroup.
float nbl_glsl_workgroupExclusiveMax(in float val)
{
    barrier();
    float scanned = nbl_glsl_workgroupExclusiveMax_noBarriers(val);
    barrier();
    return scanned;
}
# 7 "../../../../nbl/builtin/glsl/scan/virtual_workgroup.glsl" 2 | |
// Executes one virtual workgroup of the scan.
//   treeLevel           - level of the scan tree being processed (0 .. topLevel << 1).
//   localWorkgroupIndex - index of this virtual workgroup within that level.
// Loads one element per invocation (IDENTITY when out of range), applies the
// workgroup operation selected by the level, and hands the result to
// nbl_glsl_scan_setData. IDENTITY / REDUCTION / INCLUSIVE / EXCLUSIVE and
// _NBL_GLSL_SCAN_STORAGE_TYPE_ are left unexpanded in this preprocessed file.
void nbl_glsl_scan_virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex)
{
    const nbl_glsl_scan_Parameters_t params = nbl_glsl_scan_getParameters();
    const uint levelInvocationIndex = gl_LocalInvocationIndex + localWorkgroupIndex * 256;
    // Not referenced again in the visible body; kept because the unexpanded macros may use it.
    const bool lastInvocationInGroup = (256 - 1) == gl_LocalInvocationIndex;
    const uint lastLevel = params.topLevel << 1u;

    // Levels above the top mirror the levels below it.
    uint pseudoLevel = treeLevel;
    if (treeLevel > params.topLevel)
        pseudoLevel = lastLevel - treeLevel;

    const bool inRange = params.lastElement[pseudoLevel] >= levelInvocationIndex;
# 61 "../../../../nbl/builtin/glsl/scan/virtual_workgroup.glsl"
    _NBL_GLSL_SCAN_STORAGE_TYPE_ data = IDENTITY;
    if (inRange)
    {
        nbl_glsl_scan_getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel);
    }

    if (treeLevel < params.topLevel)
    {
        // Below the top level.
        data = REDUCTION(data);
    }
    else if (params.topLevel == 0u || treeLevel != params.topLevel)
    {
        // Single-level scan, or any level above the top.
        data = INCLUSIVE(data);
    }
    else
    {
        // Exactly the top level of a multi-level scan.
        data = EXCLUSIVE(data);
    }

    nbl_glsl_scan_setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange);
}
# 1 "../../../../nbl/builtin/glsl/scan/default_scheduler.glsl" 1 | |
# 9 "../../../../nbl/builtin/glsl/scan/default_scheduler.glsl" | |
// Parameters consumed by the default scan scheduler (getWork / markComplete).
// Filled in by nbl_glsl_scan_scheduler_computeParameters.
struct nbl_glsl_scan_DefaultSchedulerParameters_t | |
{ | |
// Per tree level: base index into scanScratch.data of that level's "finished" flags.
// Only 6 entries (7 - 1), so valid indices are 0..5.
uint finishedFlagOffset[7 - 1]; | |
// Per tree level: running total of workgroups up to and including that level;
// getWork compares the global workgroup ticket against these.
uint cumulativeWorkgroupCount[7]; | |
}; | |
// Derives the scan and scheduler parameters for a scan over `elementCount` elements.
//   elementCount     - number of elements to scan.
//   _scanParams      - (out) topLevel, per-level last element index, temporary storage offsets.
//   _schedulerParams - (out) per-level cumulative workgroup counts and finished-flag offsets.
// Only topLevel values 1, 2 and 3 fill the higher-level entries; any other value
// leaves them unwritten.
// NOTE(review): for elementCount == 1, findMSB(0u) is -1, so topLevel relies on
// _NBL_GLSL_WORKGROUP_SIZE_LOG2_ (not expanded here) being a signed literal -- confirm.
void nbl_glsl_scan_scheduler_computeParameters(in uint elementCount, out nbl_glsl_scan_Parameters_t _scanParams, out nbl_glsl_scan_DefaultSchedulerParameters_t _schedulerParams)
{
    _scanParams.lastElement[0] = elementCount - 1u;
    _scanParams.topLevel = findMSB(_scanParams.lastElement[0]) / _NBL_GLSL_WORKGROUP_SIZE_LOG2_;

    // Last element index of each coarser level.
    for (int level = 1; level <= 7 / 2; ++level)
        _scanParams.lastElement[level] = _scanParams.lastElement[level - 1] >> _NBL_GLSL_WORKGROUP_SIZE_LOG2_;

    // Workgroups needed to cover levels 0, 1 and 2.
    const uint wgCount0 = _scanParams.lastElement[1] + 1u;
    const uint wgCount1 = _scanParams.lastElement[2] + 1u;
    const uint wgCount2 = _scanParams.lastElement[3] + 1u;

    _schedulerParams.cumulativeWorkgroupCount[0] = wgCount0;
    _schedulerParams.finishedFlagOffset[0] = 0u;

    if (_scanParams.topLevel == 1u)
    {
        _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0] + 1u;
        _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1] + wgCount0;

        _schedulerParams.finishedFlagOffset[1] = 1u;

        _scanParams.temporaryStorageOffset[0] = 2u;
    }
    else if (_scanParams.topLevel == 2u)
    {
        _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0] + wgCount1;
        _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1] + 1u;
        _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2] + wgCount1;
        _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3] + wgCount0;

        _schedulerParams.finishedFlagOffset[1] = wgCount1;
        _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1] + 1u;
        _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1] + 2u;

        _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3] + wgCount1;
        _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0] + wgCount0;
    }
    else if (_scanParams.topLevel == 3u)
    {
        _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0] + wgCount1;
        _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1] + wgCount2;
        _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2] + 1u;
        _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3] + wgCount2;
        _schedulerParams.cumulativeWorkgroupCount[5] = _schedulerParams.cumulativeWorkgroupCount[4] + wgCount1;
        _schedulerParams.cumulativeWorkgroupCount[6] = _schedulerParams.cumulativeWorkgroupCount[5] + wgCount0;

        _schedulerParams.finishedFlagOffset[1] = wgCount1;
        _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1] + wgCount2;
        _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[2] + 1u;
        _schedulerParams.finishedFlagOffset[4] = _schedulerParams.finishedFlagOffset[2] + 2u;
        _schedulerParams.finishedFlagOffset[5] = _schedulerParams.finishedFlagOffset[4] + wgCount2;

        _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[5] + wgCount1;
        _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0] + wgCount0;
        _scanParams.temporaryStorageOffset[2] = _scanParams.temporaryStorageOffset[1] + wgCount1;
    }
}
// Hands the calling workgroup its next unit of work.
//   params              - per-level cumulative workgroup counts and finished-flag offsets.
//   topLevel            - top level of the scan tree; levels run 0 .. (topLevel << 1).
//   treeLevel           - (out) tree level assigned to this workgroup.
//   localWorkgroupIndex - (out) workgroup index within that level; not written when true is returned.
// Returns true when the ticket drawn lies beyond the last level (nothing left to do),
// false when work was assigned and its dependencies have been waited for.
// Uses barrier() and shared scratch slots 0 and 1, so the whole workgroup must call it together.
bool nbl_glsl_scan_scheduler_getWork(in nbl_glsl_scan_DefaultSchedulerParameters_t params, in uint topLevel, out uint treeLevel, out uint localWorkgroupIndex) | |
{ | |
// Invocation 0 draws a global ticket and publishes it in scratch slot 0;
// invocation 1 zeroes slot 1, which is used as a counter below.
if (gl_LocalInvocationIndex == 0u) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex] = atomicAdd(scanScratch.workgroupsStarted, 1u); | |
else if (gl_LocalInvocationIndex == 1u) | |
nbl_glsl_workgroupArithmeticScratchShared[gl_LocalInvocationIndex] = 0u; | |
barrier(); | |
const uint globalWorkgroupIndex = nbl_glsl_workgroupArithmeticScratchShared[0u]; | |
const uint lastLevel = topLevel << 1u; | |
// One invocation per level: count the levels whose cumulative workgroup count the
// ticket has already reached. That count is the tree level for this ticket.
if (gl_LocalInvocationIndex <= lastLevel && globalWorkgroupIndex >= params.cumulativeWorkgroupCount[gl_LocalInvocationIndex]) | |
atomicAdd(nbl_glsl_workgroupArithmeticScratchShared[1u], 1u); | |
barrier(); | |
treeLevel = nbl_glsl_workgroupArithmeticScratchShared[1u]; | |
// Ticket is past every level: no work left.
if (treeLevel > lastLevel) | |
return true; | |
localWorkgroupIndex = globalWorkgroupIndex; | |
// Every level except 0 waits on results of the previous level.
const bool dependantLevel = treeLevel != 0u; | |
if (dependantLevel) | |
{ | |
const uint prevLevel = treeLevel - 1u; | |
// Make the index relative to the start of this level.
localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel]; | |
if (gl_LocalInvocationIndex == 0u) | |
{ | |
// Expected flag value: 1 for levels above topLevel; for levels up to topLevel it is
// 256, except for the level's last workgroup where it is computed from lastElement.
uint dependentsCount = 1u; | |
if (treeLevel <= topLevel) | |
{ | |
dependentsCount = 256; | |
const bool lastWorkgroup = (globalWorkgroupIndex + 1u) == params.cumulativeWorkgroupCount[treeLevel]; | |
if (lastWorkgroup) | |
{ | |
const nbl_glsl_scan_Parameters_t scanParams = nbl_glsl_scan_getParameters(); | |
dependentsCount = scanParams.lastElement[treeLevel] + 1u; | |
if (treeLevel < topLevel) | |
dependentsCount -= scanParams.lastElement[treeLevel + 1u] * 256; | |
} | |
} | |
// Flag to watch: above topLevel, 256 consecutive workgroups share one flag of the previous level.
uint dependentsFinishedFlagOffset = localWorkgroupIndex; | |
if (treeLevel > topLevel) | |
dependentsFinishedFlagOffset /= 256; | |
dependentsFinishedFlagOffset += params.finishedFlagOffset[prevLevel]; | |
// Spin until the flag reaches the expected value, issuing a buffer memory barrier on each poll.
while (scanScratch.data[dependentsFinishedFlagOffset] != dependentsCount) | |
memoryBarrierBuffer(); | |
} | |
} | |
// Hold the other invocations until invocation 0 has finished waiting.
barrier(); | |
memoryBarrierBuffer(); | |
return false; | |
} | |
// Signals that the virtual workgroup (treeLevel, localWorkgroupIndex) has finished.
//   params              - scheduler parameters; finishedFlagOffset[] gives each level's flag base.
//   topLevel            - top level of the scan tree; the last level is (topLevel << 1).
//   treeLevel           - level the finished workgroup belongs to.
//   localWorkgroupIndex - index of the finished workgroup within that level.
// Issues a buffer memory barrier first, then invocation 0 updates the flag in
// scanScratch.data:
//   - below topLevel, 256 consecutive workgroups share one flag, which is incremented;
//   - from topLevel up to (but excluding) the last level, each workgroup sets its own flag to 1;
//   - the last level writes nothing.
void nbl_glsl_scan_scheduler_markComplete(in nbl_glsl_scan_DefaultSchedulerParameters_t params, in uint topLevel, in uint treeLevel, in uint localWorkgroupIndex)
{
    memoryBarrierBuffer();
    if (gl_LocalInvocationIndex != 0u)
        return;

    // finishedFlagOffset has 7 - 1 entries while treeLevel can be as large as
    // topLevel << 1 (6 when topLevel is 3). Read it only on the paths that use
    // it, so the last level never indexes one past the end of the array.
    if (treeLevel < topLevel)
    {
        const uint finishedFlagOffset = params.finishedFlagOffset[treeLevel] + localWorkgroupIndex / 256;
        atomicAdd(scanScratch.data[finishedFlagOffset], 1u);
    }
    else if (treeLevel != (topLevel << 1u))
    {
        const uint finishedFlagOffset = params.finishedFlagOffset[treeLevel] + localWorkgroupIndex;
        scanScratch.data[finishedFlagOffset] = 1u;
    }
}
# 86 "../../../../nbl/builtin/glsl/scan/virtual_workgroup.glsl" 2 | |
// Forward declaration: the including shader supplies the definition (in this file it returns pc.schedulerParams).
nbl_glsl_scan_DefaultSchedulerParameters_t nbl_glsl_scan_getSchedulerParameters(); | |
// Scan entry point: keeps requesting work from the scheduler, runs the assigned
// virtual workgroup and marks it complete, until the scheduler reports that no
// work is left.
void nbl_glsl_scan_main()
{
    const nbl_glsl_scan_DefaultSchedulerParameters_t schedulerParams = nbl_glsl_scan_getSchedulerParameters();
    const uint topLevel = nbl_glsl_scan_getParameters().topLevel;

    uint treeLevel, localWorkgroupIndex;
    // getWork returns true once every level has been handed out.
    while (!nbl_glsl_scan_scheduler_getWork(schedulerParams, topLevel, treeLevel, localWorkgroupIndex))
    {
        nbl_glsl_scan_virtualWorkgroup(treeLevel, localWorkgroupIndex);
        nbl_glsl_scan_scheduler_markComplete(schedulerParams, topLevel, treeLevel, localWorkgroupIndex);
    }
}
# 9 "direct.comp" 2 | |
// Push-constant block carrying the scan parameters and the scheduler parameters;
// read through nbl_glsl_scan_getParameters / nbl_glsl_scan_getSchedulerParameters.
layout(push_constant) uniform PushConstants | |
{ | |
nbl_glsl_scan_Parameters_t scanParams; | |
nbl_glsl_scan_DefaultSchedulerParameters_t schedulerParams; | |
} | |
pc; | |
// Returns the scan parameters stored in the push-constant block.
nbl_glsl_scan_Parameters_t nbl_glsl_scan_getParameters()
{
    const nbl_glsl_scan_Parameters_t scanParams = pc.scanParams;
    return scanParams;
}
// Returns the scheduler parameters stored in the push-constant block.
nbl_glsl_scan_DefaultSchedulerParameters_t nbl_glsl_scan_getSchedulerParameters()
{
    const nbl_glsl_scan_DefaultSchedulerParameters_t schedulerParams = pc.schedulerParams;
    return schedulerParams;
}
// Shader entry point: runs the scheduler-driven scan loop.
void main()
{
    nbl_glsl_scan_main();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment