/*
* Thread-local cache for meshoptimizer allocations. Planned for inclusion into future meshoptimizer versions.
*
* Copyright (C) 2016-2025, by Arseny Kapoulkine ([email protected])
* This code is distributed under the MIT License.
*/
#include "meshoptimizer.h" // meshopt_setAllocator, MESHOPTIMIZER_ALLOC_CALLCONV

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>

// reconfigure the thread cache for meshopt_ allocations: thread_count threads x size_per_thread bytes each
// must not be called concurrently with meshopt_ or clod functions; see the usage sketch after the function
void clodUseThreadCache(size_t thread_count, size_t size_per_thread)
{
	struct Global
	{
		void* data;
		size_t block_size;
		uint64_t all_blocks;
		std::atomic<uint64_t> blocks{0};
	};

	struct Local
	{
		void* block;
		size_t offset;
		uint64_t block_mask;
	};

	static Global global;
	thread_local Local local;

	// reset prior global state
	// note: all previously allocated blocks must have been returned at this point; this is guaranteed by the absence of concurrent execution with meshopt_/clod functions
	assert(global.blocks.load() == global.all_blocks);

	::operator delete(global.data);
	global.data = NULL;
	global.block_size = 0;
	global.all_blocks = 0;
	global.blocks = 0;

	thread_count = std::min(thread_count, size_t(64));
	size_per_thread &= ~size_t(15);

	if (thread_count == 0 || size_per_thread == 0)
	{
		meshopt_setAllocator(::operator new, ::operator delete);
		return;
	}

	// allocate a block for each thread and mark each block as available
	global.data = ::operator new(thread_count * size_per_thread);
	global.block_size = size_per_thread;
	global.blocks = global.all_blocks = (thread_count == 64) ? ~0ull : (1ull << thread_count) - 1;

	// override allocation callbacks
	void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t) = [](size_t size) -> void*
	{
		// try to grab an available local block
		if (local.block == NULL && global.blocks.load() != 0 && size < global.block_size)
		{
			uint64_t blocks, mask;

			do
			{
				blocks = global.blocks.load();

				// prefer last index for coherency, but settle for lowest bit otherwise
				mask = (blocks & local.block_mask) ? local.block_mask : blocks & -blocks;

				// no available block, unlikely to get one soon
				if (blocks == 0)
					break;
			} while (!global.blocks.compare_exchange_weak(blocks, blocks & ~mask));

			if (mask)
			{
				// extract block index from mask (must only have one bit set)
				int index = -1;
				for (int i = 0; i < 64; ++i)
					if (mask & (1ull << i))
					{
						index = i;
						break;
					}

				assert(index >= 0);
				assert(mask && (mask & (mask - 1)) == 0);

				local.block = static_cast<char*>(global.data) + index * global.block_size;
				local.block_mask = mask;
			}
		}

		// allocate from local block if any
		if (local.block && size < global.block_size && local.offset < global.block_size - size)
		{
			void* ptr = static_cast<char*>(local.block) + local.offset;
			local.offset += size;
			local.offset = (local.offset + 15) & ~size_t(15); // align future allocations to 16b
			return ptr;
		}

		// fall back to system allocator
		return ::operator new(size);
	};

	void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*) = [](void* ptr)
	{
		// has our allocation come from thread cache?
		if (local.block && ptr >= local.block && ptr < static_cast<char*>(local.block) + global.block_size)
		{
			// meshopt allocations are guaranteed to be stack ordered
			assert(ptr <= static_cast<char*>(local.block) + local.offset);
			local.offset = static_cast<char*>(ptr) - static_cast<char*>(local.block);

			// return local block to the pool
			if (local.offset == 0)
			{
				assert(local.block_mask);
				global.blocks |= local.block_mask;
				local.block = NULL;
				// keep block_mask as an affinity hint for the next allocation
			}
		}
		else
			::operator delete(ptr);
	};

	meshopt_setAllocator(allocate, deallocate);
}
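
// Usage sketch (not part of the original gist): simplifyInParallel and worker_count are
// hypothetical names; only clodUseThreadCache and meshopt_setAllocator come from the
// code above. The idea is to size one scratch block per worker before any parallel
// meshopt_/clod work starts, and to disable the cache (restoring the default
// ::operator new / ::operator delete allocator) once all workers have joined.
void simplifyInParallel(size_t worker_count)
{
	// one 16 MB block per thread; blocks are handed out to threads on first allocation
	clodUseThreadCache(worker_count, 16 * 1024 * 1024);

	// ... run meshopt_/clod processing on up to worker_count threads here ...
	// each thread serves allocations from its cached block and falls back to
	// ::operator new when the block is exhausted or the request is too large

	// all worker threads must have finished their meshopt_/clod calls by now
	clodUseThreadCache(0, 0); // passing 0 restores the default allocator
}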