/*
* Thread-local cache for meshoptimizer allocations. Planned for inclusion into future meshoptimizer versions.
*
* Copyright (C) 2016-2025, by Arseny Kapoulkine ([email protected])
* This code is distributed under the MIT License.
*/
#include "meshoptimizer.h" // meshopt_setAllocator, MESHOPTIMIZER_ALLOC_CALLCONV

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>

// reconfigure the thread cache for meshopt_ allocations: thread_count threads x size_per_thread bytes each
// must not be called concurrently with meshopt_ or clod functions; see the usage sketch after the function
void clodUseThreadCache(size_t thread_count, size_t size_per_thread)
{
	struct Global
	{
		void* data;
		size_t block_size;
		uint64_t all_blocks;
		std::atomic<uint64_t> blocks{0};
	};

	struct Local
	{
		void* block;
		size_t offset;
		uint64_t block_mask;
	};

	static Global global;
	thread_local Local local;

	// reset prior global state
	// note: all previously allocated blocks must have been returned at this point; this is guaranteed by the absence of concurrent execution with meshopt_/clod functions
	assert(global.blocks.load() == global.all_blocks);

	::operator delete(global.data);
	global.data = NULL;
	global.block_size = 0;
	global.all_blocks = 0;
	global.blocks = 0;

	thread_count = std::min(thread_count, size_t(64));
	size_per_thread &= ~size_t(15);

	if (thread_count == 0 || size_per_thread == 0)
	{
		meshopt_setAllocator(::operator new, ::operator delete);
		return;
	}

	// allocate a block for each thread and mark each block as available
	global.data = ::operator new(thread_count * size_per_thread);
	global.block_size = size_per_thread;
	global.blocks = global.all_blocks = (thread_count == 64) ? ~0ull : (1ull << thread_count) - 1;

	// override allocation callbacks
	void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t) = [](size_t size) -> void*
	{
		// try to grab an available local block
		if (local.block == NULL && global.blocks.load() != 0 && size < global.block_size)
		{
			uint64_t blocks, mask;

			do
			{
				blocks = global.blocks.load();

				// prefer last index for coherency, but settle for lowest bit otherwise
				mask = (blocks & local.block_mask) ? local.block_mask : blocks & -blocks;

				// no available block, unlikely to get one soon
				if (blocks == 0)
					break;
			} while (!global.blocks.compare_exchange_weak(blocks, blocks & ~mask));

			if (mask)
			{
				// extract block index from mask (must only have one bit set)
				int index = -1;
				for (int i = 0; i < 64; ++i)
					if (mask & (1ull << i))
					{
						index = i;
						break;
					}

				assert(index >= 0);
				assert(mask && (mask & (mask - 1)) == 0);

				local.block = static_cast<char*>(global.data) + index * global.block_size;
				local.block_mask = mask;
			}
		}

		// allocate from local block if any
		if (local.block && size < global.block_size && local.offset < global.block_size - size)
		{
			void* ptr = static_cast<char*>(local.block) + local.offset;
			local.offset += size;
			local.offset = (local.offset + 15) & ~size_t(15); // align future allocations to 16b
			return ptr;
		}

		// fall back to system allocator
		return ::operator new(size);
	};

	void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*) = [](void* ptr)
	{
		// has our allocation come from thread cache?
		if (local.block && ptr >= local.block && ptr < static_cast<char*>(local.block) + global.block_size)
		{
			// meshopt allocations are guaranteed to be stack ordered
			assert(ptr <= static_cast<char*>(local.block) + local.offset);
			local.offset = static_cast<char*>(ptr) - static_cast<char*>(local.block);

			// return local block to the pool
			if (local.offset == 0)
			{
				assert(local.block_mask);
				global.blocks |= local.block_mask;
				local.block = NULL;
				// keep block_mask as an affinity hint for the next allocation
			}
		}
		else
			::operator delete(ptr);
	};

	meshopt_setAllocator(allocate, deallocate);
}
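
// Usage sketch (not part of the original gist): simplifyInParallel and worker_count are
// hypothetical names; only clodUseThreadCache and meshopt_setAllocator come from the
// code above. The idea is to size one scratch block per worker before any parallel
// meshopt_/clod work starts, and to disable the cache (restoring the default
// ::operator new / ::operator delete allocator) once all workers have joined.
void simplifyInParallel(size_t worker_count)
{
	// one 16 MB block per thread; blocks are handed out to threads on first allocation
	clodUseThreadCache(worker_count, 16 * 1024 * 1024);

	// ... run meshopt_/clod processing on up to worker_count threads here ...
	// each thread serves allocations from its cached block and falls back to
	// ::operator new when the block is exhausted or the request is too large

	// all worker threads must have finished their meshopt_/clod calls by now
	clodUseThreadCache(0, 0); // passing 0 restores the default allocator
}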