Skip to content

Instantly share code, notes, and snippets.

@HurricanKai
Created June 5, 2022 10:48
Show Gist options
  • Save HurricanKai/0125b844666b26b531bb204f099d513d to your computer and use it in GitHub Desktop.
Save HurricanKai/0125b844666b26b531bb204f099d513d to your computer and use it in GitHub Desktop.
A demo achieving 90% throughput using Vulkan & Silk.NET. Written in C#
using System.Collections.Concurrent;
using Silk.NET.Core;
using Silk.NET.Core.Native;
using Silk.NET.Vulkan;
using Buffer = Silk.NET.Vulkan.Buffer;
using Semaphore = Silk.NET.Vulkan.Semaphore;
unsafe
{
    // Demo entry point: creates a Vulkan 1.3 instance and device exposing only transfer-only queue
    // families, allocates one large device-local destination buffer, and then endlessly streams a
    // random host block into it through MultiQueueSequentialTransferManager.

    // Turns a failed Vulkan call into an exception instead of letting a null handle cause a crash later.
    static void Check(Silk.NET.Vulkan.Result result, string operation)
    {
        if (result != Silk.NET.Vulkan.Result.Success)
        {
            throw new InvalidOperationException($"{operation} failed with {result}.");
        }
    }

    var instanceExtensions = new string[] { };
    var deviceExtensions = new string[] { };
    Instance instance;
    Vk vk = Vk.GetApi();
    uint enumVersion = 0;
    vk.EnumerateInstanceVersion(ref enumVersion);
    Console.WriteLine("Device Supports " + ((Version)(Version32)enumVersion));
    using (var appName = SilkMarshal.StringToMemory("Copying Goes Brrrr"))
    using (var ppEnabledInstanceExtensions = SilkMarshal.StringArrayToMemory(instanceExtensions))
    {
        var appInfo = new ApplicationInfo(pApplicationName: appName.AsPtr<byte>(), applicationVersion: 1, apiVersion: Vk.Version13);
        Check(vk.CreateInstance(
            new InstanceCreateInfo(pApplicationInfo: &appInfo, enabledExtensionCount: (uint)instanceExtensions.Length,
                ppEnabledExtensionNames: (byte**)ppEnabledInstanceExtensions.AsPtr<byte>()), null, out instance),
            "vkCreateInstance");
    }

    // The demo simply uses the first enumerated physical device.
    var physicalDevice = vk.GetPhysicalDevices(instance).First();
    var availableFeatures = PhysicalDeviceFeatures2.Chain(out var availablePhysicalDeviceFeatures2)
        .AddNext(out PhysicalDeviceVulkan13Features availableVulkan13Features)
        .AddNext(out PhysicalDeviceVulkan12Features availableVulkan12Features)
        .AddNext(out PhysicalDeviceVulkan11Features availableVulkan11Features);
    vk.GetPhysicalDeviceFeatures2(physicalDevice, &availableFeatures);
    var deviceProperties = PhysicalDeviceProperties2.Chain(out var physicalDeviceProperties2)
        .AddNext(out PhysicalDeviceVulkan13Properties physicalDeviceVulkan13Properties)
        .AddNext(out PhysicalDeviceVulkan12Properties physicalDeviceVulkan12Properties)
        .AddNext(out PhysicalDeviceVulkan11Properties physicalDeviceVulkan11Properties);
    vk.GetPhysicalDeviceProperties2(physicalDevice, &deviceProperties);
    Console.WriteLine(SilkMarshal.PtrToString((nint)physicalDeviceProperties2.Properties.DeviceName));
    var enabledFeatures = PhysicalDeviceFeatures2.Chain(out var enabledPhysicalDeviceFeatures2)
        .AddNext(out PhysicalDeviceVulkan13Features enabledVulkan13Features)
        .AddNext(out PhysicalDeviceVulkan12Features enabledVulkan12Features)
        .AddNext(out PhysicalDeviceVulkan11Features enabledVulkan11Features);

    // Get Transfer-supporting queues, excluding Compute & Graphics.
    uint totalQueueCount = 0;
    vk.GetPhysicalDeviceQueueFamilyProperties2(physicalDevice, ref totalQueueCount, null);
    var allQueues = new QueueFamilyProperties2[totalQueueCount];
    for (int i = 0; i < totalQueueCount; i++)
    {
        // The parameterised constructor is used so every element is initialised before the query.
        allQueues[i] = new QueueFamilyProperties2(pNext: null);
    }

    fixed (QueueFamilyProperties2* pAllQueues = allQueues)
    {
        vk.GetPhysicalDeviceQueueFamilyProperties2(physicalDevice, ref totalQueueCount, pAllQueues);
    }

    var viableQueues = allQueues.Select((x, i) => (x, i)).Where(x
            => x.x.QueueFamilyProperties.QueueFlags.HasFlag(QueueFlags.QueueTransferBit) &&
               !x.x.QueueFamilyProperties.QueueFlags.HasFlag(QueueFlags.QueueGraphicsBit) &&
               !x.x.QueueFamilyProperties.QueueFlags.HasFlag(QueueFlags.QueueComputeBit))
        .ToArray();
    if (viableQueues.Length == 0)
    {
        // Without this check the Max() below fails with an unhelpful "sequence contains no elements".
        throw new InvalidOperationException("The physical device exposes no transfer-only queue family.");
    }

    // One shared priority array, sized for the largest family, is reused by every queue create info.
    var maxQueueCount = viableQueues.Max(x => x.x.QueueFamilyProperties.QueueCount);
    var queuePriorities = stackalloc float[(int)maxQueueCount];
    for (int i = 0; i < maxQueueCount; i++)
    {
        queuePriorities[i] = 1.0f;
    }

    var createInfos = viableQueues.Select(x => new DeviceQueueCreateInfo(queueFamilyIndex: (uint)x.i,
        queueCount: x.x.QueueFamilyProperties.QueueCount, pQueuePriorities: queuePriorities)).ToArray();
    enabledVulkan13Features.Synchronization2 = true;
    enabledVulkan12Features.TimelineSemaphore = true;
    Device device;
    using (var ppEnabledExtensionNames = SilkMarshal.StringArrayToMemory(deviceExtensions))
    {
        fixed (DeviceQueueCreateInfo* pQueueCreateInfos = createInfos)
        {
            Check(vk.CreateDevice(physicalDevice,
                new DeviceCreateInfo(queueCreateInfoCount: (uint)createInfos.Length,
                    pQueueCreateInfos: pQueueCreateInfos,
                    enabledExtensionCount: (uint)deviceExtensions.Length,
                    ppEnabledExtensionNames: (byte**)ppEnabledExtensionNames.AsPtr<byte>(),
                    pEnabledFeatures: null, pNext: &enabledFeatures), null, out device),
                "vkCreateDevice");
        }
    }

    // Fetch every queue of every viable family, paired with its family index.
    var transferQueues = viableQueues.SelectMany(x => Enumerable.Range(0, (int)x.x.QueueFamilyProperties.QueueCount)
        .Select(e =>
        {
            vk.GetDeviceQueue2(device, new DeviceQueueInfo2(queueFamilyIndex: (uint)x.i, queueIndex: (uint)e),
                out var queue);
            return (queue, (uint)x.i);
        })).ToArray();

    const ulong localBlockSize = 1 * 1000 * 1000 * 1000;
    var uploadCount = 6;
    ulong uploadSize = (ulong)uploadCount * localBlockSize;
    var transferManager = new MultiQueueSequentialTransferManager(vk, physicalDevice, instance, device, transferQueues, ulong.MaxValue);

    // HACK: We just assume all queues share one index here.
    // Fix for your own code.
    var q = transferQueues[0].Item2;
    Console.WriteLine($"Transfering into a {uploadSize} byte buffer in chunks of {localBlockSize} bytes.");
    Check(vk.CreateBuffer(device,
        new BufferCreateInfo(size: uploadSize, usage: BufferUsageFlags.BufferUsageTransferDstBit,
            sharingMode: SharingMode.Exclusive, queueFamilyIndexCount: 1, pQueueFamilyIndices: &q), null, out var gpuBuffer),
        "vkCreateBuffer");
    vk.GetBufferMemoryRequirements(device, gpuBuffer, out var gpuMemoryReqs);
    vk.GetPhysicalDeviceMemoryProperties(physicalDevice, out var physicalDeviceMemoryProperties);
    int? memoryTypeIndex = null;
    Console.WriteLine("Looking to allocate " + gpuMemoryReqs.Size);
    Console.WriteLine("Heap Sizes:");
    for (var index = 0; index < physicalDeviceMemoryProperties.MemoryHeapCount; index++)
    {
        var heap = physicalDeviceMemoryProperties.MemoryHeaps[index];
        Console.WriteLine("\t" + heap.Size + " Flags: " + heap.Flags);
    }

    // Pick the first device-local, non-lazily-allocated memory type that the buffer accepts and
    // whose heap is device-local and large enough for the whole destination buffer.
    var span2 = physicalDeviceMemoryProperties.MemoryTypes.AsSpan();
    for (var i = 0; i < physicalDeviceMemoryProperties.MemoryTypeCount; i++)
    {
        var type = span2[i];
        var typeHeap = physicalDeviceMemoryProperties.MemoryHeaps[(int)type.HeapIndex];
        var supportedByBuffer = (gpuMemoryReqs.MemoryTypeBits & 1u << i) != 0;
        Console.WriteLine("Type: " + i + " Heap (Size:" +
                          typeHeap.Size + ", Flags:" +
                          typeHeap.Flags + ") Flags:" +
                          type.PropertyFlags);
        // A set bit in MemoryTypeBits means the type IS usable for this buffer.
        Console.WriteLine("Supported: " + supportedByBuffer);
        if (!typeHeap.Flags.HasFlag(MemoryHeapFlags.MemoryHeapDeviceLocalBit) ||
            (typeHeap.Size < gpuMemoryReqs.Size))
        {
            continue;
        }

        if (!supportedByBuffer)
        {
            continue;
        }

        if ((type.PropertyFlags & MemoryPropertyFlags.MemoryPropertyDeviceLocalBit) != 0 &&
            (type.PropertyFlags & MemoryPropertyFlags.MemoryPropertyLazilyAllocatedBit) == 0)
        {
            memoryTypeIndex = i;
            break;
        }
    }

    if (!memoryTypeIndex.HasValue)
    {
        throw new InvalidOperationException(
            "No device-local memory type is large enough and compatible with the destination buffer.");
    }

    var selectedType = physicalDeviceMemoryProperties.MemoryTypes[memoryTypeIndex.Value];
    Console.WriteLine("Chose Type: " + memoryTypeIndex.Value + " Heap (Size:" +
                      physicalDeviceMemoryProperties.MemoryHeaps[(int)selectedType.HeapIndex].Size + ", Flags:" +
                      physicalDeviceMemoryProperties.MemoryHeaps[(int)selectedType.HeapIndex].Flags + ") Flags:" +
                      selectedType.PropertyFlags);
    Check(vk.AllocateMemory(device,
        new MemoryAllocateInfo(allocationSize: gpuMemoryReqs.Size, memoryTypeIndex: (uint)memoryTypeIndex.Value), null,
        out var gpuMem), "vkAllocateMemory");
    Check(vk.BindBufferMemory(device, gpuBuffer, gpuMem, 0), "vkBindBufferMemory");
    Console.WriteLine("... Running");
    try
    {
        var localMem = new byte[localBlockSize];
        Random.Shared.NextBytes(localMem);
        // Endless benchmark loop: enqueue uploadCount blocks per round and only block on the most
        // recent handle once the backlog of queued writes grows past 1000.
        while (true)
        {
            TransferHandle last = default;
            for (int i = 0; i < uploadCount; i++)
            {
                last = transferManager.EnqueueBytes(localMem, gpuBuffer, (ulong)i * localBlockSize);
            }

            if (transferManager.OutstandingWrites > 1000)
            {
                transferManager.WaitDone(last);
            }
        }
    }
    finally
    {
        Console.WriteLine();
        Console.WriteLine();
        Console.WriteLine("----------------------------- TEARDOWN -----------------------------");
        Console.WriteLine();
        Console.WriteLine();
        // Dispose the manager first: its workers copy into gpuBuffer, so the buffer and its memory
        // must outlive them.
        transferManager.Dispose();
        vk.DestroyBuffer(device, gpuBuffer, null);
        vk.FreeMemory(device, gpuMem, null);
        vk.DestroyDevice(device, null);
        vk.DestroyInstance(instance, null);
        vk.Dispose();
    }
}
/// <summary>
/// Opaque ticket identifying one enqueued transfer: which manager slot accepted it and the
/// write id that manager assigned to it.
/// </summary>
public readonly struct TransferHandle
{
    /// <summary>Value used to select the manager that owns the transfer.</summary>
    public readonly ulong Manager;

    /// <summary>Write id assigned to the transfer by its manager.</summary>
    public readonly ulong Id;

    /// <summary>Creates a handle from a manager selector and a write id.</summary>
    public TransferHandle(ulong manager, ulong id) => (Manager, Id) = (manager, id);
}
/// <summary>
/// Distributes host-to-GPU buffer uploads over several Vulkan queues. Each queue is driven by its
/// own <c>SequentialTransferManager</c> (worker thread + staging blocks); incoming writes are
/// handed to the sub-managers in round-robin order.
/// </summary>
public unsafe class MultiQueueSequentialTransferManager : IDisposable
{
    private readonly Vk _vk;
    private readonly PhysicalDevice _physicalDevice;
    private readonly Instance _instance;
    private readonly Device _device;
    private readonly ReadOnlyMemory<(Queue, uint)> _queues;
    private readonly ReadOnlyMemory<SequentialTransferManager> _subManagers;
    private ulong _nextManager;
    private bool _disposed;

    /// <summary>
    /// Queues <paramref name="bytes"/> for upload into <paramref name="gpuBuffer"/> at
    /// <paramref name="bufferOffset"/> and returns a handle usable with
    /// <see cref="CheckDone"/> / <see cref="WaitDone"/>.
    /// </summary>
    public TransferHandle EnqueueBytes(ReadOnlyMemory<byte> bytes, Buffer gpuBuffer, ulong bufferOffset)
    {
        // This is a super basic round-robin implementation, which should be fine most of the time.
        // Could implement some sort of balancing by making the sub managers report how much outstanding work they already have.
        var managerId = Interlocked.Increment(ref _nextManager);
        var managers = _subManagers.Span;
        var manager = managers[(int)(managerId % (ulong)managers.Length)];
        var id = manager.EnqueueBytes(bytes, gpuBuffer, bufferOffset);
        return new TransferHandle(managerId, id);
    }

    /// <summary>Returns <c>true</c> once the transfer identified by <paramref name="handle"/> has completed.</summary>
    public bool CheckDone(TransferHandle handle)
    {
        var managers = _subManagers.Span;
        var manager = managers[(int)(handle.Manager % (ulong)managers.Length)];
        return manager.CheckDone(handle.Id);
    }

    /// <summary>Total number of writes that are queued on the sub-managers but not yet picked up by a worker.</summary>
    public int OutstandingWrites
    {
        get
        {
            int sum = 0;
            foreach (var v in _subManagers.Span)
            {
                sum += v.OutstandingWrites;
            }

            return sum;
        }
    }

    /// <summary>Spins until the transfer identified by <paramref name="handle"/> has completed.</summary>
    public void WaitDone(TransferHandle handle)
    {
        var managers = _subManagers.Span;
        var manager = managers[(int)(handle.Manager % (ulong)managers.Length)];
        manager.WaitDone(handle.Id);
    }

    /// <summary>
    /// Creates one sub-manager per entry of <paramref name="queues"/> and splits half of the first
    /// heap without flags (i.e. not device-local) evenly between them as staging memory.
    /// </summary>
    /// <param name="vk">Vulkan API instance used for all calls.</param>
    /// <param name="physicalDevice">Physical device the logical device was created from.</param>
    /// <param name="instance">Vulkan instance (stored only).</param>
    /// <param name="device">Logical device owning the queues.</param>
    /// <param name="queues">Queues to use, each paired with its queue family index.</param>
    /// <param name="blockSizeHint">Upper bound for the size of a single staging block, in bytes.</param>
    /// <exception cref="ArgumentException"><paramref name="queues"/> is empty.</exception>
    /// <exception cref="InvalidOperationException">No suitable heap or memory type exists.</exception>
    public MultiQueueSequentialTransferManager(Vk vk, PhysicalDevice physicalDevice, Instance instance, Device device, ReadOnlyMemory<(Queue, uint)> queues, ulong blockSizeHint)
    {
        if (queues.Length == 0)
        {
            // Guards the per-queue division below and the modulo in EnqueueBytes.
            throw new ArgumentException("At least one transfer queue is required.", nameof(queues));
        }

        _vk = vk;
        _physicalDevice = physicalDevice;
        _instance = instance;
        _device = device;
        _queues = queues;
        vk.GetPhysicalDeviceMemoryProperties(_physicalDevice, out var physicalDeviceMemoryProperties);

        // Only the first MemoryHeapCount / MemoryTypeCount entries of the fixed-size arrays are
        // valid; scanning further could select an unused slot (flags 0, size 0).
        (int, MemoryHeap)? localHeap = null;
        var heaps = physicalDeviceMemoryProperties.MemoryHeaps.AsSpan();
        var heapCount = (int)physicalDeviceMemoryProperties.MemoryHeapCount;
        for (var index = 0; index < heapCount; index++)
        {
            var heap = heaps[index];
            if (heap.Flags == 0)
            {
                localHeap = (index, heap);
                break;
            }
        }

        if (!localHeap.HasValue)
        {
            throw new InvalidOperationException("No memory heap without heap flags (host-side heap) was found.");
        }

        // Verify up front that the heap offers a host-visible, non-lazily-allocated memory type.
        int? memoryTypeIndex = null;
        var types = physicalDeviceMemoryProperties.MemoryTypes.AsSpan();
        var typeCount = (int)physicalDeviceMemoryProperties.MemoryTypeCount;
        for (var index = 0; index < typeCount; index++)
        {
            var type = types[index];
            if (type.HeapIndex != localHeap.Value.Item1) continue;
            if ((type.PropertyFlags & MemoryPropertyFlags.MemoryPropertyHostVisibleBit) != 0 &&
                (type.PropertyFlags & MemoryPropertyFlags.MemoryPropertyLazilyAllocatedBit) == 0)
            {
                memoryTypeIndex = index;
                break;
            }
        }

        if (!memoryTypeIndex.HasValue)
        {
            throw new InvalidOperationException("The selected heap has no host-visible memory type.");
        }

        var maxSize = localHeap.Value.Item2.Size;
        maxSize /= 2; // Just to be conservative
        Console.WriteLine("Transfer Manager has " + maxSize + " bytes to work with");
        // Here you might want to leave some space for the rest of the app
        var bytesPerQueue = maxSize / (ulong)_queues.Length;
        Console.WriteLine("Using " + bytesPerQueue + " bytes per queue");
        var subManagers = new SequentialTransferManager[_queues.Length];
        for (int i = 0; i < subManagers.Length; i++)
        {
            subManagers[i] = new SequentialTransferManager(vk, physicalDevice, instance, device, _queues.Span[i].Item1,
                _queues.Span[i].Item2, (uint)localHeap.Value.Item1, bytesPerQueue, blockSizeHint);
        }

        _subManagers = subManagers;
    }

    /// <summary>
    /// Disposes every sub-manager. Safe to call more than once. The manager must not be used
    /// afterwards.
    /// </summary>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;
        foreach (var m in _subManagers.Span)
        {
            m.Dispose();
        }
    }

    /// <summary>
    /// Drives a single queue: a background thread dequeues writes, copies them into one of a few
    /// staging blocks and submits a buffer copy. Submissions are chained on a timeline semaphore
    /// so that write N waits for value N-1 and signals value N.
    /// </summary>
    private sealed class SequentialTransferManager : IDisposable
    {
        private const int BLOCK_COUNT = 3;

        // Number of command buffers allocated for (and cycled through by) each staging block.
        private const int COMMAND_BUFFERS_PER_BLOCK = 2;

        /// <summary>One staging buffer with its memory and command buffers.</summary>
        private struct Block
        {
            public readonly DeviceMemory BackingMemory;
            public readonly Buffer Buffer;
            public readonly ReadOnlyMemory<CommandBuffer> CommandBuffers;
            public int NextCommandBuffer = 0;

            // Timeline value that must be reached before this block may be overwritten again.
            public ulong WaitFor = 0;

            public Block(Buffer buffer, DeviceMemory backingMemory, ReadOnlyMemory<CommandBuffer> commandBuffers)
            {
                Buffer = buffer;
                BackingMemory = backingMemory;
                CommandBuffers = commandBuffers;
            }
        }

        /// <summary>A pending upload of at most one block's worth of bytes.</summary>
        private readonly struct BufferWrite
        {
            public readonly ulong Id;
            public readonly ReadOnlyMemory<byte> Data;
            public readonly Buffer TargetBuffer;
            public readonly ulong TargetOffset;

            public BufferWrite(ulong id, ReadOnlyMemory<byte> data, Buffer targetBuffer, ulong targetOffset)
            {
                Id = id;
                Data = data;
                TargetBuffer = targetBuffer;
                TargetOffset = targetOffset;
            }
        }

        private readonly Vk _vk;
        private readonly PhysicalDevice _physicalDevice;
        private readonly Instance _instance;
        private readonly Device _device;
        private readonly Queue _queue;
        private readonly uint _queueFamilyIndex;
        private readonly uint _heapIndex;
        private readonly ulong _memorySize;
        private readonly int _blockSize;
        private readonly Memory<Block> _blocks;
        private readonly ConcurrentQueue<BufferWrite> _writeQueue = new ConcurrentQueue<BufferWrite>();
        private readonly Thread _thread;
        private readonly Semaphore _mainSemaphore;
        private readonly CommandPool _commandPool;

        // Last write id handed out. Starts at 1, which is also the semaphore's initial value, so
        // the first real write (id 2) waits on a value that is already reached.
        private ulong _writeId = 1;

        // Set by Dispose to make the worker thread leave its loops.
        private volatile bool _stopRequested;

        /// <summary>Number of writes queued but not yet dequeued by the worker thread.</summary>
        public int OutstandingWrites => _writeQueue.Count;

        /// <summary>Returns <c>true</c> when the timeline semaphore has reached <paramref name="id"/>.</summary>
        public bool CheckDone(ulong id)
        {
            _vk.GetSemaphoreCounterValue(_device, _mainSemaphore, out var value);
            // Same completion condition as WaitDone: the counter has reached the write's id.
            return value >= id;
        }

        /// <summary>Spins until the timeline semaphore has reached <paramref name="id"/>.</summary>
        public void WaitDone(ulong id)
        {
            var spin = new SpinWait();
            while (true)
            {
                _vk.GetSemaphoreCounterValue(_device, _mainSemaphore, out var value);
                if (value >= id)
                {
                    return;
                }

                spin.SpinOnce();
            }
        }

        /// <summary>
        /// Splits <paramref name="bytes"/> into block-sized writes, queues them, and returns the
        /// id of the last one. An empty payload queues nothing and returns the most recently
        /// issued id.
        /// </summary>
        public ulong EnqueueBytes(ReadOnlyMemory<byte> bytes, Buffer gpuBuffer, ulong bufferOffset)
        {
            if (bytes.IsEmpty)
            {
                // Nothing to copy; a zero-byte write would make the worker record a zero-size copy.
                return Volatile.Read(ref _writeId);
            }

            var blockSize = _blockSize;
            while (bytes.Length > blockSize)
            {
                var subBytes = bytes[..blockSize];
                _writeQueue.Enqueue(new BufferWrite(Interlocked.Increment(ref _writeId), subBytes, gpuBuffer, bufferOffset));
                bytes = bytes[blockSize..];
                bufferOffset += (ulong)blockSize;
            }

            var lastId = Interlocked.Increment(ref _writeId);
            _writeQueue.Enqueue(new BufferWrite(lastId, bytes, gpuBuffer, bufferOffset));
            return lastId;
        }

        /// <summary>
        /// Creates the command pool, staging blocks and timeline semaphore, then starts the worker thread.
        /// </summary>
        /// <exception cref="ArgumentException">The resulting block size would be zero.</exception>
        /// <exception cref="InvalidOperationException">A Vulkan object could not be created.</exception>
        public SequentialTransferManager(
            Vk vk,
            PhysicalDevice physicalDevice,
            Instance instance,
            Device device,
            Queue queue,
            uint queueFamilyIndex,
            uint heapIndex,
            ulong memorySize,
            ulong blockSizeHint)
        {
            _vk = vk;
            _physicalDevice = physicalDevice;
            _instance = instance;
            _device = device;
            _queue = queue;
            _queueFamilyIndex = queueFamilyIndex;
            _heapIndex = heapIndex;
            _memorySize = memorySize;
            // Block size: the hint, capped by an equal share of the budget and by int.MaxValue.
            _blockSize = (int)Math.Min(blockSizeHint, Math.Min(int.MaxValue, Math.Floor((double)_memorySize / BLOCK_COUNT)));
            if (_blockSize <= 0)
            {
                // A zero block size would make EnqueueBytes loop forever on zero-length slices.
                throw new ArgumentException(
                    "memorySize and blockSizeHint must allow staging blocks of at least one byte.", nameof(memorySize));
            }

            ThrowOnFailure(
                _vk.CreateCommandPool(_device, new CommandPoolCreateInfo(queueFamilyIndex: _queueFamilyIndex, flags: CommandPoolCreateFlags.CommandPoolCreateResetCommandBufferBit), null,
                    out _commandPool), "vkCreateCommandPool");
            var blocks = new Block[BLOCK_COUNT];
            for (int i = 0; i < BLOCK_COUNT; i++)
            {
                blocks[i] = CreateBlock(i);
            }

            _blocks = blocks;
            _thread = new Thread(Loop) { IsBackground = true };
            var createInfo = SemaphoreCreateInfo.Chain(out _).AddNext(out SemaphoreTypeCreateInfo typeCreateInfo);
            typeCreateInfo.InitialValue = _writeId;
            typeCreateInfo.SemaphoreType = SemaphoreType.Timeline;
            ThrowOnFailure(
                _vk.CreateSemaphore(_device,
                    createInfo, null,
                    out _mainSemaphore), "vkCreateSemaphore");
            Console.WriteLine($"Starting Manager on Queue {queueFamilyIndex} with {BLOCK_COUNT} blocks. Each block holds {_blockSize} bytes.");
            _thread.Start();
        }

        /// <summary>Throws when a Vulkan call returned anything other than success.</summary>
        private static void ThrowOnFailure(Silk.NET.Vulkan.Result result, string operation)
        {
            if (result != Silk.NET.Vulkan.Result.Success)
            {
                throw new InvalidOperationException($"{operation} failed with {result}.");
            }
        }

        /// <summary>
        /// Worker thread body: picks a free staging block, dequeues the next write, copies it into
        /// the block, and submits a copy into the target buffer. Returns once a stop was requested.
        /// </summary>
        private void Loop()
        {
            var nextBlock = 0;
            var blocks = _blocks.Span;
            var sem = _mainSemaphore;
            _vk.GetPhysicalDeviceProperties(_physicalDevice, out var physicalDeviceProperties);
            var nonCoherentAtomSize = physicalDeviceProperties.Limits.NonCoherentAtomSize;
            while (!_stopRequested)
            {
                var spinWait = new SpinWait();

                // Cycle through the blocks until one is found whose last copy has completed.
                while (true)
                {
                    if (_stopRequested)
                    {
                        return;
                    }

                    var potentialBlock = blocks[nextBlock];
                    _vk.GetSemaphoreCounterValue(_device, sem, out var value);
                    if (value >= potentialBlock.WaitFor)
                    {
                        break;
                    }

                    spinWait.SpinOnce();
                    nextBlock = (nextBlock + 1) % blocks.Length;
                }

                BufferWrite toWrite;
                while (!_writeQueue.TryDequeue(out toWrite))
                {
                    if (_stopRequested)
                    {
                        return;
                    }

                    spinWait.SpinOnce();
                }

                var localData = toWrite.Data.Span;
                // write to block
                ref var block = ref blocks[nextBlock];

                // Round the mapped/flushed range up to a multiple of nonCoherentAtomSize; when that
                // reaches the block size, cover the whole allocation instead.
                var alignedSize = ((ulong)localData.Length + nonCoherentAtomSize - 1) / nonCoherentAtomSize * nonCoherentAtomSize;
                if (alignedSize >= (ulong)_blockSize)
                {
                    alignedSize = Vk.WholeSize;
                }

                void* mappedData = null;
                ThrowOnFailure(_vk.MapMemory(_device, block.BackingMemory, 0, alignedSize, 0, ref mappedData), "vkMapMemory");
                localData.CopyTo(new Span<byte>(mappedData, localData.Length));
                // Host writes must be flushed (not invalidated) to become visible to the device
                // when the memory type is not host-coherent.
                _vk.FlushMappedMemoryRanges(_device, 1,
                    new MappedMemoryRange(memory: block.BackingMemory, offset: 0,
                        size: alignedSize));
                _vk.UnmapMemory(_device, block.BackingMemory);

                // make GPU do stuff
                var commandBuffer = block.CommandBuffers.Span[block.NextCommandBuffer];
                block.NextCommandBuffer = (block.NextCommandBuffer + 1) % block.CommandBuffers.Length;
                _vk.BeginCommandBuffer(commandBuffer, new CommandBufferBeginInfo(flags: CommandBufferUsageFlags.CommandBufferUsageOneTimeSubmitBit));
                _vk.CmdCopyBuffer(commandBuffer, block.Buffer, toWrite.TargetBuffer, 1,
                    new BufferCopy(0, toWrite.TargetOffset, (ulong)localData.Length));
                _vk.EndCommandBuffer(commandBuffer);

                // Chain on the timeline: wait for the previous write's id, signal this write's id.
                var startId = toWrite.Id - 1;
                var endId = toWrite.Id;
                block.WaitFor = endId;
                var timelineInfo = new TimelineSemaphoreSubmitInfo(waitSemaphoreValueCount: 1,
                    pWaitSemaphoreValues: &startId, signalSemaphoreValueCount: 1, pSignalSemaphoreValues: &endId);
                var dstMask = PipelineStageFlags.PipelineStageTransferBit;
                ThrowOnFailure(
                    _vk.QueueSubmit(_queue, 1, new SubmitInfo(waitSemaphoreCount: 1, pWaitSemaphores: &sem,
                        signalSemaphoreCount: 1, pSignalSemaphores: &sem, commandBufferCount: 1, pCommandBuffers: &commandBuffer, pNext: &timelineInfo,
                        pWaitDstStageMask: &dstMask), default), "vkQueueSubmit");
            }
        }

        /// <summary>
        /// Creates one staging block: a transfer-source buffer of <c>_blockSize</c> bytes bound to
        /// host-visible memory from the configured heap, plus its command buffers.
        /// </summary>
        private Block CreateBlock(int index)
        {
            Console.WriteLine("Allocating Block " + index + " on queue " + _queueFamilyIndex);
            var queueFamilyIndex = _queueFamilyIndex;
            ThrowOnFailure(
                _vk.CreateBuffer(_device,
                    new BufferCreateInfo(size: (ulong)_blockSize, usage: BufferUsageFlags.BufferUsageTransferSrcBit,
                        sharingMode: SharingMode.Exclusive, queueFamilyIndexCount: 1,
                        pQueueFamilyIndices: &queueFamilyIndex), null, out var buffer), "vkCreateBuffer");
            _vk.GetBufferMemoryRequirements(_device, buffer, out var requirements);
            _vk.GetPhysicalDeviceMemoryProperties(_physicalDevice, out var physicalDeviceMemoryProperties);
            int? memoryTypeIndex = null;
            var types = physicalDeviceMemoryProperties.MemoryTypes.AsSpan();
            // Only the first MemoryTypeCount entries of the fixed-size array are valid.
            var typeCount = (int)physicalDeviceMemoryProperties.MemoryTypeCount;
            for (var i = 0; i < typeCount; i++)
            {
                var type = types[i];
                if (type.HeapIndex != _heapIndex) continue;
                if ((requirements.MemoryTypeBits & 1u << i) == 0) continue;
                if ((type.PropertyFlags & MemoryPropertyFlags.MemoryPropertyHostVisibleBit) != 0 &&
                    (type.PropertyFlags & MemoryPropertyFlags.MemoryPropertyLazilyAllocatedBit) == 0)
                {
                    memoryTypeIndex = i;
                    break;
                }
            }

            if (!memoryTypeIndex.HasValue)
            {
                throw new InvalidOperationException(
                    "No host-visible memory type on the staging heap is compatible with the staging buffer.");
            }

            Console.WriteLine("Allocating " + requirements.Size + " bytes for buffer of size " + _blockSize);
            ThrowOnFailure(
                _vk.AllocateMemory(_device,
                    new MemoryAllocateInfo(allocationSize: requirements.Size, memoryTypeIndex: (uint)memoryTypeIndex.Value),
                    null, out var deviceMemory), "vkAllocateMemory");
            ThrowOnFailure(_vk.BindBufferMemory(_device, buffer, deviceMemory, 0), "vkBindBufferMemory");

            // The array length and the requested count must match: Vulkan writes exactly
            // commandBufferCount handles through the pinned pointer.
            var commandBuffers = new CommandBuffer[COMMAND_BUFFERS_PER_BLOCK];
            fixed (CommandBuffer* pCommandBuffers = commandBuffers)
            {
                ThrowOnFailure(
                    _vk.AllocateCommandBuffers(_device,
                        new CommandBufferAllocateInfo(commandPool: _commandPool, level: CommandBufferLevel.Primary,
                            commandBufferCount: COMMAND_BUFFERS_PER_BLOCK), pCommandBuffers), "vkAllocateCommandBuffers");
            }

            return new Block(buffer, deviceMemory, commandBuffers);
        }

        /// <summary>
        /// Stops the worker thread, waits for the queue to go idle and releases the staging
        /// buffers, their memory, the command pool and the semaphore. Writes still waiting in the
        /// queue are discarded, so their ids are never signalled. The instance must not be used
        /// afterwards. Safe to call more than once.
        /// </summary>
        public void Dispose()
        {
            if (_stopRequested)
            {
                return;
            }

            _stopRequested = true;
            _thread.Join();

            // Let already-submitted copies finish before destroying what they reference.
            _vk.QueueWaitIdle(_queue);
            foreach (var block in _blocks.Span)
            {
                _vk.DestroyBuffer(_device, block.Buffer, null);
                _vk.FreeMemory(_device, block.BackingMemory, null);
            }

            // Destroying the pool also frees the command buffers allocated from it.
            _vk.DestroyCommandPool(_device, _commandPool, null);
            _vk.DestroySemaphore(_device, _mainSemaphore, null);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment