Skip to content

Instantly share code, notes, and snippets.

@dwilliamson
Last active May 11, 2026 21:03
Show Gist options
  • Select an option

  • Save dwilliamson/f9cbb20e3dca9b3b0f45c3007bdfc088 to your computer and use it in GitHub Desktop.

Select an option

Save dwilliamson/f9cbb20e3dca9b3b0f45c3007bdfc088 to your computer and use it in GitHub Desktop.
Star Engine CUDA Implementation from 2015
//
// TODO: Expose Desc() as public member?
// After that's done, why does the Desc struct exist at all?
// Use const/const_cast trick to make the members read-only?
//
#pragma once
#include <Core/Core.h>
#include <Core/PixelFormat.h>
struct Format;
namespace mpp
{
class Event;
}
clcpp_reflect_part(cmp)
namespace cmp
{
// ------------------------------------------------------------------------------------------------------- //
// Handles //
// ------------------------------------------------------------------------------------------------------- //
// Opaque, reflected handle to a compute program. It declares no members of its own; backends
// derive from it to attach their implementation (cuda::Program does so for CUDA modules).
struct clcpp_attr(reflect) HndProgram : public clutl::Object
{
};
// Opaque, reflected handle to a kernel within a program. It declares no members of its own;
// backends derive from it to attach their implementation (cuda::Kernel does so for CUDA functions).
struct clcpp_attr(reflect) HndKernel : public clutl::Object
{
};
// ------------------------------------------------------------------------------------------------------- //
// Kernel Argument API //
// ------------------------------------------------------------------------------------------------------- //
class KernelArgs;
struct clcpp_attr(reflect_part) KernelArg
{
    KernelArg();

    // Read the argument's storage back as a TYPE, returning it by value
    template <typename TYPE>
    TYPE Get() const
    {
        TYPE* typed_data = static_cast<TYPE*>(Data());
        return *typed_data;
    }

    // Overwrite the argument's storage with a copy of value
    template <typename TYPE>
    void Set(TYPE value)
    {
        TYPE* typed_data = static_cast<TYPE*>(Data());
        *typed_data = value;
    }

    // Address of the argument's storage (implemented out of line)
    void* Data() const;

    // Reflected type of the stored data
    const clcpp::Type* data_type;

    // Size of the stored data in bytes, excluding sizeof(KernelArg)
    u32 data_size;

    // Position of this argument in the argument list
    u32 index;

    // Byte offset to the stored data
    u32 offset;

    // The argument list that owns this argument
    KernelArgs* args_parent;
};
//
// This class allows queueing of kernel arguments so that they can be applied to a kernel at the point of
// execution. This allows the Compute API to guarantee safe access to a kernel without it being reloaded
// during use. The type helpers are just an added convenience and not the point of this class.
//
class clcpp_attr(reflect) KernelArgs
{
public:
    KernelArgs();

    // Typed convenience wrapper: looks up the reflected type of TYPE (const/pointer stripped)
    // and forwards the address and size of the by-value copy to the untyped Push below
    template <typename TYPE>
    void Push(TYPE value)
    {
        const clcpp::Type* value_type = clcpp::GetType< CORE_STRIP_CONST_POINTER(TYPE) >();
        Push(&value, value_type, sizeof(value));
    }

private:
    friend struct KernelArg;
    friend class KernelArgIterator;

    // Untyped push used by the template above (implemented out of line)
    void Push(void* data, const clcpp::Type* type, u32 data_size);

    // Arguments live inside the container itself so that instantiation is cheap
    static const u32 MAX_DATA_SIZE = 1024;
    u8 m_Data[MAX_DATA_SIZE];

    // NOTE(review): presumably the write cursor into m_Data and the number of pushed
    // arguments - confirm against the out-of-line implementation
    u32 m_Position;
    u32 m_NbArguments;
};
//
// Forward iterator over the arguments queued in a KernelArgs. Holds a reference to the list,
// so the list must outlive the iterator.
//
class KernelArgIterator
{
public:
    KernelArgIterator(const KernelArgs& args);

    // Fetch the next argument into arg; callers loop while this returns true
    bool GetNext(KernelArg& arg);

private:
    const KernelArgs& m_KernelArgs;
    u32 m_Position;
};
// ------------------------------------------------------------------------------------------------------- //
// Interfaces //
// ------------------------------------------------------------------------------------------------------- //
//
// Specifies read/write access to device memory for kernels
//
// Passed to the iCompute *_New allocation functions. The CUDA texture implementation enables
// surface load/store on the array for Access_Write and Access_ReadWrite.
enum clcpp_attr(reflect) Access
{
Access_Read,
Access_Write,
Access_ReadWrite,
};
class HostMem;
class DeviceMem;
class Texture3D;
struct GfxResource;
struct Event;
// ------------------------------------------------------------------------------------------------------- //
// Concurrent Work Queue Interface //
// ------------------------------------------------------------------------------------------------------- //
// Sentinel for the block_size parameter of Queue::RunKernel: the CUDA backend replaces it with
// its own default block size rather than using it literally.
// NOTE(review): because the sentinel value is 1, a caller cannot request a real block size of 1;
// confirm whether a value such as -1 was intended.
static const int MAX_KERNEL_BLOCK_SIZE = 1;
// ARC-NOTE: No clcpp_impl_class as there are pure virtuals (and we don't WANT it to be creatable)
struct clcpp_attr(reflect) Queue : public clutl::Object2
{
    virtual ~Queue() { }

    // Launch a kernel over nb_items work items using the queued arguments.
    // Pass MAX_KERNEL_BLOCK_SIZE as block_size to let the backend pick the block size.
    virtual bool RunKernel(const HndKernel* h_kernel, const KernelArgs& args, u32 nb_items, int block_size) = 0;

    // Transfers between raw host memory and linear device memory
    virtual bool CopyHostToDevice(const void* src_data, u32 src_size, DeviceMem* device_mem) = 0;
    virtual bool CopyDeviceToHost(DeviceMem* device_mem, void* dst_data, u32 dst_size) = 0;

    // Transfers between raw host memory and a 3D texture; pitch is the row size in bytes
    virtual bool CopyHostToTexture3D(const void* src_data, u32 src_pitch, u32 src_height, u32 src_depth, Texture3D* h_texture) = 0;
    virtual bool CopyTexture3DToHost(Texture3D* h_texture, void* dst_data, u32 dst_pitch, u32 dst_height, u32 dst_depth) = 0;

    // Transfers from linear device memory into a resource owned by the graphics API
    virtual bool CopyDeviceToGfxTexture3D(DeviceMem* src_device, GfxResource* dst_gfxres, u32 pitch, u32 height, u32 depth) = 0;
    virtual bool CopyDeviceToGfxBuffer(DeviceMem* src_device, GfxResource* dst_gfxres, u32 size) = 0;

    // Wait for the work placed in this queue
    virtual bool Sync() = 0;

    // Record an event in this queue / make this queue wait on a recorded event
    virtual bool MarkEvent(Event* event) = 0;
    virtual bool WaitEvent(Event* event) = 0;

    // Backend-specific handle behind the queue (the CUDA backend returns its stream)
    virtual void* Resource() = 0;

    // Helpers to use descriptions embedded in memory objects to simplify the call
    bool CopyMappedHostToDevice(HostMem* host_mem, DeviceMem* device_mem);
    bool CopyDeviceToMappedHost(DeviceMem* device_mem, HostMem* host_mem);
    bool CopyMappedHostToTexture3D(HostMem* host_mem, Texture3D* texture);
    bool CopyTexture3DToMappedHost(Texture3D* texture, HostMem* host_mem);
};
// ------------------------------------------------------------------------------------------------------- //
// Host Memory Interface //
// ------------------------------------------------------------------------------------------------------- //
// Read-only description of a HostMem allocation, exposed through HostMem::Desc()
struct clcpp_attr(reflect) HostMemDesc
{
    HostMemDesc();

    // Size of the allocation in bytes
    u32 size;

    // Host address of the allocation (the CUDA backend stores its page-locked pointer here)
    void* mapped_mem;
};
//
// Pinned/page-locked host memory allocated to be the source or destination of transfers to or
// from the device.
//
class clcpp_attr(reflect) HostMem : public clutl::Object2
{
public:
    virtual ~HostMem() { }

    // Obtain a host pointer to the memory and release it again when done.
    // NOTE(review): the difference between MapNow and Map is not visible in this file - the CUDA
    // backend returns the same pointer from both and Unmap does nothing; confirm intent.
    virtual void* MapNow(Queue* queue) = 0;
    virtual void* Map(Queue* queue) = 0;
    virtual void Unmap(Queue* queue) = 0;

    // Description filled in by the implementation
    const HostMemDesc& Desc() const { return m_Desc; }

protected:
    HostMemDesc m_Desc;
};
// ------------------------------------------------------------------------------------------------------- //
// Device Memory Interface //
// ------------------------------------------------------------------------------------------------------- //
// Read-only description of a DeviceMem allocation, exposed through DeviceMem::Desc()
struct clcpp_attr(reflect) DeviceMemDesc
{
    DeviceMemDesc();

    // Size of the allocation in bytes
    u32 size;
};
//
// A chunk of memory allocated on the device that can't be directly accessed by the host without
// copying.
//
class clcpp_attr(reflect) DeviceMem : public clutl::Object2
{
public:
    virtual ~DeviceMem() { }

    // Description filled in by the implementation
    const DeviceMemDesc& Desc() const { return m_Desc; }

protected:
    DeviceMemDesc m_Desc;
};
// ------------------------------------------------------------------------------------------------------- //
// Texture Interface //
// ------------------------------------------------------------------------------------------------------- //
// TODO: There is a need for some kind of "sampler object" to describe filtering, etc.
// Read-only description of a Texture3D, exposed through Texture3D::Desc()
struct clcpp_attr(reflect) TextureDesc
{
    TextureDesc();

    // Dimensions in texels
    u32 width;
    u32 height;
    u32 depth;

    // Bytes per row and bytes in total; the CUDA backend computes these as
    // width * FormatDesc::NbBytes() and pitch * height * depth
    u32 pitch;
    u32 size;

    // Pixel format of the texture
    Format format;
};
//
// A 3D texture allocated on the device to be sampled in kernels.
//
class clcpp_attr(reflect) Texture3D : public clutl::Object2
{
public:
    virtual ~Texture3D() { }

    // Description filled in by the implementation
    const TextureDesc& Desc() const { return m_Desc; }

protected:
    TextureDesc m_Desc;
};
// Base type for a graphics-API resource made available to the compute API; the destination of
// the Queue::CopyDeviceToGfx* transfers
struct clcpp_attr(reflect) GfxResource : public clutl::Object2
{
    virtual ~GfxResource() { }
};
// ------------------------------------------------------------------------------------------------------- //
// Event Interface //
// ------------------------------------------------------------------------------------------------------- //
// Base type for an event that queues can record (Queue::MarkEvent) and wait on (Queue::WaitEvent)
struct clcpp_attr(reflect_part) Event : public clutl::Object2
{
    virtual ~Event() { }
};
// ------------------------------------------------------------------------------------------------------- //
// Main Compute Module Interface //
// ------------------------------------------------------------------------------------------------------- //
struct clcpp_attr(reflect_part) iCompute : public core::iSubsystem
{
    // Load a program from file, naming the kernels that should be made available from it
    virtual const HndProgram* Program_New(const file::Path& filename, const core::String32* kernel_names, u32 nb_kernels) = 0;

    // Look up one of a program's kernels by name
    virtual const HndKernel* Program_GetKernel(const HndProgram* program, const core::String32& name) = 0;

    // Allocate linear device memory / host memory of the given size in bytes
    virtual DeviceMem* DeviceMem_New(u32 size, Access access) = 0;
    virtual HostMem* HostMem_New(u32 size, Access access) = 0;

    // If read/write is specified then CUDA creates a surface reference
    // Texture reflection can tell whether a surface needs to be bound or not
    virtual Texture3D* Texture3D_New(u32 width, u32 height, u32 depth, const Format& fmt, Access access) = 0;

    // Wrap a graphics-API resource (the CUDA backend treats the pointer as an ID3D11Resource)
    virtual GfxResource* GfxResource_New(void* resource) = 0;

    virtual Event* Event_New() = 0;
    virtual Queue* Queue_New() = 0;
};
}
@@ -1,785 +0,0 @@
#include "CUDA.h"
#include "CUDACompute.h"
#include <Core/File.h>
// clReflect implementation registration for the CUDA backend types.
// NOTE(review): presumably clcpp_impl_class registers both construction and destruction while
// clcpp_impl_destruct registers destruction only - confirm against the clReflect macros.
clcpp_impl_destruct(cuda::Queue);
clcpp_impl_class(cuda::Kernel);
clcpp_impl_class(cuda::Program);
clcpp_impl_destruct(cuda::DeviceMemory);
clcpp_impl_destruct(cuda::HostMemory);
clcpp_impl_destruct(cuda::Texture3D);
clcpp_impl_destruct(cuda::GfxResource);
clcpp_impl_destruct(cuda::Event);
namespace
{
// Format x BitCount matrix, mapping Format to a CUarray_format
// Rows are indexed by channel type (FmtType_*); columns 0,1,2 correspond to 8,16,32 bits per
// channel. cuda::Init fills the table; entries left as g_NullArrayFormat are unsupported.
CUarray_format g_NullArrayFormat = (CUarray_format)0;
CUarray_format g_ArrayFormatMap[FmtType_Count][3];
// Texture reference flags (CU_TRSF_*) for each format view (FmtView_*), filled in by cuda::Init
u32 g_TextureFlags[FmtView_Count];
// Returns the number of CUDA cores per streaming multiprocessor for the given compute capability.
// Unknown capabilities are logged and fall back to the newest architecture in the table.
int MapSMToCores(int major, int minor)
{
    struct GpuArchCoresPerSM_t
    {
        int SM; // 0xMm (hexadecimal notation), M = SM Major version, m = SM minor version
        int Cores;
    };

    // Built once rather than on every call
    static const GpuArchCoresPerSM_t GpuArchCoresPerSM[] =
    {
        { 0x10, 8 },   // Tesla Generation (SM 1.0) G80 class
        { 0x11, 8 },   // Tesla Generation (SM 1.1) G8x class
        { 0x12, 8 },   // Tesla Generation (SM 1.2) G9x class
        { 0x13, 8 },   // Tesla Generation (SM 1.3) GT200 class
        { 0x20, 32 },  // Fermi Generation (SM 2.0) GF100 class
        { 0x21, 48 },  // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
        { 0x32, 192 }, // Kepler Generation (SM 3.2)
        { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
        { 0x37, 192 }, // Kepler Generation (SM 3.7)
        { 0x50, 128 }, // Maxwell Generation (SM 5.0)
        { 0x52, 128 }, // Maxwell Generation (SM 5.2)
        { 0x53, 128 }, // Maxwell Generation (SM 5.3)
        { 0x60, 64 },  // Pascal Generation (SM 6.0)
        { 0x61, 128 }, // Pascal Generation (SM 6.1)
        { 0x62, 128 }, // Pascal Generation (SM 6.2)
        { 0x70, 64 },  // Volta Generation (SM 7.0)
        { 0x72, 64 },  // Xavier Generation (SM 7.2)
        { 0x75, 64 },  // Turing Generation (SM 7.5)
        { 0x80, 64 },  // Ampere Generation (SM 8.0)
        { 0x86, 128 }, // Ampere Generation (SM 8.6)
        { 0x87, 128 }, // Ampere Generation (SM 8.7)
        { 0x89, 128 }, // Ada Generation (SM 8.9)
        { 0x90, 128 }, // Hopper Generation (SM 9.0)
    };

    // Search for matching version
    const int sm_version = (major << 4) + minor;
    const int nb_arch_cores = sizeof(GpuArchCoresPerSM) / sizeof(GpuArchCoresPerSM[0]);
    for (int i = 0; i < nb_arch_cores; i++)
    {
        if (GpuArchCoresPerSM[i].SM == sm_version)
            return GpuArchCoresPerSM[i].Cores;
    }

    // No match: assume the device is newer than the table and use the last known architecture
    const GpuArchCoresPerSM_t& last_arch = GpuArchCoresPerSM[nb_arch_cores - 1];
    core::LogText("MapSMToCores for SM %d.%d is undefined. Default to use %d Cores/SM", major, minor, last_arch.Cores);
    return last_arch.Cores;
}
}
// Logs a failed CUDA driver call. Returns true when result is an error, false when it is not.
bool cuda::HandleError(CUresult result, const char* expression, const char* file, int line)
{
    // A zero result means the call succeeded and there is nothing to report
    if (!result)
        return false;

    // Ask the driver for the symbolic name of the error, with a placeholder if it has none
    const char* error_name = nullptr;
    const CUresult name_result = cuGetErrorName(result, &error_name);
    if (name_result != CUDA_SUCCESS)
        error_name = "<Unknown>";

    // Likewise for the human-readable description
    const char* error_desc = nullptr;
    const CUresult desc_result = cuGetErrorString(result, &error_desc);
    if (desc_result != CUDA_SUCCESS)
        error_desc = "<Unknown>";

    core::LogText("CUDA: Error at %s:%d (%s) code=%d(%s: %s)", file, line, expression, result, error_name, error_desc);

    // TODO: device reset/exit?
    return true;
}
int cuda::GetMaxGflopsDeviceId()
{
// Get device count
int device_count = 0;
cudaGetDeviceCount(&device_count);
// Find the best major SM architecture GPU device
int best_sm_arch = 0;
for (int i = 0; i < device_count; i++)
{
cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, i);
// If this GPU is not running on Compute Mode prohibited then we can add it to the list
if (device_prop.computeMode != cudaComputeModeProhibited)
{
if (device_prop.major > 0 && device_prop.major < 9999)
best_sm_arch = max(best_sm_arch, device_prop.major);
}
}
// Find the best CUDA capable GPU device
int sm_per_multiproc = 0;
int max_compute_perf = 0;
int max_perf_device = 0;
for (int i = 0; i < device_count; i++)
{
cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, i);
// If this GPU is not running on Compute Mode prohibited then we can add it to the list
if (device_prop.major == 9999 && device_prop.minor == 9999)
sm_per_multiproc = 1;
else
sm_per_multiproc = MapSMToCores(device_prop.major, device_prop.minor);
int compute_perf = device_prop.multiProcessorCount * sm_per_multiproc * device_prop.clockRate;
if (compute_perf > max_compute_perf)
{
// If we find GPU with SM major >2, search only these
if (best_sm_arch > 2)
{
if (device_prop.major == best_sm_arch)
{
max_compute_perf = compute_perf;
max_perf_device = i;
}
}
else
{
max_compute_perf = compute_perf;
max_perf_device = i;
}
}
}
return max_perf_device;
}
// Makes context the calling thread's current CUDA context if it isn't already.
// Driver failures are logged through cudaHasError instead of being dropped.
void cuda::EnsureContext(CUcontext context)
{
    // Initialised so that a failed query is treated as "no current context" rather than
    // comparing against an indeterminate value
    CUcontext current = nullptr;
    if (cudaHasError(cuCtxGetCurrent(&current)))
        current = nullptr;

    if (current != context)
        cudaHasError(cuCtxSetCurrent(context));
}
// Creates the non-blocking stream that backs this queue. On failure the stream is left null.
cuda::Queue::Queue(CUcontext context)
    : m_Context(context)
    , m_Stream(nullptr)
{
    SetObjectType(this);

    const bool create_failed = cudaHasError(cuStreamCreate(&m_Stream, CU_STREAM_NON_BLOCKING));
    if (create_failed)
        m_Stream = nullptr;
}
// Destroys the stream, if the constructor managed to create one
cuda::Queue::~Queue()
{
    if (m_Stream == nullptr)
        return;

    cuStreamDestroy(m_Stream);
}
// Launches a kernel on this queue's stream.
//
//    h_kernel     Kernel handle; must be a cuda::Kernel
//    args         Queued arguments. Device memory arguments are passed by device pointer, texture
//                 arguments are bound to the kernel's texture/surface references in declaration
//                 order, everything else is passed by value.
//    nb_items     Number of work items; a launch of zero items succeeds without doing anything
//    block_size   Threads per block. cmp::MAX_KERNEL_BLOCK_SIZE or any non-positive value selects
//                 the default of 512. Always clamped to nb_items.
//
// Returns false if the launch is rejected by the driver.
bool cuda::Queue::RunKernel(const cmp::HndKernel* h_kernel, const cmp::KernelArgs& args, u32 nb_items, int block_size)
{
    static clcpp::uint32 HASH_DeviceMemory = clcpp::GetTypeNameHash<cmp::DeviceMem>();
    static clcpp::uint32 HASH_HndTexture3D = clcpp::GetTypeNameHash<cmp::Texture3D>();

    cuda::EnsureContext(m_Context);

    // Cast to local implementations
    core::Assert(h_kernel != nullptr);
    const cuda::Kernel* kernel = h_kernel->Cast<cuda::Kernel>();

    // Nothing to launch; also keeps the block count calculation below free of division by zero
    if (nb_items == 0)
        return true;

    // Lock the parent program so that it can't be reloaded while in use
    // TODO: This interferes with multi-thread launching of different kernels in the same program
    mpp::MutexLock lock(kernel->program->mutex);

    // Build a list of pointers to each argument
    static const int MAX_NB_ARGS = 10;
    void* arg_ptrs[MAX_NB_ARGS] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    cmp::KernelArg arg;
    cmp::KernelArgIterator i(args);
    u32 arg_index = 0;
    u32 tex_arg_index = 0;
    while (i.GetNext(arg))
    {
        core::Assert(arg.index < MAX_NB_ARGS);

        clcpp::uint32 type_hash = arg.data_type->name.hash;
        if (type_hash == HASH_DeviceMemory)
        {
            // Device memory is passed to the kernel as its device pointer
            core::Assert(arg_index < (u32)MAX_NB_ARGS);
            cuda::DeviceMemory* dev_mem = arg.Get<cuda::DeviceMemory*>();
            arg_ptrs[arg_index++] = &dev_mem->ptr;
        }
        else if (type_hash == HASH_HndTexture3D)
        {
            cuda::Texture3D* texture = arg.Get<cuda::Texture3D*>();

            // Get the next texture parameter; textures don't occupy a slot in arg_ptrs
            core::Assert(tex_arg_index < kernel->texture_params.size());
            const cuda::KernelTextureParam& tex_param = kernel->texture_params[tex_arg_index++];

            if (tex_param.ref_type == 't')
            {
                // The reference must exist before any of the calls below touch it
                core::Assert(tex_param.tex_ref != nullptr);

                // Ensure dimensions/read mode match the texture
                if (tex_param.read_type == 'u')
                    core::Assert((texture->flags & CU_TRSF_READ_AS_INTEGER) != 0);
                else
                    core::Assert((texture->flags & CU_TRSF_READ_AS_INTEGER) == 0);

                // Override texture reference settings with those in the texture
                cuTexRefSetFilterMode(tex_param.tex_ref, texture->filter_mode);
                cuTexRefSetAddressMode(tex_param.tex_ref, 0, texture->address_mode);
                cuTexRefSetAddressMode(tex_param.tex_ref, 1, texture->address_mode);
                cuTexRefSetAddressMode(tex_param.tex_ref, 2, texture->address_mode);
                cuTexRefSetFlags(tex_param.tex_ref, texture->flags);

                // Bind the texture's array data to the reference
                cuTexRefSetArray(tex_param.tex_ref, texture->array, CU_TRSA_OVERRIDE_FORMAT);
            }
            else if (tex_param.ref_type == 's')
            {
                // Bind the texture's array data to the surface reference
                core::Assert(tex_param.surf_ref != nullptr);
                cuSurfRefSetArray(tex_param.surf_ref, texture->array, 0);
            }
        }
        else
        {
            // Plain values are passed by pointing at their storage in the argument list
            core::Assert(arg_index < (u32)MAX_NB_ARGS);
            arg_ptrs[arg_index++] = arg.Data();
        }
    }

    // Assign max work items if necessary, never using more threads than there are items
    // TODO: Set correct block size for target hardware!
    u32 threads_per_block = 512;
    if (block_size != cmp::MAX_KERNEL_BLOCK_SIZE && block_size > 0)
        threads_per_block = (u32)block_size;
    if (threads_per_block > nb_items)
        threads_per_block = nb_items;

    // Round the block count up so that every item is covered, without overflowing u32
    const u32 nb_blocks = nb_items / threads_per_block + ((nb_items % threads_per_block) != 0 ? 1 : 0);

    bool has_error = cudaHasError(cuLaunchKernel(kernel->function, nb_blocks, 1, 1, threads_per_block, 1, 1, 0, m_Stream, arg_ptrs, NULL));
    return !has_error;
}
// Queues a copy of src_size bytes from host memory into device memory on this queue's stream
bool cuda::Queue::CopyHostToDevice(const void* src_data, u32 src_size, cmp::DeviceMem* h_dev_mem)
{
    cuda::EnsureContext(m_Context);

    // Both ends of the copy must exist
    core::Assert(h_dev_mem != nullptr);
    core::Assert(src_data != nullptr);

    // Downcast the interface handle to the CUDA implementation; the upload must fit the allocation
    const cuda::DeviceMemory* dev_mem = static_cast<const cuda::DeviceMemory*>(h_dev_mem);
    core::Assert(src_size <= dev_mem->Desc().size);

    // Place write in the stream
    const bool has_error = cudaHasError(cuMemcpyHtoDAsync(dev_mem->ptr, src_data, src_size, m_Stream));
    return !has_error;
}
// Queues a copy of dst_size bytes from device memory into host memory on this queue's stream
bool cuda::Queue::CopyDeviceToHost(cmp::DeviceMem *h_dev_mem, void *dst_data, u32 dst_size)
{
    cuda::EnsureContext(m_Context);

    // Both ends of the copy must exist
    core::Assert(dst_data != nullptr);
    core::Assert(h_dev_mem != nullptr);

    // Downcast the interface handle to the CUDA implementation; can't read more than was allocated
    const cuda::DeviceMemory* dev_mem = static_cast<const cuda::DeviceMemory*>(h_dev_mem);
    core::Assert(dst_size <= dev_mem->Desc().size);

    // Place read in the stream
    const bool has_error = cudaHasError(cuMemcpyDtoHAsync(dst_data, dev_mem->ptr, dst_size, m_Stream));
    return !has_error;
}
// Queues a copy of a src_pitch x src_height x src_depth region from host memory into the
// texture's array on this queue's stream. src_pitch is the row size in bytes.
bool cuda::Queue::CopyHostToTexture3D(const void* src_data, u32 src_pitch, u32 src_height, u32 src_depth, cmp::Texture3D* h_texture)
{
    cuda::EnsureContext(m_Context);

    core::Assert(h_texture != nullptr);
    core::Assert(src_data != nullptr);

    // Downcast the interface handle to the CUDA implementation
    const cuda::Texture3D* texture = static_cast<const cuda::Texture3D*>(h_texture);

    // The source region must fit inside the texture, which must own an array
    const cmp::TextureDesc& desc = texture->Desc();
    core::Assert(src_pitch <= desc.pitch);
    core::Assert(src_height <= desc.height);
    core::Assert(src_depth <= desc.depth);
    core::Assert(texture->array != nullptr);

    // Describe the host to array copy, starting from an all-zero descriptor
    CUDA_MEMCPY3D copy;
    memset(&copy, 0, sizeof(copy));

    // Source: tightly described host memory
    copy.srcMemoryType = CU_MEMORYTYPE_HOST;
    copy.srcHost = src_data;
    copy.srcPitch = src_pitch;
    copy.srcHeight = src_height;

    // Destination: the texture's array
    copy.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    copy.dstArray = texture->array;

    // Extent of the copy
    copy.WidthInBytes = src_pitch;
    copy.Height = src_height;
    copy.Depth = src_depth;

    // Place the copy in the stream
    const bool has_error = cudaHasError(cuMemcpy3DAsync(&copy, m_Stream));
    return !has_error;
}
// Queues a copy of a dst_pitch x dst_height x dst_depth region from the texture's array into
// host memory on this queue's stream. dst_pitch is the row size in bytes.
bool cuda::Queue::CopyTexture3DToHost(cmp::Texture3D* h_texture, void* dst_data, u32 dst_pitch, u32 dst_height, u32 dst_depth)
{
    cuda::EnsureContext(m_Context);

    core::Assert(h_texture != nullptr);
    core::Assert(dst_data != nullptr);

    // Downcast the interface handle to the CUDA implementation
    const cuda::Texture3D* texture = static_cast<const cuda::Texture3D*>(h_texture);

    // The requested region must fit inside the texture, which must own an array
    const cmp::TextureDesc& desc = texture->Desc();
    core::Assert(dst_pitch <= desc.pitch);
    core::Assert(dst_height <= desc.height);
    core::Assert(dst_depth <= desc.depth);
    core::Assert(texture->array != nullptr);

    // Describe the array to host copy, starting from an all-zero descriptor
    CUDA_MEMCPY3D copy;
    memset(&copy, 0, sizeof(copy));

    // Source: the texture's array
    copy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
    copy.srcArray = texture->array;

    // Destination: tightly described host memory
    copy.dstMemoryType = CU_MEMORYTYPE_HOST;
    copy.dstHost = dst_data;
    copy.dstPitch = dst_pitch;
    copy.dstHeight = dst_height;

    // Extent of the copy
    copy.WidthInBytes = dst_pitch;
    copy.Height = dst_height;
    copy.Depth = dst_depth;

    // Place the copy in the stream
    const bool has_error = cudaHasError(cuMemcpy3DAsync(&copy, m_Stream));
    return !has_error;
}
// Maps a registered graphics resource, queues a copy from linear device memory into its first
// sub-resource array, and unmaps it again. pitch is the row size in bytes.
bool cuda::Queue::CopyDeviceToGfxTexture3D(cmp::DeviceMem* cmp_src_device, cmp::GfxResource* cmp_dst_gfxres, u32 pitch, u32 height, u32 depth)
{
    rmt_ScopedCPUSample(CopyDeviceToGfxTexture3D);

    core::Assert(cmp_src_device != nullptr);
    core::Assert(cmp_dst_gfxres != nullptr);
    cuda::DeviceMemory* devmem_source = (cuda::DeviceMemory*)cmp_src_device;
    cuda::GfxResource* gfxres_dest = (cuda::GfxResource*)cmp_dst_gfxres;

    // Map the graphics resource; nothing needs undoing if this fails
    core::Assert(gfxres_dest->graphics_resource != nullptr);
    if (cudaHasError(cuGraphicsMapResources(1, &gfxres_dest->graphics_resource, m_Stream)))
        return false;

    // From here on the resource is mapped, so every exit path has to unmap it
    bool copy_queued = false;

    // Get an array for CUDA access to the graphics resource
    CUarray array;
    if (!cudaHasError(cuGraphicsSubResourceGetMappedArray(&array, gfxres_dest->graphics_resource, 0, 0)))
    {
        // Describe the device to array copy, starting from an all-zero descriptor
        CUDA_MEMCPY3D copy;
        memset(&copy, 0, sizeof(copy));
        copy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
        copy.srcDevice = devmem_source->ptr;
        copy.srcPitch = pitch;
        copy.srcHeight = height;
        copy.dstMemoryType = CU_MEMORYTYPE_ARRAY;
        copy.dstArray = array;
        copy.WidthInBytes = pitch;
        copy.Height = height;
        copy.Depth = depth;

        copy_queued = !cudaHasError(cuMemcpy3DAsync(&copy, m_Stream));
    }

    // Failure path: unmap as best-effort cleanup, the original error has already been logged
    if (!copy_queued)
    {
        cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream);
        return false;
    }

    // Clean up with an unmap
    return !cudaHasError(cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream));
}
// Maps a registered graphics buffer, queues a device-to-device copy of size bytes into it, and
// unmaps it again. Returns false, with the resource unmapped, if any step fails or if size is
// larger than the mapped buffer.
bool cuda::Queue::CopyDeviceToGfxBuffer(cmp::DeviceMem* cmp_src_device, cmp::GfxResource* cmp_dst_gfxres, u32 size)
{
    rmt_ScopedCPUSample(CopyDeviceToGfxBuffer);

    core::Assert(cmp_src_device != nullptr);
    core::Assert(cmp_dst_gfxres != nullptr);
    cuda::DeviceMemory* devmem_source = (cuda::DeviceMemory*)cmp_src_device;
    cuda::GfxResource* gfxres_dest = (cuda::GfxResource*)cmp_dst_gfxres;

    // Can't read more than the source allocation holds (same check as the other copy functions)
    core::Assert(size <= devmem_source->Desc().size);

    // Map the graphics resource
    core::Assert(gfxres_dest->graphics_resource != nullptr);
    if (cudaHasError(cuGraphicsMapResources(1, &gfxres_dest->graphics_resource, m_Stream)))
        return false;

    // Get a device pointer for CUDA access to the graphics resource
    CUdeviceptr device_ptr = 0;
    size_t map_size = 0;
    if (cudaHasError(cuGraphicsResourceGetMappedPointer(&device_ptr, &map_size, gfxres_dest->graphics_resource)))
    {
        cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream);
        return false;
    }

    // Refuse to write past the end of the mapped buffer
    if (size > map_size)
    {
        core::LogText("CUDA: CopyDeviceToGfxBuffer size %u exceeds mapped resource size %u", size, (u32)map_size);
        cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream);
        return false;
    }

    // Device-device copy
    if (cudaHasError(cuMemcpyDtoDAsync(device_ptr, devmem_source->ptr, size, m_Stream)))
    {
        cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream);
        return false;
    }

    // Clean up with an unmap
    return !cudaHasError(cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream));
}
// Synchronises with this queue's stream, profiled on both the CPU and CUDA timelines
bool cuda::Queue::Sync()
{
    rmt_ScopedCPUSample(cudaQueueSync);
    rmt_ScopedCUDASample(cudaQueueSync, m_Stream);

    cuda::EnsureContext(m_Context);

    const bool has_error = cudaHasError(cuStreamSynchronize(m_Stream));
    return !has_error;
}
// Records the event in this queue's stream
bool cuda::Queue::MarkEvent(cmp::Event* cmp_event)
{
    // Cast to the local implementation
    core::Assert(cmp_event != nullptr);
    cuda::Event* event = (cuda::Event*)cmp_event;

    core::Assert(m_Stream != nullptr);
    const bool has_error = cudaHasError(cuEventRecord(event->event, m_Stream));
    return !has_error;
}
// Makes this queue's stream wait on the event
bool cuda::Queue::WaitEvent(cmp::Event* cmp_event)
{
    // Cast to the local implementation
    core::Assert(cmp_event != nullptr);
    cuda::Event* event = (cuda::Event*)cmp_event;

    core::Assert(m_Stream != nullptr);
    const bool has_error = cudaHasError(cuStreamWaitEvent(m_Stream, event->event, 0));
    return !has_error;
}
// Type-erased access to the underlying stream for code that only sees the cmp::Queue interface
void* cuda::Queue::Resource()
{
    void* stream_handle = m_Stream;
    return stream_handle;
}
// Typed access to the underlying stream; null if stream creation failed
CUstream cuda::Queue::Stream()
{
    return m_Stream;
}
// Starts out unattached: cuda::NewKernel assigns the program, name and function afterwards
cuda::Kernel::Kernel()
    : program(nullptr)
    , function(nullptr)
{
}
// Starts out empty: cuda::NewProgram assigns the module and id afterwards
cuda::Program::Program()
    : module(nullptr)
    , id(0)
    , loader(nullptr)
{
}
// Tears the program down in order: owned kernels, loader registration, then the CUDA module
cuda::Program::~Program()
{
    // Delete all kernels this program owns
    for (u32 kernel_index = 0; kernel_index < kernels.size(); ++kernel_index)
    {
        Delete(kernels[kernel_index]);
    }

    // Remove from the loader
    if (loader != nullptr)
    {
        loader->RemoveProgram(this);
    }

    // Release the module resource
    if (module != nullptr)
    {
        cuModuleUnload(module);
    }
}
// Allocates size bytes of linear device memory. On failure ptr is 0 and the description's size
// is left untouched.
cuda::DeviceMemory::DeviceMemory(u32 size)
    : ptr(0) // CUdeviceptr is an integer handle, not a C++ pointer
{
    SetObjectType(this);

    if (cudaHasError(cuMemAlloc(&ptr, size)))
    {
        // Don't trust the out-parameter after a failure; the destructor keys off this value
        ptr = 0;
        return;
    }

    m_Desc.size = size;
}
// Frees the device allocation, if there is one
cuda::DeviceMemory::~DeviceMemory()
{
    // CUdeviceptr is an integer handle, so compare against 0 rather than nullptr
    if (ptr != 0)
        cuMemFree(ptr);
}
// Allocates size bytes of page-locked host memory. On failure mapped_mem is null and the
// description's size is left untouched.
cuda::HostMemory::HostMemory(u32 size)
{
    SetObjectType(this);

    if (cudaHasError(cuMemAllocHost(&m_Desc.mapped_mem, size)))
    {
        // Don't trust the out-parameter after a failure; the destructor keys off this pointer
        // (mirrors how Queue resets its stream when creation fails)
        m_Desc.mapped_mem = nullptr;
        return;
    }

    m_Desc.size = size;
}
// Frees the page-locked allocation, if there is one
cuda::HostMemory::~HostMemory()
{
    if (m_Desc.mapped_mem == nullptr)
        return;

    cuMemFreeHost(m_Desc.mapped_mem);
}
// Returns the host pointer directly; the queue is unused
void* cuda::HostMemory::MapNow(cmp::Queue*)
{
    // The CUDA Driver tracks the virtual memory ranges of the allocated memory, so the pointer
    // obtained at allocation time is already usable and no explicit map step is required.
    void* host_ptr = m_Desc.mapped_mem;
    return host_ptr;
}
// Returns the host pointer directly; the queue is unused
void* cuda::HostMemory::Map(cmp::Queue*)
{
    // The CUDA Driver tracks the virtual memory ranges of the allocated memory, so the pointer
    // obtained at allocation time is already usable and no explicit map step is required.
    void* host_ptr = m_Desc.mapped_mem;
    return host_ptr;
}
// Intentionally empty: Map/MapNow hand out the allocation's own pointer, so there is no mapping
// to release
void cuda::HostMemory::Unmap(cmp::Queue*)
{
}
// Allocates a 3D CUDA array for the given dimensions and format.
//
// The format must use the same bit count (8, 16 or 32) for every channel it has, and its
// type/bit count pair must have an entry in g_ArrayFormatMap. Write and read/write access
// additionally enables surface load/store on the array.
//
// On failure array is null and the description is left untouched.
cuda::Texture3D::Texture3D(u32 width, u32 height, u32 depth, const Format& format, cmp::Access access)
    : array(nullptr)
    , address_mode(CU_TR_ADDRESS_MODE_CLAMP)
    , filter_mode(CU_TR_FILTER_MODE_POINT)
    , flags(0)
{
    SetObjectType(this);

    const FormatDesc& fmt_desc = FormatDesc_Get(format.fmt);

    // Ensure this is a format that has an equal bit-size for each channel
    if (fmt_desc.g_type != FmtType_None)
        core::Assert(fmt_desc.r_nb_bits == fmt_desc.g_nb_bits);
    if (fmt_desc.b_type != FmtType_None)
        core::Assert(fmt_desc.r_nb_bits == fmt_desc.b_nb_bits);
    if (fmt_desc.a_type != FmtType_None)
        core::Assert(fmt_desc.r_nb_bits == fmt_desc.a_nb_bits);

    // Transform bits counts 8,16,32 into the indices 0,1,2 while checking for unsupported bit counts
    u32 array_format_index = core::LogBase2(fmt_desc.r_nb_bits) - 3;
    core::Assert(array_format_index < 3);
    core::Assert((1 << (array_format_index + 3)) == fmt_desc.r_nb_bits);

    // Lookup the equivalent array format
    CUarray_format array_format = g_ArrayFormatMap[fmt_desc.r_type][array_format_index];
    core::Assert(array_format != g_NullArrayFormat);

    // https://devtalk.nvidia.com/default/topic/690069/?comment=4124250
    // Create the memory for the texture
    const bool needs_surface = (access == cmp::Access_Write || access == cmp::Access_ReadWrite);
    CUDA_ARRAY3D_DESCRIPTOR array_desc;
    memset(&array_desc, 0, sizeof(array_desc));
    array_desc.Width = width;
    array_desc.Height = height;
    array_desc.Depth = depth;
    array_desc.Format = array_format;
    array_desc.NumChannels = fmt_desc.nb_bits / fmt_desc.r_nb_bits;
    array_desc.Flags = needs_surface ? CUDA_ARRAY3D_SURFACE_LDST : 0;
    if (cudaHasError(cuArray3DCreate(&array, &array_desc)))
    {
        // Don't trust the out-parameter after a failure; the destructor keys off this pointer
        array = nullptr;
        return;
    }

    // Set view flags; address and filter modes keep the defaults from the initialiser list
    flags = g_TextureFlags[format.view]; // CU_TRSF_NORMALIZED_COORDINATES ?

    // Set description, including the format so that Desc() fully describes the texture
    m_Desc.width = width;
    m_Desc.height = height;
    m_Desc.depth = depth;
    m_Desc.pitch = width * fmt_desc.NbBytes();
    m_Desc.size = m_Desc.pitch * height * depth;
    m_Desc.format = format;
}
// Destroys the CUDA array, if one was created
cuda::Texture3D::~Texture3D()
{
    if (array == nullptr)
        return;

    cuArrayDestroy(array);
}
// Registers a D3D11 resource with CUDA and, if that works, marks it as write-discard for mapping
cuda::GfxResource::GfxResource(void* resource)
    : resource(resource)
    , graphics_resource(nullptr)
{
    SetObjectType(this);

    // Register D3D11 resource with CUDA
    const bool register_failed = cudaHasError(cuGraphicsD3D11RegisterResource(&graphics_resource, (ID3D11Resource*)resource, CU_GRAPHICS_REGISTER_FLAGS_NONE));
    if (register_failed)
        return;

    // Failure to set the map flags is logged but not otherwise acted upon
    cudaHasError(cuGraphicsResourceSetMapFlags(graphics_resource, CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD));
}
// Unregisters the resource from CUDA, if it is registered
cuda::GfxResource::~GfxResource()
{
    if (graphics_resource == nullptr)
        return;

    cuGraphicsUnregisterResource(graphics_resource);
}
// Creates a blocking-sync event with timing disabled; a failure is logged and not otherwise handled
cuda::Event::Event()
    : event(nullptr)
{
    SetObjectType(this);

    const unsigned int event_flags = CU_EVENT_BLOCKING_SYNC | CU_EVENT_DISABLE_TIMING;
    cudaHasError(cuEventCreate(&event, event_flags));
}
// Destroys the event, if there is one
cuda::Event::~Event()
{
    if (event == nullptr)
        return;

    cuEventDestroy(event);
}
void cuda::Init()
{
// Set default texture flags
for (u32 i = 0; i < FmtView_Count; i++)
g_TextureFlags[i] = 0;
// Set supported texture flags
g_TextureFlags[FmtView_UInt] = CU_TRSF_READ_AS_INTEGER;
g_TextureFlags[FmtView_SInt] = CU_TRSF_READ_AS_INTEGER;
g_TextureFlags[FmtView_UNormSRGB] = CU_TRSF_SRGB;
// Set default array formats
for (u32 i = 0; i < FmtType_Count; i++)
{
for (u32 j = 0; j < 3; j++)
g_ArrayFormatMap[i][j] = g_NullArrayFormat;
}
// Set supported array formats
g_ArrayFormatMap[FmtType_UInt][0] = CU_AD_FORMAT_UNSIGNED_INT8;
g_ArrayFormatMap[FmtType_UInt][1] = CU_AD_FORMAT_UNSIGNED_INT16;
g_ArrayFormatMap[FmtType_UInt][2] = CU_AD_FORMAT_UNSIGNED_INT32;
g_ArrayFormatMap[FmtType_SInt][0] = CU_AD_FORMAT_SIGNED_INT8;
g_ArrayFormatMap[FmtType_SInt][1] = CU_AD_FORMAT_SIGNED_INT16;
g_ArrayFormatMap[FmtType_SInt][2] = CU_AD_FORMAT_SIGNED_INT32;
g_ArrayFormatMap[FmtType_Float][1] = CU_AD_FORMAT_HALF;
g_ArrayFormatMap[FmtType_Float][2] = CU_AD_FORMAT_FLOAT;
}
// Loads a PTX file from the game path and hands it to the driver, which builds it on demand.
//
//    filename   Path of the PTX file, relative to the game path
//    response   Receives the driver's info log, appended whether or not the load succeeds
//
// Returns the loaded module, or nullptr if the file can't be opened, is empty, or fails to load.
CUmodule cuda::LoadPTXModule(const file::Path& filename, core::String256& response)
{
    // Load the program from disk
    file::Path full_path = file::MakeGamePath(filename.c_str());
    file::File file(full_path.c_str(), "rb");
    if (!file.IsOpen())
        return nullptr;
    u32 program_size = file.GetSize();
    if (program_size == 0)
        return nullptr;

    // PTX is text, so null-terminate it for the driver
    char* program_data = new char[program_size + 1];
    file.Read(program_data, program_size);
    program_data[program_size] = 0;

    // Create a build log, zero-initialised so that it reads as an empty string if the driver
    // never writes to it
    const u32 log_capacity = 1024;
    char* build_log = new char[log_capacity]();

    // Describe build options
    core::Vector<CUjit_option> options;
    options.push_back(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES);
    options.push_back(CU_JIT_INFO_LOG_BUFFER);

    // Set option values; integer options travel inside the pointer-sized slot, so widen via
    // size_t to stay well-formed on 64-bit targets
    core::Vector<void*> option_values;
    option_values.push_back((void*)(size_t)log_capacity);
    option_values.push_back(build_log);

    // Load the module
    // As this is a PTX file, the driver will on-demand build the binary
    CUmodule module = nullptr;
    const bool load_failed = cudaHasError(cuModuleLoadDataEx(&module, program_data, options.size(), options.data(), option_values.data()));

    // The driver writes the number of log bytes back into the size option; never trust it
    // beyond the buffer that was actually provided
    u32 log_size = (u32)(size_t)option_values[0];
    if (log_size > log_capacity)
        log_size = log_capacity;

    // Add build log to response, on success and failure alike
    // NOTE(review): the log can be up to 1024 bytes while the response is a String256 - confirm
    // that core::String256 truncates rather than overflows.
    core::String256 build_log_str(build_log, log_size);
    response.append(build_log_str);

    delete [] build_log;
    delete [] program_data;

    return load_failed ? nullptr : module;
}
// Loads the PTX file and wraps the resulting module in a new Program. Returns nullptr if the
// module can't be loaded or the Program can't be allocated.
cuda::Program* cuda::NewProgram(const file::Path& filename)
{
    // Load the module; the build log collected in response is not used any further here
    core::String256 response;
    CUmodule loaded_module = LoadPTXModule(filename, response);
    if (loaded_module == nullptr)
        return nullptr;

    // Create the program object, giving the module back to the driver if that fails
    Program* new_program = New<Program>();
    if (new_program == nullptr)
    {
        cuModuleUnload(loaded_module);
        return nullptr;
    }

    // Setup program: the id is derived from the filename
    new_program->module = loaded_module;
    new_program->id = core::MakeNameID(filename.c_str());
    return new_program;
}
// Looks up the named function in the program's module and wraps it in a new Kernel. Returns
// nullptr if the function doesn't exist or the Kernel can't be allocated.
cuda::Kernel* cuda::NewKernel(Program* program, const core::String32& name)
{
    // Get the kernel function
    CUfunction function;
    const bool lookup_failed = cudaHasError(cuModuleGetFunction(&function, program->module, name.c_str()));
    if (lookup_failed)
        return nullptr;

    // Create the kernel object
    Kernel* new_kernel = New<Kernel>();
    if (new_kernel != nullptr)
    {
        // Setup kernel
        new_kernel->program = program;
        new_kernel->name = name;
        new_kernel->function = function;
    }

    return new_kernel;
}
@@ -1,228 +0,0 @@
#pragma once
#include <Core/Compute.h>
#include <Core/JobSystem.h>
#include <Core/PixelFormat.h>
class CUDACompute;
// CUDA APIs currently don't compile with clang, which is what clReflect uses
// However, it's very similar to GCC so define some needed macros from GCC
#ifdef __clcpp_parse__
#define __noinline__ __attribute__((noinline))
#define __forceinline__ __inline__ __attribute__((always_inline))
#define __align__(n) __attribute__((aligned(n)))
#define __thread__ __thread
#define __import__
#define __export__
#define __cdecl
#define __annotate__(a) __attribute__((a))
#define __location__(a) __annotate__(a)
#define CUDARTAPI
#endif
struct IDXGIAdapter;
struct ID3D11Device;
struct ID3D11Resource;
#include <cuda.h>
#include <cudad3d11.h>
#include <cuda_runtime.h>
// Passes the CUresult of a CUDA driver call to cuda::HandleError, together with the call's source
// text and the file/line of the call site. Evaluates to true when the call FAILED, so typical
// use is `if (cudaHasError(cuFoo(...))) return false;`.
#define cudaHasError(result) cuda::HandleError((result), #result, __FILE__, __LINE__)
clcpp_reflect_part(cuda)
namespace cuda
{
struct Program;
// Return if a CUDA function call returns an error, logging the error string and where the function was called
bool HandleError(CUresult result, const char* expression, const char* file, int line);
// Search all attached devices for the one with the highest performance
int GetMaxGflopsDeviceId();
// Ensure the current thread has the given context active
void EnsureContext(CUcontext context);
//
// Compute queues implemented as CUDA streams
//
class clcpp_attr(reflect_part) Queue : public cmp::Queue
{
public:
    // Creates the stream; context is made current before driver calls in most operations
    Queue(CUcontext context);
    virtual ~Queue();

    // cmp::Queue: kernel launch
    virtual bool RunKernel(const cmp::HndKernel* h_kernel, const cmp::KernelArgs& args, u32 nb_items, int block_size);

    // cmp::Queue: host <-> device transfers
    virtual bool CopyHostToDevice(const void* src_data, u32 src_size, cmp::DeviceMem* h_dev_mem);
    virtual bool CopyDeviceToHost(cmp::DeviceMem* h_dev_mem, void* dst_data, u32 dst_size);
    virtual bool CopyHostToTexture3D(const void* src_data, u32 src_pitch, u32 src_height, u32 src_depth, cmp::Texture3D* h_texture);
    virtual bool CopyTexture3DToHost(cmp::Texture3D* h_texture, void* dst_data, u32 dst_pitch, u32 dst_height, u32 dst_depth);

    // cmp::Queue: device -> graphics resource transfers
    virtual bool CopyDeviceToGfxTexture3D(cmp::DeviceMem* src_device, cmp::GfxResource* dst_gfxres, u32 pitch, u32 height, u32 depth);
    virtual bool CopyDeviceToGfxBuffer(cmp::DeviceMem* src_device, cmp::GfxResource* dst_gfxres, u32 size);

    // cmp::Queue: synchronisation
    virtual bool Sync();
    virtual bool MarkEvent(cmp::Event* event);
    virtual bool WaitEvent(cmp::Event* event);

    // The stream, type-erased for the interface and typed for CUDA-aware callers
    virtual void* Resource();
    CUstream Stream();

private:
    CUcontext m_Context;

    // Null if stream creation failed
    CUstream m_Stream;
};
//
// Describes one global texture or surface reference used by a kernel. The descriptive fields are read
// from the texture reflection file and the matching handle is then looked up in the loaded module
// (see LoadTextureReflection in CUDACompute.cpp).
//
struct clcpp_attr(reflect_part) KernelTextureParam
{
// Starts with zeroed type info and no resolved references
KernelTextureParam()
: ref_type(0)
, dimensions(0)
, read_type(0)
, tex_ref(nullptr)
, surf_ref(nullptr)
{
}
// Name of global reference this parameter maps to
core::String256 global_name;
// Type info
// ref_type is 't' for a texture reference or 's' for a surface reference; any other value leaves
// both handles below unresolved
char ref_type;
u32 dimensions;
char read_type;
// Texture or surface reference, dependent upon ref_type
CUtexref tex_ref;
CUsurfref surf_ref;
};
//
// Kernels are function handles within a module with no resource to manage
//
// The function handle is replaced in place when the owning program is reloaded, which is why the
// kernel name is retained.
//
struct clcpp_attr(reflect_part) Kernel : public cmp::HndKernel
{
Kernel();
// Parent program
Program* program;
// Keep name around for reloads
core::String32 name;
// Handle to function
CUfunction function;
// Texture/surface parameters copied in from the texture reflection file for this kernel
core::Vector<KernelTextureParam> texture_params;
};
//
// Programs are CUDA modules, controlling a list of Kernel objects
//
struct clcpp_attr(reflect_part) Program : public cmp::HndProgram
{
Program();
~Program();
// Loaded module handle; swapped for a freshly loaded module when the source file changes
CUmodule module;
// Filename hash for reloads
u32 id;
// Pointer to the program loader for removal on destruction
CUDACompute* loader;
// Allocated kernels owned by this program
core::Vector<Kernel*> kernels;
// Mutex for reloading programs
// Held by CUDACompute::FilesChanged for the duration of a reload
mpp::Mutex mutex;
};
//
// Device memory allocation of a fixed byte size
//
struct clcpp_attr(reflect_part) DeviceMemory : public cmp::DeviceMem
{
DeviceMemory(u32 size);
virtual ~DeviceMemory();
// Device address of the allocation; CUDACompute::DeviceMem_New treats a null value after
// construction as a failed allocation
CUdeviceptr ptr;
};
//
// Page-locked host memory that is accessible to the device, useful in small parts for staging areas
//
// CUDACompute::HostMem_New checks Desc().mapped_mem after construction and treats null as failure.
//
struct clcpp_attr(reflect_part) HostMemory : public cmp::HostMem
{
HostMemory(u32 size);
virtual ~HostMemory();
// Interface implementations
// NOTE(review): MapNow presumably blocks until the memory is usable while Map does not - confirm
// against the cmp::HostMem interface
virtual void* MapNow(cmp::Queue* queue);
virtual void* Map(cmp::Queue* queue);
virtual void Unmap(cmp::Queue* queue);
};
//
// 3D texture backed by a CUDA array, along with the sampling state used when it is bound
//
struct clcpp_attr(reflect_part) Texture3D : public cmp::Texture3D
{
Texture3D(u32 width, u32 height, u32 depth, const Format& format, cmp::Access access);
virtual ~Texture3D();
// Allocated device memory
// CUDACompute::Texture3D_New treats a null array after construction as a failed allocation
CUarray array;
// View flags
CUaddress_mode address_mode;
CUfilter_mode filter_mode;
u32 flags;
};
//
// A graphics API resource made available to CUDA
//
struct clcpp_attr(reflect_part) GfxResource : public cmp::GfxResource
{
GfxResource(void* resource);
~GfxResource();
// The source D3D resource registered for use with CUDA
void* resource;
// The CUDA interop object
// CUDACompute::GfxResource_New treats a null value after construction as a failed registration
CUgraphicsResource graphics_resource;
};
//
// Wrapper around a CUDA event handle for use with Queue::MarkEvent/WaitEvent
//
struct clcpp_attr(reflect_part) Event : public cmp::Event
{
Event();
~Event();
// Event handle; CUDACompute::Event_New treats a null value after construction as a failed creation
CUevent event;
};
// Backend initialisation, called once by CUDACompute after the main context has been created
void Init();
// Load a module from the given PTX file. Callers treat a null return as a failed load.
// NOTE(review): response presumably receives any load/compile messages - confirm in the implementation.
CUmodule LoadPTXModule(const file::Path& filename, core::String256& response);
// Create a program from the given PTX file, returning null on failure
Program* NewProgram(const file::Path& filename);
// Create a kernel object for the named function within the program
Kernel* NewKernel(Program* program, const core::String32& name);
}
@@ -1,401 +0,0 @@
// TODO: Is it allowed for the same context to be pushed on multiple thread contexts at the same time?
#include "CUDACompute.h"
#include <Core/Math.h>
#include <Core/File.h>
clcpp_impl_class(CUDACompute)
// TODO: Got to move these
// One idea would be to get a new tool to generate a C++ file with all export functions
#include <Core/CoreReflection.h>
clcpp_impl_class(core::VectorReadIterator)
clcpp_impl_class(core::VectorWriteIterator)
clcpp_impl_class(core::String32)
clcpp_impl_class(core::String64)
clcpp_impl_class(core::String256)
namespace
{
// Finds the first kernel in the program whose name equals kernel_name.
// Returns null when the program owns no kernel of that name.
cuda::Kernel* GetKernel(cuda::Program* program, const core::String256& kernel_name)
{
    // Walk the kernel list in order, stopping at the first name match
    u32 index = 0;
    while (index < program->kernels.size())
    {
        cuda::Kernel* candidate = program->kernels[index];
        if (candidate->name == kernel_name)
            return candidate;
        index++;
    }

    // Ran off the end without a match
    return nullptr;
}
// Reads exactly `length` bytes from the file into `string`, which is resized to `length` first.
// Returns false if fewer bytes than requested were read.
bool ReadString(file::File& fp, core::String256& string, u32 length)
{
    // Size the destination before reading directly into its storage
    string.set_length(length);

    // A short read counts as failure
    if (fp.Read(string.data(), length) == length)
        return true;
    return false;
}
// Reads a length-prefixed string: a u32 byte count followed by that many characters.
// Returns false if either the count or the characters can't be read.
bool ReadString(file::File& fp, core::String256& string)
{
    // The character count comes first
    u32 nb_chars = 0;
    if (file::Read(fp, nb_chars))
    {
        // Then the characters themselves
        return ReadString(fp, string, nb_chars);
    }

    return false;
}
//
// Loads a texture reflection file and attaches the texture/surface parameters it describes to the
// kernels of `program`.
//
// Data is consumed in this order:
//
//    "CUDAKernelTextureParams" identifier (23 characters, no length prefix)
//    u32 function count
//    per function:
//       length-prefixed function name
//       u32 parameter count
//       per parameter: length-prefixed global name, ref_type, dimensions, read_type
//
// Functions that have no kernel of the same name in the program are read and then skipped.
//
// Returns false if the file can't be opened, the identifier doesn't match or any read fails. Kernels
// already updated before a failing read keep their new parameters. A texture/surface reference that
// can't be found in the module is logged and stored as null rather than failing the whole load.
//
bool LoadTextureReflection(const file::Path& filename, cuda::Program* program)
{
    core::Assert(program != nullptr);

    // Open for read
    file::Path full_path = file::MakeGamePath(filename.c_str());
    file::File fp(full_path.c_str(), "rb");
    if (!fp.IsOpen())
        return false;

    // Ensure the ID matches (23 is the character count of the identifier below)
    core::String256 id;
    if (!ReadString(fp, id, 23))
        return false;
    if (id != core::String256("CUDAKernelTextureParams"))
        return false;

    CUmodule module = program->module;

    // Read info for all functions
    u32 nb_functions = 0;
    if (!file::Read(fp, nb_functions))
        return false;
    for (u32 i = 0; i < nb_functions; i++)
    {
        // Read the function name
        core::String256 function_name;
        if (!ReadString(fp, function_name))
            return false;

        // Allocate enough space for all parameters in this function
        u32 nb_params = 0;
        if (!file::Read(fp, nb_params))
            return false;
        core::Vector<cuda::KernelTextureParam> texture_params(nb_params);

        // Read all texture parameter objects. This happens even for functions with no matching kernel
        // so that the file position ends up at the start of the next function.
        for (u32 j = 0; j < nb_params; j++)
        {
            cuda::KernelTextureParam& param = texture_params[j];
            if (!ReadString(fp, param.global_name))
                return false;
            if (!file::Read(fp, param.ref_type))
                return false;
            if (!file::Read(fp, param.dimensions))
                return false;
            if (!file::Read(fp, param.read_type))
                return false;
        }

        // Attempt to get a matching kernel
        cuda::Kernel* kernel = GetKernel(program, function_name);
        if (kernel == nullptr)
            continue;

        // Get global texture/surface references for each parameter. Lookup failures are logged by
        // cudaHasError and leave a null handle instead of whatever the driver call left behind.
        for (u32 j = 0; j < nb_params; j++)
        {
            cuda::KernelTextureParam& param = texture_params[j];
            if (param.ref_type == 't')
            {
                if (cudaHasError(cuModuleGetTexRef(&param.tex_ref, module, param.global_name.c_str())))
                    param.tex_ref = nullptr;
            }
            else if (param.ref_type == 's')
            {
                if (cudaHasError(cuModuleGetSurfRef(&param.surf_ref, module, param.global_name.c_str())))
                    param.surf_ref = nullptr;
            }
        }

        // Store for runtime use in the kernel
        kernel->texture_params.copy_from(texture_params);
    }

    return true;
}
}
//
// Initialises the CUDA driver API, selects a device, creates the main context and binds it to Remotery.
//
// Any failing driver call is logged by cudaHasError and ends construction early. If that happens
// before the context is created, m_Context is still null, which the destructor checks for.
//
CUDACompute::CUDACompute()
    : m_DeviceID(-1)
    , m_Device(-1)
    , m_Context(nullptr)
{
    // Initialise the driver API
    core::LogText("CUDA: Initialising Driver API");
    if (cudaHasError(cuInit(0)))
        return;

    // Report driver version
    int driver_version = 0;
    if (cudaHasError(cuDriverGetVersion(&driver_version)))
        return;
    core::LogText("CUDA: Driver version %d", driver_version);

    // Set the device with the highest gflops/s
    m_DeviceID = cuda::GetMaxGflopsDeviceId();
    if (cudaHasError(cuDeviceGet(&m_Device, m_DeviceID)))
        return;

    // Report what device is in use. The name is only used for logging so a failed query is not fatal,
    // but the buffer must hold a valid string before it is passed to the %s below.
    char name[100] = { 0 };
    if (cudaHasError(cuDeviceGetName(name, static_cast<int>(sizeof(name)), m_Device)))
        name[0] = 0;
    core::LogText("CUDA: Using device [%d]: %s", m_DeviceID, name);

    // Create the main context and pop it off the stack to allow other CUDA contexts elsewhere
    core::LogText("CUDA: Creating context");
    if (cudaHasError(cuCtxCreate(&m_Context, CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST, m_Device)))
        return;

    // Pop into a local so that a failed pop can't overwrite the context handle that was just created
    CUcontext popped_context = nullptr;
    cudaHasError(cuCtxPopCurrent(&popped_context));

    cuda::Init();

    // Bind to remotery
    rmtCUDABind bind;
    bind.context = m_Context;
    bind.CtxSetCurrent = &cuCtxSetCurrent;
    bind.CtxGetCurrent = &cuCtxGetCurrent;
    bind.EventCreate = &cuEventCreate;
    bind.EventDestroy = &cuEventDestroy;
    bind.EventRecord = &cuEventRecord;
    bind.EventQuery = &cuEventQuery;
    bind.EventElapsedTime = &cuEventElapsedTime;
    rmt_BindCUDA(&bind);
}
// Destroys the main context, if construction got far enough to create one.
CUDACompute::~CUDACompute()
{
    // Construction can bail out before the context exists
    if (m_Context == nullptr)
        return;

    cuCtxDestroy(m_Context);
}
//
// Reloads every loaded program whose filename ID matches one of the changed files.
//
// For each match a new module is loaded from the changed file; only if that succeeds is the old module
// unloaded, the kernel function handles refreshed and the texture reflection (".ckt") reloaded.
// Progress text is appended to `response`.
//
// Returns true if at least one program was fully reloaded.
//
// A program is never destroyed here: callers hold handles to it and its mutex is locked for the
// duration of the reload, so a failed reflection reload is reported and the program left in place.
//
bool CUDACompute::FilesChanged(const core::Vector<file::Path>& filenames, core::String256& response)
{
    cuda::EnsureContext(m_Context);

    bool changed = false;
    for (u32 i = 0; i < filenames.size(); i++)
    {
        u32 program_id = core::MakeNameID(filenames[i].c_str());

        // Search for a matching program ID - note that there may be many programs matching the same ID
        for (u32 j = 0; j < m_Programs.size(); j++)
        {
            cuda::Program* program = m_Programs[j];
            if (program->id != program_id)
                continue;

            // Ensure reloads don't happen while a program/kernel is in use
            mpp::MutexLock lock(program->mutex);

            response += core::String256(" Compiling ");
            response += filenames[i];
            response += core::String256("\n");

            // Reload/compile the new program and don't do anything if it fails
            CUmodule module = cuda::LoadPTXModule(filenames[i], response);
            if (module == nullptr)
                break;
            response += core::String256(" Success - reloading\n");

            // Release the old program
            if (program->module != nullptr)
                cuModuleUnload(program->module);

            // Set the new program and retrieve an all new set of kernel functions
            // NOTE(review): stopping at the first failed lookup leaves the remaining kernels holding
            // handles from the module that was just unloaded - confirm whether that is intended
            program->module = module;
            for (u32 k = 0; k < program->kernels.size(); k++)
            {
                cuda::Kernel* kernel = program->kernels[k];
                CUfunction function;
                if (cudaHasError(cuModuleGetFunction(&function, program->module, kernel->name.c_str())))
                    break;
                kernel->function = function;
            }

            // Reload texture reflection
            file::Path pathless_filename, extension;
            file::SplitPathExt(filenames[i], pathless_filename, extension);
            file::Path ckt_filename = pathless_filename + core::String256(".ckt");
            if (!LoadTextureReflection(ckt_filename, program))
            {
                // Report the failure but keep the program alive. Deleting it here would destroy the
                // mutex that `lock` still holds and invalidate handles owned by callers.
                response += core::String256(" Failed to reload texture reflection\n");
                break;
            }

            changed = true;
        }
    }

    return changed;
}
//
// Loads "<filename>.ptx" as a new program, creates a kernel object for each requested kernel name and
// loads the accompanying "<filename>.ckt" texture reflection.
//
// Returns the program handle, or null if the program can't be created, a kernel object can't be
// created or the texture reflection fails to load. On success the program is recorded in the loader's
// program list so that it takes part in FilesChanged reloads.
//
const cmp::HndProgram* CUDACompute::Program_New(const file::Path& filename, const core::String32* kernel_names, u32 nb_kernels)
{
    cuda::EnsureContext(m_Context);

    // Load the program
    file::Path ptx_filename = filename + file::Path(".ptx");
    cuda::Program* program = cuda::NewProgram(file::NormalisePath(ptx_filename));
    if (program == nullptr)
        return nullptr;

    // Retrieve all kernels
    for (u32 i = 0; i < nb_kernels; i++)
    {
        cuda::Kernel* kernel = cuda::NewKernel(program, kernel_names[i]);

        // Every kernel lookup dereferences each entry in the list, so a null entry must never be stored.
        // NOTE(review): assumes NewKernel signals failure with null, as NewProgram does - confirm.
        if (kernel == nullptr)
        {
            Delete(program);
            return nullptr;
        }

        program->kernels.push_back(kernel);
    }

    // Load texture reflection
    file::Path ckt_filename = filename + core::String256(".ckt");
    if (!LoadTextureReflection(ckt_filename, program))
    {
        Delete(program);
        return nullptr;
    }

    // Record the program in the loader
    program->loader = this;
    m_Programs.push_back(program);

    return program;
}
// Looks up a kernel of the given program by name.
// Returns the first kernel whose name matches, or null if the program has none.
const cmp::HndKernel* CUDACompute::Program_GetKernel(const cmp::HndProgram* h_program, const core::String32& name)
{
    cuda::EnsureContext(m_Context);

    // Recover the backend program from the opaque handle
    const cuda::Program* program = h_program->Cast<cuda::Program>();

    // Scan the kernel list in order until a name matches
    const cuda::Kernel* match = nullptr;
    for (u32 i = 0; match == nullptr && i < program->kernels.size(); i++)
    {
        cuda::Kernel* kernel = program->kernels[i];
        if (kernel->name == name)
            match = kernel;
    }

    return match;
}
// Allocates `size` bytes of device memory.
// Returns null if the allocation leaves the device address unset.
cmp::DeviceMem* CUDACompute::DeviceMem_New(u32 size, cmp::Access)
{
    cuda::EnsureContext(m_Context);

    // Allocate device memory, ignoring flags (only required by OpenCL)
    cuda::DeviceMemory* device_mem = new cuda::DeviceMemory(size);

    // CUdeviceptr is an integer handle in the driver API rather than a pointer type, so test it
    // against zero instead of nullptr
    if (device_mem->ptr == 0)
    {
        delete device_mem;
        return nullptr;
    }

    return device_mem;
}
// Allocates `size` bytes of host memory for use with the device.
// Returns null if the allocation has no mapped pointer after construction.
cmp::HostMem* CUDACompute::HostMem_New(u32 size, cmp::Access access)
{
    cuda::EnsureContext(m_Context);

    // The access flags are not forwarded (only required by OpenCL)
    cuda::HostMemory* memory = new cuda::HostMemory(size);

    // A populated mapped pointer is the success indicator
    if (memory->Desc().mapped_mem != nullptr)
        return memory;

    delete memory;
    return nullptr;
}
// Creates a 3D texture of the given dimensions, format and access.
// Returns null if no backing array exists after construction.
cmp::Texture3D* CUDACompute::Texture3D_New(u32 width, u32 height, u32 depth, const Format& fmt, cmp::Access access)
{
    cuda::EnsureContext(m_Context);

    cuda::Texture3D* new_texture = new cuda::Texture3D(width, height, depth, fmt, access);

    // The backing array being set is the success indicator
    if (new_texture->array != nullptr)
        return new_texture;

    delete new_texture;
    return nullptr;
}
// Wraps a graphics API resource for use with CUDA.
// Returns null if no interop object exists after construction.
cmp::GfxResource* CUDACompute::GfxResource_New(void* resource)
{
    cuda::EnsureContext(m_Context);

    cuda::GfxResource* wrapper = new cuda::GfxResource(resource);

    // The interop object being set is the success indicator
    const bool registered = wrapper->graphics_resource != nullptr;
    if (!registered)
    {
        delete wrapper;
        wrapper = nullptr;
    }

    return wrapper;
}
// Creates a new event object.
// Returns null if no event handle exists after construction.
cmp::Event* CUDACompute::Event_New()
{
    cuda::EnsureContext(m_Context);

    cuda::Event* new_event = new cuda::Event();

    // The event handle being set is the success indicator
    if (new_event->event != nullptr)
        return new_event;

    delete new_event;
    return nullptr;
}
// Creates a new compute queue on the main context.
// Returns null if the queue has no stream after construction.
cmp::Queue* CUDACompute::Queue_New()
{
    cuda::EnsureContext(m_Context);

    cuda::Queue* new_queue = new cuda::Queue(m_Context);

    // The stream being set is the success indicator
    const bool has_stream = new_queue->Stream() != nullptr;
    if (!has_stream)
    {
        delete new_queue;
        new_queue = nullptr;
    }

    return new_queue;
}
// Removes the first entry for `program` from the loader's program list, if there is one.
// Removal is unstable so the order of the remaining programs is not preserved.
void CUDACompute::RemoveProgram(cuda::Program* program)
{
    // Advance to the first slot holding this pointer
    u32 index = 0;
    while (index < m_Programs.size() && m_Programs[index] != program)
        index++;

    // Nothing to do if the program was never recorded
    if (index < m_Programs.size())
        m_Programs.remove_unstable(index);
}
@@ -1,38 +0,0 @@
#include "CUDA.h"
//
// CUDA implementation of the cmp::iCompute interface. Owns the main CUDA context and keeps track of
// every loaded program so that it can be reloaded when its source file changes.
//
// Each public method ensures the main context is active on the calling thread before doing any work.
//
class clcpp_attr(reflect_part) CUDACompute : public cmp::iCompute
{
public:
// Initialises the driver API, selects a device and creates the main context
CUDACompute();
// Destroys the main context if one was created
~CUDACompute();
// Subsystem implementations
// Reload any loaded programs matching the changed files, appending progress text to response.
// Returns true if at least one program was reloaded.
bool FilesChanged(const core::Vector<file::Path>& filenames, core::String256& response);
// Interface implementations
// Load "<filename>.ptx" with the named kernels, returning null on failure
const cmp::HndProgram* Program_New(const file::Path& filename, const core::String32* kernel_names, u32 nb_kernels);
// Find a kernel by name within a program, returning null if there is no match
const cmp::HndKernel* Program_GetKernel(const cmp::HndProgram* program, const core::String32& name);
// Resource creation; each returns null when the underlying CUDA object could not be created
cmp::DeviceMem* DeviceMem_New(u32 size, cmp::Access access);
cmp::HostMem* HostMem_New(u32 size, cmp::Access access);
cmp::Texture3D* Texture3D_New(u32 width, u32 height, u32 depth, const Format& fmt, cmp::Access access);
cmp::GfxResource* GfxResource_New(void* resource);
cmp::Event* Event_New();
cmp::Queue* Queue_New();
// Remove a program from the loaded program list without destroying it
void RemoveProgram(cuda::Program* program);
private:
// NOTE(review): no definition is visible next to the other members in CUDACompute.cpp - confirm
// whether this declaration is still needed
void DequeueRaiseEvent();
// Currently selected device
int m_DeviceID;
CUdevice m_Device;
// Main context
CUcontext m_Context;
// List of loaded programs
core::Vector<cuda::Program*> m_Programs;
};
No newline at end of file
import os
import Utils
import Process
import BuildSystem
# Installation directories come from the environment; either is None when its variable is not set
InstallDir = os.environ.get("CUDA_PATH")
SampleDir = os.environ.get("NVCUDASAMPLES_ROOT")

# Toolkit paths are derived from the installation path and stay None without one
if InstallDir:
    IncludeDir = os.path.join(InstallDir, "include")
    x86LibDir = os.path.join(InstallDir, "lib/Win32")
    x64LibDir = os.path.join(InstallDir, "lib/x64")
    BinDir = os.path.join(InstallDir, "bin")
else:
    IncludeDir = None
    x86LibDir = None
    x64LibDir = None
    BinDir = None

# Shared sample headers are derived from the samples path in the same way
if SampleDir:
    SampleCommonIncludeDir = os.path.join(SampleDir, "common/inc")
else:
    SampleCommonIncludeDir = None
#
# Names of nVidia GPU Virtual Architectures for generating up to the PTX stage
#
# Each value is the literal name passed to the compiler through CUDACompileOptions.GPUArch
# (emitted as --gpu-architecture=<name>).
#
VirtualArch = Utils.enum(
compute_10 = 'compute_10',
compute_11 = 'compute_11',
compute_12 = 'compute_12',
compute_13 = 'compute_13',
compute_20 = 'compute_20',
compute_30 = 'compute_30',
compute_32 = 'compute_32',
compute_35 = 'compute_35',
compute_50 = 'compute_50',
)
#
# Names of nVidia GPU Real Architectures for generating final binary images
#
# Each value is the literal name passed to the compiler through CUDACompileOptions.GPUCode
# (emitted as --gpu-code=<name>).
#
RealArch = Utils.enum(
sm_10 = 'sm_10',
sm_11 = 'sm_11',
sm_12 = 'sm_12',
sm_13 = 'sm_13',
sm_20 = 'sm_20',
sm_21 = 'sm_21',
sm_30 = 'sm_30',
sm_32 = 'sm_32',
sm_35 = 'sm_35',
sm_50 = 'sm_50',
)
class CUDACompileOptions:

    """
    Collection of nvcc compiler settings that can be flattened into a command-line.

    Set the attributes as required and then call UpdateCommandLine to rebuild the CommandLine
    attribute, which is a list of nvcc arguments. CommandLine is empty until UpdateCommandLine
    has been called.
    """

    def __init__(self):

        # Set to 'c', 'c++' or 'cu' to explicitly set input language, rather than using extension
        self.Language = None

        # List of normal/system include search paths
        self.IncludePaths = [ ]
        self.SystemIncludePaths = [ ]

        # List of files to include first during preprocessing
        self.IncludeFiles = [ ]

        # List of macros to define/undefine for preprocessor
        self.DefineMacros = [ ]
        self.UndefineMacros = [ ]

        # List of library search paths
        self.LibraryPaths = [ ]

        # List of libraries to link with (specified without the library extension)
        self.Libraries = [ ]

        # Specify 32/64 bit machine target
        self.MachineBits = 32

        # Specify the path in which the compiler host EXE resides (e.g. MSVC, GCC)
        self.HostCompilerPath = None

        # Set to 'none', 'shared' or 'static' to specify runtime library type - default is 'static'
        self.CUDARuntime = None

        # Generate debug information for host/device code
        self.HostDebugLevel = None
        self.DeviceDebug = False

        # GPU architecture and GPUs to generate code for
        self.GPUArch = VirtualArch.compute_10
        self.GPUCode = RealArch.sm_10

        # Math operation behaviour
        self.FlushSingleDenormalsToZero = False
        self.PreciseSingleDivRecip = True
        self.PreciseSingleSqrt = True
        self.FuseMultipleAdds = True
        self.UseFastMath = False

        # Tool options
        self.DisableWarnings = False
        self.SourceInPTX = False
        self.RestrictPointers = False

        # Generated by UpdateCommandLine; defined up-front so it can be read before the first update
        self.CommandLine = [ ]

    def UpdateCommandLine(self):

        """Rebuild self.CommandLine from the current attribute values."""

        def on_off(value):
            return 'true' if value else 'false'

        cmdline = [ ]

        if self.Language: cmdline += [ '--x=' + self.Language ]

        cmdline += [ '--include-path=' + path for path in self.IncludePaths ]
        cmdline += [ '--system-include=' + path for path in self.SystemIncludePaths ]
        cmdline += [ '--pre-include=' + file for file in self.IncludeFiles ]
        cmdline += [ '--define-macro=' + macro for macro in self.DefineMacros ]
        cmdline += [ '--undefine-macro=' + macro for macro in self.UndefineMacros ]
        cmdline += [ '--library-path=' + lib for lib in self.LibraryPaths ]

        # The option name and library name need separating, like every other valued option here
        cmdline += [ '--library=' + lib for lib in self.Libraries ]

        cmdline += [ '--machine=' + str(self.MachineBits) ]

        if self.HostCompilerPath: cmdline += [ '--compiler-bindir=' + self.HostCompilerPath ]
        if self.CUDARuntime: cmdline += [ '--cudart=' + self.CUDARuntime ]

        # Compare against None explicitly as a debug level of zero must still be emitted
        if self.HostDebugLevel is not None: cmdline += [ '--debug=' + str(self.HostDebugLevel) ]
        if self.DeviceDebug: cmdline += [ '--device-debug' ]

        cmdline += [ '--gpu-architecture=' + self.GPUArch ]
        cmdline += [ '--gpu-code=' + self.GPUCode ]

        cmdline += [ '--ftz=' + on_off(self.FlushSingleDenormalsToZero) ]
        cmdline += [ '--prec-div=' + on_off(self.PreciseSingleDivRecip) ]
        cmdline += [ '--prec-sqrt=' + on_off(self.PreciseSingleSqrt) ]
        cmdline += [ '--fmad=' + on_off(self.FuseMultipleAdds) ]
        if self.UseFastMath: cmdline += [ '--use_fast_math' ]

        if self.DisableWarnings: cmdline += [ '--disable-warnings' ]
        if self.SourceInPTX: cmdline += [ '--source-in-ptx' ]
        if self.RestrictPointers: cmdline += [ '--restrict' ]

        self.CommandLine = cmdline
class BuildPTXNode (BuildSystem.Node):

    """
    Build node that compiles the first output file of its source node to a .ptx file using nvcc.

    Compiler arguments come from the CUDACompileOptions of the environment's current configuration.
    """

    def __init__(self, source):

        super().__init__()
        self.Source = source
        self.Dependencies = [ source ]

    def Build(self, env):

        """Run nvcc for this node, returning True if the compiler exits with code zero."""

        # Without CUDA_PATH there is no compiler location; fail the build rather than raise on the join
        if BinDir is None:
            print("ERROR: CUDA_PATH is not set; unable to locate nvcc.exe")
            return False

        # Build command-line from current configuration
        cmdline = [ os.path.join(BinDir, "nvcc.exe") ]
        cmdline += [ '--ptx' ]
        cmdline += env.CurrentConfig.CUDACompileOptions.CommandLine

        # Add the output .ptx file
        output_files = self.GetOutputFiles(env)
        cmdline += [ '--output-file=' + output_files[0] ]

        # Add input file before finishing
        cmdline += [ self.GetInputFile(env) ]
        Utils.ShowCmdLine(env, cmdline)

        # Launch the compiler and wait for it to finish
        process = Process.OpenPiped(cmdline)
        output = Process.WaitForPipeOutput(process)
        if not env.NoToolOutput:
            print(output)

        return process.returncode == 0

    def GetInputFile(self, env):

        """The file to compile is the first output of the source node."""

        return self.Source.GetOutputFiles(env)[0]

    def GetOutputFiles(self, env):

        """Return a single-entry list: the input filename with a .ptx extension in the config's output path."""

        # Get the filename minus path and extension
        # TODO: This only works if this node has another node as input that resides in
        # the same directory as it. Need to evaluate relative path inputs in long chains.
        input_file = self.GetInputFile(env)
        input_file = os.path.split(input_file)[1]
        input_file = os.path.splitext(input_file)[0]

        ptx_path = os.path.join(env.CurrentConfig.OutputPath, input_file + ".ptx")
        return [ ptx_path ]

    def GetTempOutputFiles(self, env):

        """The .ptx output is also reported as this node's temporary output."""

        return self.GetOutputFiles(env)
#
# Build script for the "Kernels" target: every .cu file in this directory is run through ComputeBridge
# for both CUDA and OpenCL, the CUDA output is compiled to PTX and the results are copied to the game
# data directory for loading at runtime.
#

SetOutputPaths(env, "Kernels")

debug_config = env.Configs["debug"]
release_config = env.Configs["release"]

# Need to add this as an include directory for the generated code to reference
current_dir = os.getcwd()

# Location of kernel data to be loaded at runtime
kernel_data_dir = project_dir + "pub/GameData/Kernels"

# Gather input files
input_files = Utils.Glob(".", "*.cu")
input_file_nodes = [ env.NewFile(cu_file) for cu_file in input_files ]

kernel_include_paths = [
    current_dir,
    project_dir + "src/CppClient",
    project_dir + "extern/ComputeBridge/cbpp/inc",
]

# ComputeBridge uses same options for debug/release
cb_options = ComputeBridgePlatform.Options()
cb_options.IncludePaths = kernel_include_paths
cb_options_map = { "debug": cb_options, "release": cb_options }

# Run ComputeBridge for both CUDA and OpenCL
cuda_cb_files = [ ComputeBridgePlatform.BuildNode(cu_file, "cuda", cb_options_map) for cu_file in input_file_nodes ]
opencl_cb_files = [ ComputeBridgePlatform.BuildNode(cu_file, "opencl", cb_options_map) for cu_file in input_file_nodes ]

# Setup CUDA compile options (release shares the same options object as debug)
debug_config.CUDACompileOptions = CUDAPlatform.CUDACompileOptions()
debug_config.CUDACompileOptions.Language = 'cu'
debug_config.CUDACompileOptions.GPUArch = CUDAPlatform.VirtualArch.compute_20
debug_config.CUDACompileOptions.GPUCode = CUDAPlatform.RealArch.sm_20
debug_config.CUDACompileOptions.HostCompilerPath = os.path.join(MSVCPlatform.VCInstallDir, "bin")
debug_config.CUDACompileOptions.UpdateCommandLine()
release_config.CUDACompileOptions = debug_config.CUDACompileOptions

# Build CUDA PTX files
cuda_ptx_files = [ CUDAPlatform.BuildPTXNode(cb_file) for cb_file in cuda_cb_files ]

# Setup OpenCL compile options (it appears the OpenCL compiler already includes cwd)
debug_config.OpenCLCompileOptions = OpenCLPlatform.OpenCLCompileOptions()
debug_config.OpenCLCompileOptions.UpdateCommandLine()
release_config.OpenCLCompileOptions = debug_config.OpenCLCompileOptions

# OpenCL files are compiled on load so just run the precompiler
opencl_out_files = [ OpenCLPlatform.BuildOpenCLNode(cb_file) for cb_file in opencl_cb_files ]

# Copy CUDA output files for load and OpenCL ComputeBridge output for runtime compile
# NOTE(review): the second CopyOutputFile argument looks like an index into the node's output files,
# with output 1 of the CUDA ComputeBridge node presumably being the .ckt texture reflection - confirm
copied_files = [ env.CopyOutputFile(ptx_file, 0, kernel_data_dir) for ptx_file in cuda_ptx_files ]
copied_files += [ env.CopyOutputFile(ckt_file, 1, kernel_data_dir) for ckt_file in cuda_cb_files ]
copied_files += [ env.CopyOutputFile(cl_file, 0, kernel_data_dir) for cl_file in opencl_cb_files ]

env.Build(cuda_ptx_files + opencl_out_files + copied_files, "Kernels")

# Raw string keeps the backslashes literal without relying on unrecognised escape sequences
VCGenerateProjectFile(env, "Kernels", input_files + [ "Kernels.pibfile" ], None, targets="Kernels", pibfile = r"..\..\..\pibfile")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment