Last active
May 11, 2026 21:03
-
-
Save dwilliamson/f9cbb20e3dca9b3b0f45c3007bdfc088 to your computer and use it in GitHub Desktop.
Star Engine CUDA Implementation from 2015
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // | |
| // TODO: Expose Desc() as public member? | |
| // After that's done, why does the Desc struct exist at all? | |
| // Use const/const_cast trick to make the members read-only? | |
| // | |
| #pragma once | |
| #include <Core/Core.h> | |
| #include <Core/PixelFormat.h> | |
| struct Format; | |
| namespace mpp | |
| { | |
| class Event; | |
| } | |
| clcpp_reflect_part(cmp) | |
| namespace cmp | |
| { | |
| // ------------------------------------------------------------------------------------------------------- // | |
| // Handles // | |
| // ------------------------------------------------------------------------------------------------------- // | |
| struct clcpp_attr(reflect) HndProgram : public clutl::Object | |
| { | |
| }; | |
| struct clcpp_attr(reflect) HndKernel : public clutl::Object | |
| { | |
| }; | |
| // ------------------------------------------------------------------------------------------------------- // | |
| // Kernel Argument API // | |
| // ------------------------------------------------------------------------------------------------------- // | |
| class KernelArgs; | |
| struct clcpp_attr(reflect_part) KernelArg | |
| { | |
| KernelArg(); | |
| // Return a copy of the argument | |
| template <typename TYPE> | |
| TYPE Get() const | |
| { | |
| return *(TYPE*)Data(); | |
| } | |
| // Set the argument, copying the value | |
| template <typename TYPE> | |
| void Set(TYPE value) | |
| { | |
| *(TYPE*)Data() = value; | |
| } | |
| void* Data() const; | |
| // Reflected data type | |
| const clcpp::Type* data_type; | |
| // Size of the data, not including sizeof(KernelArg) | |
| u32 data_size; | |
| // Argument index | |
| u32 index; | |
| // Byte offset to the data | |
| u32 offset; | |
| // Pointer to the argument list that owns this argument | |
| KernelArgs* args_parent; | |
| }; | |
| // | |
| // This class allows queueing of kernel arguments so that they can be applied to a kernel at the point of | |
| // execution. This allows the Compute API to guarantee safe access to a kernel without it being reloaded | |
| // during use. The type helpers are just an added convenience and not the point of this class. | |
| // | |
| class clcpp_attr(reflect) KernelArgs | |
| { | |
| public: | |
| KernelArgs(); | |
| template <typename TYPE> | |
| void Push(TYPE value) | |
| { | |
| Push(&value, clcpp::GetType< CORE_STRIP_CONST_POINTER(TYPE) >(), sizeof(value)); | |
| } | |
| private: | |
| friend struct KernelArg; | |
| friend class KernelArgIterator; | |
| void Push(void* data, const clcpp::Type* type, u32 data_size); | |
| // Store data in the container memory space for cheap instantiation | |
| static const u32 MAX_DATA_SIZE = 1024; | |
| u8 m_Data[MAX_DATA_SIZE]; | |
| u32 m_Position; | |
| u32 m_NbArguments; | |
| }; | |
| class KernelArgIterator | |
| { | |
| public: | |
| KernelArgIterator(const KernelArgs& args); | |
| bool GetNext(KernelArg& arg); | |
| private: | |
| const KernelArgs& m_KernelArgs; | |
| u32 m_Position; | |
| }; | |
| // ------------------------------------------------------------------------------------------------------- // | |
| // Interfaces // | |
| // ------------------------------------------------------------------------------------------------------- // | |
| // | |
| // Specifies read/write access to device memory for kernels | |
| // | |
| enum clcpp_attr(reflect) Access | |
| { | |
| Access_Read, | |
| Access_Write, | |
| Access_ReadWrite, | |
| }; | |
| class HostMem; | |
| class DeviceMem; | |
| class Texture3D; | |
| struct GfxResource; | |
| struct Event; | |
| // ------------------------------------------------------------------------------------------------------- // | |
| // Concurrent Work Queue Interface // | |
| // ------------------------------------------------------------------------------------------------------- // | |
// Sentinel for the block_size parameter of Queue::RunKernel, requesting the largest block
// size the implementation chooses to use rather than an explicit size.
static const int MAX_KERNEL_BLOCK_SIZE = 1;
| // ARC-NOTE: No clcpp_impl_class as there are pure virtuals (and we don't WANT it to be creatable) | |
| struct clcpp_attr(reflect) Queue : public clutl::Object2 | |
| { | |
| virtual ~Queue() { } | |
| virtual bool RunKernel(const HndKernel* h_kernel, const KernelArgs& args, u32 nb_items, int block_size) = 0; | |
| virtual bool CopyHostToDevice(const void* src_data, u32 src_size, DeviceMem* device_mem) = 0; | |
| virtual bool CopyDeviceToHost(DeviceMem* device_mem, void* dst_data, u32 dst_size) = 0; | |
| virtual bool CopyHostToTexture3D(const void* src_data, u32 src_pitch, u32 src_height, u32 src_depth, Texture3D* h_texture) = 0; | |
| virtual bool CopyTexture3DToHost(Texture3D* h_texture, void* dst_data, u32 dst_pitch, u32 dst_height, u32 dst_depth) = 0; | |
| virtual bool CopyDeviceToGfxTexture3D(DeviceMem* src_device, GfxResource* dst_gfxres, u32 pitch, u32 height, u32 depth) = 0; | |
| virtual bool CopyDeviceToGfxBuffer(DeviceMem* src_device, GfxResource* dst_gfxres, u32 size) = 0; | |
| virtual bool Sync() = 0; | |
| virtual bool MarkEvent(Event* event) = 0; | |
| virtual bool WaitEvent(Event* event) = 0; | |
| virtual void* Resource() = 0; | |
| // Helpers to use descriptions embedded in memory objects to simplify the call | |
| bool CopyMappedHostToDevice(HostMem* host_mem, DeviceMem* device_mem); | |
| bool CopyDeviceToMappedHost(DeviceMem* device_mem, HostMem* host_mem); | |
| bool CopyMappedHostToTexture3D(HostMem* host_mem, Texture3D* texture); | |
| bool CopyTexture3DToMappedHost(Texture3D* texture, HostMem* host_mem); | |
| }; | |
| // ------------------------------------------------------------------------------------------------------- // | |
| // Host Memory Interface // | |
| // ------------------------------------------------------------------------------------------------------- // | |
| struct clcpp_attr(reflect) HostMemDesc | |
| { | |
| HostMemDesc(); | |
| u32 size; | |
| void* mapped_mem; | |
| }; | |
| // | |
| // Pinned/page-locked host memory allocated to be the source or destination of transfers to or | |
| // from the device. | |
| // | |
| class clcpp_attr(reflect) HostMem : public clutl::Object2 | |
| { | |
| public: | |
| virtual ~HostMem() { } | |
| virtual void* MapNow(Queue* queue) = 0; | |
| virtual void* Map(Queue* queue) = 0; | |
| virtual void Unmap(Queue* queue) = 0; | |
| const HostMemDesc& Desc() const { return m_Desc; } | |
| protected: | |
| HostMemDesc m_Desc; | |
| }; | |
| // ------------------------------------------------------------------------------------------------------- // | |
| // Device Memory Interface // | |
| // ------------------------------------------------------------------------------------------------------- // | |
| struct clcpp_attr(reflect) DeviceMemDesc | |
| { | |
| DeviceMemDesc(); | |
| u32 size; | |
| }; | |
| // | |
| // A chunk of memory allocated on the device that can't be directly access by the host without | |
| // copying. | |
| // | |
| class clcpp_attr(reflect) DeviceMem : public clutl::Object2 | |
| { | |
| public: | |
| virtual ~DeviceMem() { } | |
| const DeviceMemDesc& Desc() const { return m_Desc; } | |
| protected: | |
| DeviceMemDesc m_Desc; | |
| }; | |
| // ------------------------------------------------------------------------------------------------------- // | |
| // Texture Interface // | |
| // ------------------------------------------------------------------------------------------------------- // | |
| // TODO: There is a need for some kind of "sampler object" to describe filtering, etc. | |
| struct clcpp_attr(reflect) TextureDesc | |
| { | |
| TextureDesc(); | |
| u32 width; | |
| u32 height; | |
| u32 depth; | |
| u32 pitch; | |
| u32 size; | |
| Format format; | |
| }; | |
| // | |
| // A 3D texture allocated on the device to be sampled in kernels. | |
| // | |
| class clcpp_attr(reflect) Texture3D : public clutl::Object2 | |
| { | |
| public: | |
| virtual ~Texture3D() { } | |
| const TextureDesc& Desc() const { return m_Desc; } | |
| protected: | |
| TextureDesc m_Desc; | |
| }; | |
| struct clcpp_attr(reflect) GfxResource : public clutl::Object2 | |
| { | |
| virtual ~GfxResource() { } | |
| }; | |
| // ------------------------------------------------------------------------------------------------------- // | |
| // Event Interface // | |
| // ------------------------------------------------------------------------------------------------------- // | |
| struct clcpp_attr(reflect_part) Event : public clutl::Object2 | |
| { | |
| virtual ~Event() { }; | |
| }; | |
| // ------------------------------------------------------------------------------------------------------- // | |
| // Main Compute Module Interface // | |
| // ------------------------------------------------------------------------------------------------------- // | |
| struct clcpp_attr(reflect_part) iCompute : public core::iSubsystem | |
| { | |
| virtual const HndProgram* Program_New(const file::Path& filename, const core::String32* kernel_names, u32 nb_kernels) = 0; | |
| virtual const HndKernel* Program_GetKernel(const HndProgram* program, const core::String32& name) = 0; | |
| virtual DeviceMem* DeviceMem_New(u32 size, Access access) = 0; | |
| virtual HostMem* HostMem_New(u32 size, Access access) = 0; | |
| // If read/write is specified then CUDA creates a surface reference | |
| // Texture reflection can tell whether a surface needs to be bound or now | |
| virtual Texture3D* Texture3D_New(u32 width, u32 height, u32 depth, const Format& fmt, Access access) = 0; | |
| virtual GfxResource* GfxResource_New(void* resource) = 0; | |
| virtual Event* Event_New() = 0; | |
| virtual Queue* Queue_New() = 0; | |
| }; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| @@ -1,785 +0,0 @@ | |
| #include "CUDA.h" | |
| #include "CUDACompute.h" | |
| #include <Core/File.h> | |
| clcpp_impl_destruct(cuda::Queue); | |
| clcpp_impl_class(cuda::Kernel); | |
| clcpp_impl_class(cuda::Program); | |
| clcpp_impl_destruct(cuda::DeviceMemory); | |
| clcpp_impl_destruct(cuda::HostMemory); | |
| clcpp_impl_destruct(cuda::Texture3D); | |
| clcpp_impl_destruct(cuda::GfxResource); | |
| clcpp_impl_destruct(cuda::Event); | |
| namespace | |
| { | |
| // Format x BitCount matrix, mapping Format to a CUarray_format | |
| CUarray_format g_NullArrayFormat = (CUarray_format)0; | |
| CUarray_format g_ArrayFormatMap[FmtType_Count][3]; | |
| u32 g_TextureFlags[FmtView_Count]; | |
| int MapSMToCores(int major, int minor) | |
| { | |
| struct GpuArchCoresPerSM_t | |
| { | |
| int SM; // 0xMm (hexadecimal notation), M = SM Major version, m = SM minor version | |
| int Cores; | |
| } GpuArchCoresPerSM[] = | |
| { | |
| { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class | |
| { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class | |
| { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class | |
| { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class | |
| { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class | |
| { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class | |
| { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class | |
| { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class | |
| }; | |
| // Search for matching version | |
| int nb_arch_cores = sizeof(GpuArchCoresPerSM) / sizeof(GpuArchCoresPerSM[0]); | |
| for (int i = 0; i < nb_arch_cores; i++) | |
| { | |
| if (GpuArchCoresPerSM[i].SM == ((major << 4) + minor)) | |
| return GpuArchCoresPerSM[i].Cores; | |
| } | |
| // If we don't find the values, we use the previous one to run property | |
| const GpuArchCoresPerSM_t& last_arch = GpuArchCoresPerSM[nb_arch_cores - 1]; | |
| core::LogText("MapSMToCores for SM %d.%d is undefined. Default to use %d Cores/SM", major, minor, last_arch.Cores); | |
| return last_arch.Cores; | |
| } | |
| } | |
| bool cuda::HandleError(CUresult result, const char* expression, const char* file, int line) | |
| { | |
| if (result) | |
| { | |
| // Lookup error strings | |
| const char* error_name = nullptr; | |
| const char* error_desc = nullptr; | |
| if (cuGetErrorName(result, &error_name) != CUDA_SUCCESS) | |
| error_name = "<Unknown>"; | |
| if (cuGetErrorString(result, &error_desc) != CUDA_SUCCESS) | |
| error_desc = "<Unknown>"; | |
| core::LogText("CUDA: Error at %s:%d (%s) code=%d(%s: %s)", file, line, expression, result, error_name, error_desc); | |
| // TODO: device reset/exit? | |
| return true; | |
| } | |
| return false; | |
| } | |
| int cuda::GetMaxGflopsDeviceId() | |
| { | |
| // Get device count | |
| int device_count = 0; | |
| cudaGetDeviceCount(&device_count); | |
| // Find the best major SM architecture GPU device | |
| int best_sm_arch = 0; | |
| for (int i = 0; i < device_count; i++) | |
| { | |
| cudaDeviceProp device_prop; | |
| cudaGetDeviceProperties(&device_prop, i); | |
| // If this GPU is not running on Compute Mode prohibited then we can add it to the list | |
| if (device_prop.computeMode != cudaComputeModeProhibited) | |
| { | |
| if (device_prop.major > 0 && device_prop.major < 9999) | |
| best_sm_arch = max(best_sm_arch, device_prop.major); | |
| } | |
| } | |
| // Find the best CUDA capable GPU device | |
| int sm_per_multiproc = 0; | |
| int max_compute_perf = 0; | |
| int max_perf_device = 0; | |
| for (int i = 0; i < device_count; i++) | |
| { | |
| cudaDeviceProp device_prop; | |
| cudaGetDeviceProperties(&device_prop, i); | |
| // If this GPU is not running on Compute Mode prohibited then we can add it to the list | |
| if (device_prop.major == 9999 && device_prop.minor == 9999) | |
| sm_per_multiproc = 1; | |
| else | |
| sm_per_multiproc = MapSMToCores(device_prop.major, device_prop.minor); | |
| int compute_perf = device_prop.multiProcessorCount * sm_per_multiproc * device_prop.clockRate; | |
| if (compute_perf > max_compute_perf) | |
| { | |
| // If we find GPU with SM major >2, search only these | |
| if (best_sm_arch > 2) | |
| { | |
| if (device_prop.major == best_sm_arch) | |
| { | |
| max_compute_perf = compute_perf; | |
| max_perf_device = i; | |
| } | |
| } | |
| else | |
| { | |
| max_compute_perf = compute_perf; | |
| max_perf_device = i; | |
| } | |
| } | |
| } | |
| return max_perf_device; | |
| } | |
| void cuda::EnsureContext(CUcontext context) | |
| { | |
| CUcontext current; | |
| cuCtxGetCurrent(¤t); | |
| if (current != context) | |
| cuCtxSetCurrent(context); | |
| } | |
| cuda::Queue::Queue(CUcontext context) | |
| : m_Context(context) | |
| , m_Stream(nullptr) | |
| { | |
| SetObjectType(this); | |
| if (cudaHasError(cuStreamCreate(&m_Stream, CU_STREAM_NON_BLOCKING))) | |
| m_Stream = nullptr; | |
| } | |
| cuda::Queue::~Queue() | |
| { | |
| if (m_Stream != nullptr) | |
| cuStreamDestroy(m_Stream); | |
| } | |
| bool cuda::Queue::RunKernel(const cmp::HndKernel* h_kernel, const cmp::KernelArgs& args, u32 nb_items, int block_size) | |
| { | |
| static clcpp::uint32 HASH_DeviceMemory= clcpp::GetTypeNameHash<cmp::DeviceMem>(); | |
| static clcpp::uint32 HASH_HndTexture3D = clcpp::GetTypeNameHash<cmp::Texture3D>(); | |
| cuda::EnsureContext(m_Context); | |
| // Cast to local implementations | |
| core::Assert(h_kernel != nullptr); | |
| const cuda::Kernel* kernel = h_kernel->Cast<cuda::Kernel>(); | |
| // Lock the parent program so that it can't be reloaded while in use | |
| // TODO: This interferes with multi-thread launching of different kernels in the same program | |
| mpp::MutexLock lock(kernel->program->mutex); | |
| // Build a list of pointers to each argument | |
| static const int MAX_NB_ARGS = 10; | |
| void* arg_ptrs[MAX_NB_ARGS] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; | |
| cmp::KernelArg arg; | |
| cmp::KernelArgIterator i(args); | |
| u32 arg_index = 0; | |
| u32 tex_arg_index = 0; | |
| while (i.GetNext(arg)) | |
| { | |
| core::Assert(arg.index < MAX_NB_ARGS); | |
| clcpp::uint32 type_hash = arg.data_type->name.hash; | |
| if (type_hash == HASH_DeviceMemory) | |
| { | |
| cuda::DeviceMemory* dev_mem = arg.Get<cuda::DeviceMemory*>(); | |
| arg_ptrs[arg_index++] = &dev_mem->ptr; | |
| } | |
| else if (type_hash == HASH_HndTexture3D) | |
| { | |
| cuda::Texture3D* texture = arg.Get<cuda::Texture3D*>(); | |
| // Get the next texture parameter | |
| const cuda::KernelTextureParam& tex_param = kernel->texture_params[tex_arg_index++]; | |
| if (tex_param.ref_type == 't') | |
| { | |
| // Ensure dimensions/read mode match the texture | |
| if (tex_param.read_type == 'u') | |
| core::Assert((texture->flags & CU_TRSF_READ_AS_INTEGER) != 0); | |
| else | |
| core::Assert((texture->flags & CU_TRSF_READ_AS_INTEGER) == 0); | |
| // Override texture reference settings with those in the texture | |
| cuTexRefSetFilterMode(tex_param.tex_ref, texture->filter_mode); | |
| cuTexRefSetAddressMode(tex_param.tex_ref, 0, texture->address_mode); | |
| cuTexRefSetAddressMode(tex_param.tex_ref, 1, texture->address_mode); | |
| cuTexRefSetAddressMode(tex_param.tex_ref, 2, texture->address_mode); | |
| cuTexRefSetFlags(tex_param.tex_ref, texture->flags); | |
| // Bind the texture's array data to the reference | |
| core::Assert(tex_param.tex_ref != nullptr); | |
| cuTexRefSetArray(tex_param.tex_ref, texture->array, CU_TRSA_OVERRIDE_FORMAT); | |
| } | |
| else if (tex_param.ref_type == 's') | |
| { | |
| // Bind the texture's array data to the surface reference | |
| core::Assert(tex_param.surf_ref != nullptr); | |
| cuSurfRefSetArray(tex_param.surf_ref, texture->array, 0); | |
| } | |
| } | |
| else | |
| { | |
| arg_ptrs[arg_index++] = arg.Data(); | |
| } | |
| } | |
| // Assign max work items if necessary | |
| // TODO: Set correct block size for target hardware! | |
| if (block_size == cmp::MAX_KERNEL_BLOCK_SIZE) | |
| block_size = 512; | |
| block_size = min(block_size, (int)nb_items); | |
| // Launch the kernel with max work items | |
| // TODO: Set correct block size for target hardware! | |
| const int nb_blocks = (nb_items + block_size - 1) / block_size; | |
| bool has_error = cudaHasError(cuLaunchKernel(kernel->function, nb_blocks, 1, 1, block_size, 1, 1, 0, m_Stream, arg_ptrs, NULL)); | |
| return !has_error; | |
| } | |
| bool cuda::Queue::CopyHostToDevice(const void* src_data, u32 src_size, cmp::DeviceMem* h_dev_mem) | |
| { | |
| cuda::EnsureContext(m_Context); | |
| core::Assert(h_dev_mem != nullptr); | |
| core::Assert(src_data != nullptr); | |
| // Cast to local implementations | |
| const cuda::DeviceMemory* dev_mem = (cuda::DeviceMemory*)h_dev_mem; | |
| core::Assert(src_size <= dev_mem->Desc().size); | |
| // Place write in the stream | |
| return !cudaHasError(cuMemcpyHtoDAsync(dev_mem->ptr, src_data, src_size, m_Stream)); | |
| } | |
| bool cuda::Queue::CopyDeviceToHost(cmp::DeviceMem *h_dev_mem, void *dst_data, u32 dst_size) | |
| { | |
| cuda::EnsureContext(m_Context); | |
| core::Assert(dst_data != nullptr); | |
| core::Assert(h_dev_mem != nullptr); | |
| // Cast to local implementations | |
| const cuda::DeviceMemory* dev_mem = (cuda::DeviceMemory*)h_dev_mem; | |
| core::Assert(dst_size <= dev_mem->Desc().size); | |
| // Place read in the stream | |
| return !cudaHasError(cuMemcpyDtoHAsync(dst_data, dev_mem->ptr, dst_size, m_Stream)); | |
| } | |
| bool cuda::Queue::CopyHostToTexture3D(const void* src_data, u32 src_pitch, u32 src_height, u32 src_depth, cmp::Texture3D* h_texture) | |
| { | |
| cuda::EnsureContext(m_Context); | |
| core::Assert(h_texture != nullptr); | |
| core::Assert(src_data != nullptr); | |
| // Cast to local implementations | |
| const cuda::Texture3D* texture = (cuda::Texture3D*)h_texture; | |
| const cmp::TextureDesc& desc = texture->Desc(); | |
| core::Assert(src_pitch <= desc.pitch); | |
| core::Assert(src_height <= desc.height); | |
| core::Assert(src_depth <= desc.depth); | |
| core::Assert(texture->array != nullptr); | |
| // Describe the host to array copy | |
| CUDA_MEMCPY3D copy; | |
| memset(©, 0, sizeof(copy)); | |
| copy.Depth = src_depth; | |
| copy.Height = src_height; | |
| copy.WidthInBytes = src_pitch; | |
| copy.srcHost = src_data; | |
| copy.srcHeight = src_height; | |
| copy.srcMemoryType = CU_MEMORYTYPE_HOST; | |
| copy.srcPitch = src_pitch; | |
| copy.dstArray = texture->array; | |
| copy.dstMemoryType = CU_MEMORYTYPE_ARRAY; | |
| // Place the copy in the stream | |
| return !cudaHasError(cuMemcpy3DAsync(©, m_Stream)); | |
| } | |
| bool cuda::Queue::CopyTexture3DToHost(cmp::Texture3D* h_texture, void* dst_data, u32 dst_pitch, u32 dst_height, u32 dst_depth) | |
| { | |
| cuda::EnsureContext(m_Context); | |
| core::Assert(h_texture != nullptr); | |
| core::Assert(dst_data != nullptr); | |
| // Cast to local implementations | |
| const cuda::Texture3D* texture = (cuda::Texture3D*)h_texture; | |
| const cmp::TextureDesc& desc = texture->Desc(); | |
| core::Assert(dst_pitch <= desc.pitch); | |
| core::Assert(dst_height <= desc.height); | |
| core::Assert(dst_depth <= desc.depth); | |
| core::Assert(texture->array != nullptr); | |
| // Describe the host to array copy | |
| CUDA_MEMCPY3D copy; | |
| memset(©, 0, sizeof(copy)); | |
| copy.Depth = dst_depth; | |
| copy.Height = dst_height; | |
| copy.WidthInBytes = dst_pitch; | |
| copy.dstHost = dst_data; | |
| copy.dstHeight = dst_height; | |
| copy.dstMemoryType = CU_MEMORYTYPE_HOST; | |
| copy.dstPitch = dst_pitch; | |
| copy.srcArray = texture->array; | |
| copy.srcMemoryType = CU_MEMORYTYPE_ARRAY; | |
| // Place the copy in the stream | |
| return !cudaHasError(cuMemcpy3DAsync(©, m_Stream)); | |
| } | |
| bool cuda::Queue::CopyDeviceToGfxTexture3D(cmp::DeviceMem* cmp_src_device, cmp::GfxResource* cmp_dst_gfxres, u32 pitch, u32 height, u32 depth) | |
| { | |
| rmt_ScopedCPUSample(CopyDeviceToGfxTexture3D); | |
| core::Assert(cmp_src_device != nullptr); | |
| core::Assert(cmp_dst_gfxres != nullptr); | |
| cuda::DeviceMemory* devmem_source = (cuda::DeviceMemory*)cmp_src_device; | |
| cuda::GfxResource* gfxres_dest = (cuda::GfxResource*)cmp_dst_gfxres; | |
| // Map the graphics resource | |
| core::Assert(gfxres_dest->graphics_resource != nullptr); | |
| if (cudaHasError(cuGraphicsMapResources(1, &gfxres_dest->graphics_resource, m_Stream))) | |
| return false; | |
| // Get an array for CUDA access to the graphics resource | |
| CUarray array; | |
| if (cudaHasError(cuGraphicsSubResourceGetMappedArray(&array, gfxres_dest->graphics_resource, 0, 0))) | |
| { | |
| cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream); | |
| return false; | |
| } | |
| // Describe the device to array copy | |
| CUDA_MEMCPY3D copy; | |
| memset(©, 0, sizeof(copy)); | |
| copy.Depth = depth; | |
| copy.Height = height; | |
| copy.WidthInBytes = pitch; | |
| copy.srcDevice = devmem_source->ptr; | |
| copy.srcHeight = height; | |
| copy.srcMemoryType = CU_MEMORYTYPE_DEVICE; | |
| copy.srcPitch = pitch; | |
| copy.dstArray = array; | |
| copy.dstMemoryType = CU_MEMORYTYPE_ARRAY; | |
| if (cudaHasError(cuMemcpy3DAsync(©, m_Stream))) | |
| { | |
| cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream); | |
| return false; | |
| } | |
| // Clean up with an unmap | |
| return !cudaHasError(cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream)); | |
| } | |
| bool cuda::Queue::CopyDeviceToGfxBuffer(cmp::DeviceMem* cmp_src_device, cmp::GfxResource* cmp_dst_gfxres, u32 size) | |
| { | |
| rmt_ScopedCPUSample(CopyDeviceToGfxBuffer); | |
| core::Assert(cmp_src_device != nullptr); | |
| core::Assert(cmp_dst_gfxres != nullptr); | |
| cuda::DeviceMemory* devmem_source = (cuda::DeviceMemory*)cmp_src_device; | |
| cuda::GfxResource* gfxres_dest = (cuda::GfxResource*)cmp_dst_gfxres; | |
| // Map the graphics resource | |
| core::Assert(gfxres_dest->graphics_resource != nullptr); | |
| if (cudaHasError(cuGraphicsMapResources(1, &gfxres_dest->graphics_resource, m_Stream))) | |
| return false; | |
| // Get a device pointer for CUDA access to the graphics resource | |
| CUdeviceptr device_ptr; | |
| size_t map_size; | |
| if (cudaHasError(cuGraphicsResourceGetMappedPointer(&device_ptr, &map_size, gfxres_dest->graphics_resource))) | |
| { | |
| cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream); | |
| return false; | |
| } | |
| // Device-device copy | |
| if (cudaHasError(cuMemcpyDtoDAsync(device_ptr, devmem_source->ptr, size, m_Stream))) | |
| { | |
| cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream); | |
| return false; | |
| } | |
| // Clean up with an unmap | |
| return !cudaHasError(cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream)); | |
| } | |
| bool cuda::Queue::Sync() | |
| { | |
| rmt_ScopedCPUSample(cudaQueueSync); | |
| rmt_ScopedCUDASample(cudaQueueSync, m_Stream); | |
| cuda::EnsureContext(m_Context); | |
| return !cudaHasError(cuStreamSynchronize(m_Stream)); | |
| } | |
| bool cuda::Queue::MarkEvent(cmp::Event* cmp_event) | |
| { | |
| core::Assert(cmp_event != nullptr); | |
| Event* event = (Event*)cmp_event; | |
| core::Assert(m_Stream != nullptr); | |
| return !cudaHasError(cuEventRecord(event->event, m_Stream)); | |
| } | |
| bool cuda::Queue::WaitEvent(cmp::Event* cmp_event) | |
| { | |
| core::Assert(cmp_event != nullptr); | |
| Event* event = (Event*)cmp_event; | |
| core::Assert(m_Stream != nullptr); | |
| return !cudaHasError(cuStreamWaitEvent(m_Stream, event->event, 0)); | |
| } | |
| void* cuda::Queue::Resource() | |
| { | |
| return m_Stream; | |
| } | |
| CUstream cuda::Queue::Stream() | |
| { | |
| return m_Stream; | |
| } | |
| cuda::Kernel::Kernel() | |
| : program(nullptr) | |
| , function(nullptr) | |
| { | |
| } | |
| cuda::Program::Program() | |
| : module(nullptr) | |
| , id(0) | |
| , loader(nullptr) | |
| { | |
| } | |
| cuda::Program::~Program() | |
| { | |
| // Delete all kernels this program owns | |
| for (u32 i = 0; i < kernels.size(); i++) | |
| Delete(kernels[i]); | |
| // Remove from the loader | |
| if (loader != nullptr) | |
| loader->RemoveProgram(this); | |
| // Release the module resource | |
| if (module != nullptr) | |
| cuModuleUnload(module); | |
| } | |
| cuda::DeviceMemory::DeviceMemory(u32 size) | |
| : ptr(nullptr) | |
| { | |
| SetObjectType(this); | |
| if (cudaHasError(cuMemAlloc(&ptr, size))) | |
| return; | |
| m_Desc.size = size; | |
| } | |
| cuda::DeviceMemory::~DeviceMemory() | |
| { | |
| if (ptr != nullptr) | |
| cuMemFree(ptr); | |
| } | |
| cuda::HostMemory::HostMemory(u32 size) | |
| { | |
| SetObjectType(this); | |
| if (cudaHasError(cuMemAllocHost(&m_Desc.mapped_mem, size))) | |
| return; | |
| m_Desc.size = size; | |
| } | |
| cuda::HostMemory::~HostMemory() | |
| { | |
| if (m_Desc.mapped_mem != nullptr) | |
| cuMemFreeHost(m_Desc.mapped_mem); | |
| } | |
| void* cuda::HostMemory::MapNow(cmp::Queue*) | |
| { | |
| // As the CUDA Driver tracks the virtual memory ranges of the allocated memory, there's no | |
| // need for an explicit Map call to get access to the memory. | |
| return m_Desc.mapped_mem; | |
| } | |
| void* cuda::HostMemory::Map(cmp::Queue*) | |
| { | |
| // As the CUDA Driver tracks the virtual memory ranges of the allocated memory, there's no | |
| // need for an explicit Map call to get access to the memory. | |
| return m_Desc.mapped_mem; | |
| } | |
| void cuda::HostMemory::Unmap(cmp::Queue*) | |
| { | |
| // Nothing to do, see Map/Now comments | |
| } | |
| cuda::Texture3D::Texture3D(u32 width, u32 height, u32 depth, const Format& format, cmp::Access access) | |
| : array(nullptr) | |
| , address_mode(CU_TR_ADDRESS_MODE_CLAMP) | |
| , filter_mode(CU_TR_FILTER_MODE_POINT) | |
| , flags(0) | |
| { | |
| SetObjectType(this); | |
| const FormatDesc& fmt_desc = FormatDesc_Get(format.fmt); | |
| // Ensure this is a format that has an equal bit-size for each channel | |
| if (fmt_desc.g_type != FmtType_None) | |
| core::Assert(fmt_desc.r_nb_bits == fmt_desc.g_nb_bits); | |
| if (fmt_desc.b_type != FmtType_None) | |
| core::Assert(fmt_desc.r_nb_bits == fmt_desc.b_nb_bits); | |
| if (fmt_desc.a_type != FmtType_None) | |
| core::Assert(fmt_desc.r_nb_bits == fmt_desc.a_nb_bits); | |
| // Transform bits counts 8,16,32 into the indices 0,1,2 while checking for unsupported bit counts | |
| u32 array_format_index = core::LogBase2(fmt_desc.r_nb_bits) - 3; | |
| core::Assert(array_format_index < 3); | |
| core::Assert((1 << (array_format_index + 3)) == fmt_desc.r_nb_bits); | |
| // Lookup the equivalent array format | |
| CUarray_format array_format = g_ArrayFormatMap[fmt_desc.r_type][array_format_index]; | |
| core::Assert(array_format != g_NullArrayFormat); | |
| // https://devtalk.nvidia.com/default/topic/690069/?comment=4124250 | |
| // Create the memory for the texture | |
| CUDA_ARRAY3D_DESCRIPTOR array_desc; | |
| memset(&array_desc, 0, sizeof(array_desc)); | |
| array_desc.Width = width; | |
| array_desc.Height = height; | |
| array_desc.Depth = depth; | |
| array_desc.Format = array_format; | |
| array_desc.NumChannels = fmt_desc.nb_bits / fmt_desc.r_nb_bits; | |
| array_desc.Flags = (access == cmp::Access_Write || access == cmp::Access_ReadWrite) ? CUDA_ARRAY3D_SURFACE_LDST : 0; | |
| if (cudaHasError(cuArray3DCreate(&array, &array_desc))) | |
| return; | |
| // Set view flags | |
| address_mode = CU_TR_ADDRESS_MODE_CLAMP; | |
| filter_mode = CU_TR_FILTER_MODE_POINT; | |
| flags = g_TextureFlags[format.view]; // CU_TRSF_NORMALIZED_COORDINATES ? | |
| // Set description | |
| m_Desc.width = width; | |
| m_Desc.height = height; | |
| m_Desc.depth = depth; | |
| m_Desc.pitch = width * fmt_desc.NbBytes(); | |
| m_Desc.size = m_Desc.pitch * height * depth; | |
| } | |
| cuda::Texture3D::~Texture3D() | |
| { | |
| if (array != nullptr) | |
| cuArrayDestroy(array); | |
| } | |
| cuda::GfxResource::GfxResource(void* resource) | |
| : resource(resource) | |
| , graphics_resource(nullptr) | |
| { | |
| SetObjectType(this); | |
| // Register D3D11 resource with CUDA | |
| if (!cudaHasError(cuGraphicsD3D11RegisterResource(&graphics_resource, (ID3D11Resource*)resource, CU_GRAPHICS_REGISTER_FLAGS_NONE))) | |
| { | |
| cudaHasError(cuGraphicsResourceSetMapFlags(graphics_resource, CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD)); | |
| } | |
| } | |
| cuda::GfxResource::~GfxResource() | |
| { | |
| if (graphics_resource != nullptr) | |
| cuGraphicsUnregisterResource(graphics_resource); | |
| } | |
| cuda::Event::Event() | |
| : event(nullptr) | |
| { | |
| SetObjectType(this); | |
| cudaHasError(cuEventCreate(&event, CU_EVENT_BLOCKING_SYNC | CU_EVENT_DISABLE_TIMING)); | |
| } | |
| cuda::Event::~Event() | |
| { | |
| if (event != nullptr) | |
| cuEventDestroy(event); | |
| } | |
| void cuda::Init() | |
| { | |
| // Set default texture flags | |
| for (u32 i = 0; i < FmtView_Count; i++) | |
| g_TextureFlags[i] = 0; | |
| // Set supported texture flags | |
| g_TextureFlags[FmtView_UInt] = CU_TRSF_READ_AS_INTEGER; | |
| g_TextureFlags[FmtView_SInt] = CU_TRSF_READ_AS_INTEGER; | |
| g_TextureFlags[FmtView_UNormSRGB] = CU_TRSF_SRGB; | |
| // Set default array formats | |
| for (u32 i = 0; i < FmtType_Count; i++) | |
| { | |
| for (u32 j = 0; j < 3; j++) | |
| g_ArrayFormatMap[i][j] = g_NullArrayFormat; | |
| } | |
| // Set supported array formats | |
| g_ArrayFormatMap[FmtType_UInt][0] = CU_AD_FORMAT_UNSIGNED_INT8; | |
| g_ArrayFormatMap[FmtType_UInt][1] = CU_AD_FORMAT_UNSIGNED_INT16; | |
| g_ArrayFormatMap[FmtType_UInt][2] = CU_AD_FORMAT_UNSIGNED_INT32; | |
| g_ArrayFormatMap[FmtType_SInt][0] = CU_AD_FORMAT_SIGNED_INT8; | |
| g_ArrayFormatMap[FmtType_SInt][1] = CU_AD_FORMAT_SIGNED_INT16; | |
| g_ArrayFormatMap[FmtType_SInt][2] = CU_AD_FORMAT_SIGNED_INT32; | |
| g_ArrayFormatMap[FmtType_Float][1] = CU_AD_FORMAT_HALF; | |
| g_ArrayFormatMap[FmtType_Float][2] = CU_AD_FORMAT_FLOAT; | |
| } | |
| CUmodule cuda::LoadPTXModule(const file::Path& filename, core::String256& response) | |
| { | |
| // Load the program from disk | |
| file::Path full_path = file::MakeGamePath(filename.c_str()); | |
| file::File file(full_path.c_str(), "rb"); | |
| if (!file.IsOpen()) | |
| return nullptr; | |
| u32 program_size = file.GetSize(); | |
| if (program_size == 0) | |
| return nullptr; | |
| char* program_data = new char[program_size + 1]; | |
| file.Read(program_data, program_size); | |
| program_data[program_size] = 0; | |
| // Create a build log | |
| u32 log_size = 1024; | |
| char* build_log = new char[log_size]; | |
| // Describe build options | |
| core::Vector<CUjit_option> options; | |
| options.push_back(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES); | |
| options.push_back(CU_JIT_INFO_LOG_BUFFER); | |
| // Set option values | |
| core::Vector<void*> option_values; | |
| option_values.push_back((void*)log_size); | |
| option_values.push_back(build_log); | |
| // Load the module | |
| // As this is a PTX file, the driver will on-demand build the binary | |
| CUmodule module; | |
| if (cudaHasError(cuModuleLoadDataEx(&module, program_data, options.size(), options.data(), option_values.data()))) | |
| { | |
| log_size = (u32)option_values[0]; | |
| core::String256 build_log_str(build_log, log_size); | |
| response.append(build_log_str); | |
| delete [] build_log; | |
| delete [] program_data; | |
| return nullptr; | |
| } | |
| // Add build log to response | |
| log_size = (u32)option_values[0]; | |
| core::String256 build_log_str(build_log, log_size); | |
| response.append(build_log_str); | |
| delete [] build_log; | |
| delete [] program_data; | |
| return module; | |
| } | |
| cuda::Program* cuda::NewProgram(const file::Path& filename) | |
| { | |
| // Load the module | |
| core::String256 response; | |
| CUmodule module = LoadPTXModule(filename, response); | |
| if (module == nullptr) | |
| return nullptr; | |
| // Create the program object | |
| Program* program = New<Program>(); | |
| if (program == nullptr) | |
| { | |
| cuModuleUnload(module); | |
| return nullptr; | |
| } | |
| // Setup program | |
| program->module = module; | |
| program->id = core::MakeNameID(filename.c_str()); | |
| return program; | |
| } | |
| cuda::Kernel* cuda::NewKernel(Program* program, const core::String32& name) | |
| { | |
| // Get the kernel function | |
| CUfunction function; | |
| if (cudaHasError(cuModuleGetFunction(&function, program->module, name.c_str()))) | |
| return nullptr; | |
| // Create the kernel object | |
| Kernel* kernel = New<Kernel>(); | |
| if (kernel == nullptr) | |
| return nullptr; | |
| // Setup kernel | |
| kernel->program = program; | |
| kernel->name = name; | |
| kernel->function = function; | |
| return kernel; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| @@ -1,228 +0,0 @@ | |
| #pragma once | |
| #include <Core/Compute.h> | |
| #include <Core/JobSystem.h> | |
| #include <Core/PixelFormat.h> | |
| class CUDACompute; | |
| // CUDA APIs currently don't compile with clang, which is what clReflect uses | |
| // However, it's very similar to GCC so define some needed macros from GCC | |
| #ifdef __clcpp_parse__ | |
| #define __noinline__ __attribute__((noinline)) | |
| #define __forceinline__ __inline__ __attribute__((always_inline)) | |
| #define __align__(n) __attribute__((aligned(n))) | |
| #define __thread__ __thread | |
| #define __import__ | |
| #define __export__ | |
| #define __cdecl | |
| #define __annotate__(a) __attribute__((a)) | |
| #define __location__(a) __annotate__(a) | |
| #define CUDARTAPI | |
| #endif | |
| struct IDXGIAdapter; | |
| struct ID3D11Device; | |
| struct ID3D11Resource; | |
| #include <cuda.h> | |
| #include <cudad3d11.h> | |
| #include <cuda_runtime.h> | |
| #define cudaHasError(result) cuda::HandleError((result), #result, __FILE__, __LINE__) | |
| clcpp_reflect_part(cuda) | |
| namespace cuda | |
| { | |
| struct Program; | |
| // Return if a CUDA function call returns an error, logging the error string and where the function was called | |
| bool HandleError(CUresult result, const char* expression, const char* file, int line); | |
| // Search all attached devices for the one with the highest performance | |
| int GetMaxGflopsDeviceId(); | |
| // Ensure the current thread has the given context active | |
| void EnsureContext(CUcontext context); | |
| // | |
| // Compute queues implemented as CUDA streams | |
| // | |
| class clcpp_attr(reflect_part) Queue : public cmp::Queue | |
| { | |
| public: | |
| Queue(CUcontext context); | |
| virtual ~Queue(); | |
| // Interface implementations | |
| virtual bool RunKernel(const cmp::HndKernel* h_kernel, const cmp::KernelArgs& args, u32 nb_items, int block_size); | |
| virtual bool CopyHostToDevice(const void* src_data, u32 src_size, cmp::DeviceMem* h_dev_mem); | |
| virtual bool CopyDeviceToHost(cmp::DeviceMem* h_dev_mem, void* dst_data, u32 dst_size); | |
| virtual bool CopyHostToTexture3D(const void* src_data, u32 src_pitch, u32 src_height, u32 src_depth, cmp::Texture3D* h_texture); | |
| virtual bool CopyTexture3DToHost(cmp::Texture3D* h_texture, void* dst_data, u32 dst_pitch, u32 dst_height, u32 dst_depth); | |
| virtual bool CopyDeviceToGfxTexture3D(cmp::DeviceMem* src_device, cmp::GfxResource* dst_gfxres, u32 pitch, u32 height, u32 depth); | |
| virtual bool CopyDeviceToGfxBuffer(cmp::DeviceMem* src_device, cmp::GfxResource* dst_gfxres, u32 size); | |
| virtual bool Sync(); | |
| virtual bool MarkEvent(cmp::Event* event); | |
| virtual bool WaitEvent(cmp::Event* event); | |
| virtual void* Resource(); | |
| CUstream Stream(); | |
| private: | |
| CUcontext m_Context; | |
| CUstream m_Stream; | |
| }; | |
| struct clcpp_attr(reflect_part) KernelTextureParam | |
| { | |
| KernelTextureParam() | |
| : ref_type(0) | |
| , dimensions(0) | |
| , read_type(0) | |
| , tex_ref(nullptr) | |
| , surf_ref(nullptr) | |
| { | |
| } | |
| // Name of global reference this parameter maps to | |
| core::String256 global_name; | |
| // Type info | |
| char ref_type; | |
| u32 dimensions; | |
| char read_type; | |
| // Texture or surface reference, dependent upon ref_type | |
| CUtexref tex_ref; | |
| CUsurfref surf_ref; | |
| }; | |
| // | |
| // Kernels are function handles within a module with no resource to manage | |
| // | |
| struct clcpp_attr(reflect_part) Kernel : public cmp::HndKernel | |
| { | |
| Kernel(); | |
| // Parent program | |
| Program* program; | |
| // Keep name around for reloads | |
| core::String32 name; | |
| // Handle to function | |
| CUfunction function; | |
| core::Vector<KernelTextureParam> texture_params; | |
| }; | |
| // | |
| // Programs are CUDA modules, controlling a list of Kernel objects | |
| // | |
| struct clcpp_attr(reflect_part) Program : public cmp::HndProgram | |
| { | |
| Program(); | |
| ~Program(); | |
| CUmodule module; | |
| // Filename hash for reloads | |
| u32 id; | |
| // Pointer to the program loader for removal on destruction | |
| CUDACompute* loader; | |
| // Allocated kernels owned by this program | |
| core::Vector<Kernel*> kernels; | |
| // Mutex for reloading programs | |
| mpp::Mutex mutex; | |
| }; | |
| struct clcpp_attr(reflect_part) DeviceMemory : public cmp::DeviceMem | |
| { | |
| DeviceMemory(u32 size); | |
| virtual ~DeviceMemory(); | |
| CUdeviceptr ptr; | |
| }; | |
| // | |
| // Page-locked host memory that is accessible to the device, useful in small parts for staging areas | |
| // | |
| struct clcpp_attr(reflect_part) HostMemory : public cmp::HostMem | |
| { | |
| HostMemory(u32 size); | |
| virtual ~HostMemory(); | |
| // Interface implementations | |
| virtual void* MapNow(cmp::Queue* queue); | |
| virtual void* Map(cmp::Queue* queue); | |
| virtual void Unmap(cmp::Queue* queue); | |
| }; | |
| struct clcpp_attr(reflect_part) Texture3D : public cmp::Texture3D | |
| { | |
| Texture3D(u32 width, u32 height, u32 depth, const Format& format, cmp::Access access); | |
| virtual ~Texture3D(); | |
| // Allocated device memory | |
| CUarray array; | |
| // View flags | |
| CUaddress_mode address_mode; | |
| CUfilter_mode filter_mode; | |
| u32 flags; | |
| }; | |
| struct clcpp_attr(reflect_part) GfxResource : public cmp::GfxResource | |
| { | |
| GfxResource(void* resource); | |
| ~GfxResource(); | |
| // The source D3D resource registered for use with CUDA | |
| void* resource; | |
| // The CUDA interop object | |
| CUgraphicsResource graphics_resource; | |
| }; | |
| struct clcpp_attr(reflect_part) Event : public cmp::Event | |
| { | |
| Event(); | |
| ~Event(); | |
| CUevent event; | |
| }; | |
| void Init(); | |
| CUmodule LoadPTXModule(const file::Path& filename, core::String256& response); | |
| Program* NewProgram(const file::Path& filename); | |
| Kernel* NewKernel(Program* program, const core::String32& name); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| @@ -1,401 +0,0 @@ | |
| // TODO: Is it allowed for the same context to be pushed on multiple thread contexts at the same time? | |
| #include "CUDACompute.h" | |
| #include <Core/Math.h> | |
| #include <Core/File.h> | |
| clcpp_impl_class(CUDACompute) | |
| // TODO: Got to move these | |
| // One idea would be to get a new tool to generate a C++ file with all export functions | |
| #include <Core/CoreReflection.h> | |
| clcpp_impl_class(core::VectorReadIterator) | |
| clcpp_impl_class(core::VectorWriteIterator) | |
| clcpp_impl_class(core::String32) | |
| clcpp_impl_class(core::String64) | |
| clcpp_impl_class(core::String256) | |
| namespace | |
| { | |
| cuda::Kernel* GetKernel(cuda::Program* program, const core::String256& kernel_name) | |
| { | |
| // Linear search for matching kernel name | |
| for (u32 i = 0; i < program->kernels.size(); i++) | |
| { | |
| cuda::Kernel* kernel = program->kernels[i]; | |
| if (kernel->name == kernel_name) | |
| return kernel; | |
| } | |
| return nullptr; | |
| } | |
| bool ReadString(file::File& fp, core::String256& string, u32 length) | |
| { | |
| string.set_length(length); | |
| return fp.Read(string.data(), length) == length; | |
| } | |
| bool ReadString(file::File& fp, core::String256& string) | |
| { | |
| u32 string_length = 0; | |
| if (!file::Read(fp, string_length)) | |
| return false; | |
| return ReadString(fp, string, string_length); | |
| } | |
| bool LoadTextureReflection(const file::Path& filename, cuda::Program* program) | |
| { | |
| core::Assert(program != nullptr); | |
| // Open for read | |
| file::Path full_path = file::MakeGamePath(filename.c_str()); | |
| file::File fp(full_path.c_str(), "rb"); | |
| if (!fp.IsOpen()) | |
| return false; | |
| // Ensure the ID matches | |
| core::String256 id; | |
| if (!ReadString(fp, id, 23)) | |
| return false; | |
| if (id != core::String256("CUDAKernelTextureParams")) | |
| return false; | |
| CUmodule module = program->module; | |
| // Read info for all functions | |
| u32 nb_functions = 0; | |
| if (!file::Read(fp, nb_functions)) | |
| return false; | |
| for (u32 i = 0; i < nb_functions; i++) | |
| { | |
| // Read the function name | |
| core::String256 function_name; | |
| if (!ReadString(fp, function_name)) | |
| return false; | |
| // Allocate enough space for all parameters in this function | |
| u32 nb_params = 0; | |
| if (!file::Read(fp, nb_params)) | |
| return false; | |
| core::Vector<cuda::KernelTextureParam> texture_params(nb_params); | |
| // Read all texture parameter objects | |
| for (u32 j = 0; j < nb_params; j++) | |
| { | |
| cuda::KernelTextureParam& param = texture_params[j]; | |
| if (!ReadString(fp, param.global_name)) | |
| return false; | |
| if (!file::Read(fp, param.ref_type)) | |
| return false; | |
| if (!file::Read(fp, param.dimensions)) | |
| return false; | |
| if (!file::Read(fp, param.read_type)) | |
| return false; | |
| } | |
| // Attempt to get a matching kernel | |
| cuda::Kernel* kernel = GetKernel(program, function_name); | |
| if (kernel == nullptr) | |
| continue; | |
| // Get global texture/surface references for each parameter | |
| for (u32 j = 0; j < nb_params; j++) | |
| { | |
| cuda::KernelTextureParam& param = texture_params[j]; | |
| if (param.ref_type == 't') | |
| cuModuleGetTexRef(¶m.tex_ref, module, param.global_name.c_str()); | |
| if (param.ref_type == 's') | |
| cuModuleGetSurfRef(¶m.surf_ref, module, param.global_name.c_str()); | |
| } | |
| // Store for runtime use in the kernel | |
| kernel->texture_params.copy_from(texture_params); | |
| } | |
| return true; | |
| } | |
| } | |
| CUDACompute::CUDACompute() | |
| : m_DeviceID(-1) | |
| , m_Device(-1) | |
| , m_Context(nullptr) | |
| { | |
| // Initialise the driver API | |
| core::LogText("CUDA: Initialising Driver API"); | |
| if (cudaHasError(cuInit(0))) | |
| return; | |
| // Report driver version | |
| int driver_version; | |
| if (cudaHasError(cuDriverGetVersion(&driver_version))) | |
| return; | |
| core::LogText("CUDA: Driver version %d", driver_version); | |
| // Set the device with the highest gflops/s | |
| m_DeviceID = cuda::GetMaxGflopsDeviceId(); | |
| if (cudaHasError(cuDeviceGet(&m_Device, m_DeviceID))) | |
| return; | |
| // Report what device is in use | |
| char name[100]; | |
| cuDeviceGetName(name, 100, m_Device); | |
| core::LogText("CUDA: Using device [%d]: %s", m_DeviceID, name); | |
| // Create the main context and pop it off the stack to allow other CUDA contexts elsewhere | |
| core::LogText("CUDA: Creating context"); | |
| if (cudaHasError(cuCtxCreate(&m_Context, CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST, m_Device))) | |
| return; | |
| cuCtxPopCurrent(&m_Context); | |
| cuda::Init(); | |
| // Bind to remotery | |
| rmtCUDABind bind; | |
| bind.context = m_Context; | |
| bind.CtxSetCurrent = &cuCtxSetCurrent; | |
| bind.CtxGetCurrent = &cuCtxGetCurrent; | |
| bind.EventCreate = &cuEventCreate; | |
| bind.EventDestroy = &cuEventDestroy; | |
| bind.EventRecord = &cuEventRecord; | |
| bind.EventQuery = &cuEventQuery; | |
| bind.EventElapsedTime = &cuEventElapsedTime; | |
| rmt_BindCUDA(&bind); | |
| } | |
| CUDACompute::~CUDACompute() | |
| { | |
| if (m_Context != nullptr) | |
| cuCtxDestroy(m_Context); | |
| } | |
| bool CUDACompute::FilesChanged(const core::Vector<file::Path>& filenames, core::String256& response) | |
| { | |
| cuda::EnsureContext(m_Context); | |
| bool changed = false; | |
| for (u32 i = 0; i < filenames.size(); i++) | |
| { | |
| u32 program_id = core::MakeNameID(filenames[i].c_str()); | |
| // Search for a matching program ID - note that there may be many programs matching the same ID | |
| for (u32 j = 0; j < m_Programs.size(); j++) | |
| { | |
| cuda::Program* program = m_Programs[j]; | |
| if (program->id != program_id) | |
| continue; | |
| // Ensure reloads don't happen while a program/kernel is in use | |
| mpp::MutexLock lock(program->mutex); | |
| response += core::String256(" Compiling "); | |
| response += filenames[i]; | |
| response += core::String256("\n"); | |
| // Reload/compile the new program and don't do anything if it fails | |
| CUmodule module = cuda::LoadPTXModule(filenames[i], response); | |
| if (module == nullptr) | |
| break; | |
| response += core::String256(" Success - reloading\n"); | |
| // Release the old program | |
| if (program->module != nullptr) | |
| cuModuleUnload(program->module); | |
| // Set the new program and retrieve an all new set of kernel functions | |
| program->module = module; | |
| for (size_t k = 0; k < program->kernels.size(); k++) | |
| { | |
| cuda::Kernel* kernel = program->kernels[k]; | |
| CUfunction function; | |
| if (cudaHasError(cuModuleGetFunction(&function, program->module, kernel->name.c_str()))) | |
| break; | |
| kernel->function = function; | |
| } | |
| // Reload texture reflection | |
| file::Path pathless_filename, extension; | |
| file::SplitPathExt(filenames[i], pathless_filename, extension); | |
| file::Path ckt_filename = pathless_filename + core::String256(".ckt"); | |
| if (!LoadTextureReflection(ckt_filename, program)) | |
| { | |
| Delete(program); | |
| break; | |
| } | |
| changed = true; | |
| } | |
| } | |
| return changed; | |
| } | |
| const cmp::HndProgram* CUDACompute::Program_New(const file::Path& filename, const core::String32* kernel_names, u32 nb_kernels) | |
| { | |
| cuda::EnsureContext(m_Context); | |
| // Load the program | |
| file::Path ptx_filename = filename + file::Path(".ptx"); | |
| cuda::Program* program = cuda::NewProgram(file::NormalisePath(ptx_filename)); | |
| if (program == nullptr) | |
| return nullptr; | |
| // Retrieve all kernels | |
| for (u32 i = 0; i < nb_kernels; i++) | |
| { | |
| cuda::Kernel* kernel = cuda::NewKernel(program, kernel_names[i]); | |
| program->kernels.push_back(kernel); | |
| } | |
| // Load texture reflection | |
| file::Path ckt_filename = filename + core::String256(".ckt"); | |
| if (!LoadTextureReflection(ckt_filename, program)) | |
| { | |
| Delete(program); | |
| return nullptr; | |
| } | |
| // Record the program in the loader | |
| program->loader = this; | |
| m_Programs.push_back(program); | |
| return program; | |
| } | |
| const cmp::HndKernel* CUDACompute::Program_GetKernel(const cmp::HndProgram* h_program, const core::String32& name) | |
| { | |
| cuda::EnsureContext(m_Context); | |
| // Linear search for kernel by name | |
| const cuda::Program* program = h_program->Cast<cuda::Program>(); | |
| for (u32 i = 0; i < program->kernels.size(); i++) | |
| { | |
| cuda::Kernel* kernel = program->kernels[i]; | |
| if (kernel->name == name) | |
| return kernel; | |
| } | |
| return nullptr; | |
| } | |
| cmp::DeviceMem* CUDACompute::DeviceMem_New(u32 size, cmp::Access) | |
| { | |
| cuda::EnsureContext(m_Context); | |
| // Allocate device memory, ignoring flags (only required by OpenCL) | |
| cuda::DeviceMemory* device_mem = new cuda::DeviceMemory(size); | |
| if (device_mem->ptr == nullptr) | |
| { | |
| delete device_mem; | |
| return nullptr; | |
| } | |
| return device_mem; | |
| } | |
| cmp::HostMem* CUDACompute::HostMem_New(u32 size, cmp::Access access) | |
| { | |
| cuda::EnsureContext(m_Context); | |
| // Allocate host memory, ignoring flags (only required by OpenCL) | |
| cuda::HostMemory* host_mem = new cuda::HostMemory(size); | |
| if (host_mem->Desc().mapped_mem == nullptr) | |
| { | |
| delete host_mem; | |
| return nullptr; | |
| } | |
| return host_mem; | |
| } | |
| cmp::Texture3D* CUDACompute::Texture3D_New(u32 width, u32 height, u32 depth, const Format& fmt, cmp::Access access) | |
| { | |
| cuda::EnsureContext(m_Context); | |
| cuda::Texture3D* texture = new cuda::Texture3D(width, height, depth, fmt, access); | |
| if (texture->array == nullptr) | |
| { | |
| delete texture; | |
| return nullptr; | |
| } | |
| return texture; | |
| } | |
| cmp::GfxResource* CUDACompute::GfxResource_New(void* resource) | |
| { | |
| cuda::EnsureContext(m_Context); | |
| cuda::GfxResource* gfx_resource = new cuda::GfxResource(resource); | |
| if (gfx_resource->graphics_resource == nullptr) | |
| { | |
| delete gfx_resource; | |
| return nullptr; | |
| } | |
| return gfx_resource; | |
| } | |
| cmp::Event* CUDACompute::Event_New() | |
| { | |
| cuda::EnsureContext(m_Context); | |
| cuda::Event* event = new cuda::Event(); | |
| if (event->event == nullptr) | |
| { | |
| delete event; | |
| return nullptr; | |
| } | |
| return event; | |
| } | |
| cmp::Queue* CUDACompute::Queue_New() | |
| { | |
| cuda::EnsureContext(m_Context); | |
| cuda::Queue* queue = new cuda::Queue(m_Context); | |
| if (queue->Stream() == nullptr) | |
| { | |
| delete queue; | |
| return nullptr; | |
| } | |
| return queue; | |
| } | |
| void CUDACompute::RemoveProgram(cuda::Program* program) | |
| { | |
| // Linear search for program by pointer | |
| for (u32 i = 0; i < m_Programs.size(); i++) | |
| { | |
| if (m_Programs[i] == program) | |
| { | |
| m_Programs.remove_unstable(i); | |
| break; | |
| } | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| @@ -1,38 +0,0 @@ | |
| #include "CUDA.h" | |
| class clcpp_attr(reflect_part) CUDACompute : public cmp::iCompute | |
| { | |
| public: | |
| CUDACompute(); | |
| ~CUDACompute(); | |
| // Subsystem implementations | |
| bool FilesChanged(const core::Vector<file::Path>& filenames, core::String256& response); | |
| // Interface implementations | |
| const cmp::HndProgram* Program_New(const file::Path& filename, const core::String32* kernel_names, u32 nb_kernels); | |
| const cmp::HndKernel* Program_GetKernel(const cmp::HndProgram* program, const core::String32& name); | |
| cmp::DeviceMem* DeviceMem_New(u32 size, cmp::Access access); | |
| cmp::HostMem* HostMem_New(u32 size, cmp::Access access); | |
| cmp::Texture3D* Texture3D_New(u32 width, u32 height, u32 depth, const Format& fmt, cmp::Access access); | |
| cmp::GfxResource* GfxResource_New(void* resource); | |
| cmp::Event* Event_New(); | |
| cmp::Queue* Queue_New(); | |
| void RemoveProgram(cuda::Program* program); | |
| private: | |
| void DequeueRaiseEvent(); | |
| // Currently selected device | |
| int m_DeviceID; | |
| CUdevice m_Device; | |
| // Main context | |
| CUcontext m_Context; | |
| // List of loaded programs | |
| core::Vector<cuda::Program*> m_Programs; | |
| }; | |
| No newline at end of file |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import Utils | |
| import Process | |
| import BuildSystem | |
# Installation directories come from the environment and are None when the variable is not set
InstallDir = os.environ.get("CUDA_PATH")
SampleDir = os.environ.get("NVCUDASAMPLES_ROOT")

# Paths relative to the installation path; all None when no installation was found
IncludeDir = None
x86LibDir = None
x64LibDir = None
BinDir = None
if InstallDir:
    IncludeDir = os.path.join(InstallDir, "include")
    x86LibDir = os.path.join(InstallDir, "lib/Win32")
    x64LibDir = os.path.join(InstallDir, "lib/x64")
    BinDir = os.path.join(InstallDir, "bin")

# Paths relative to the samples path; None when the samples were not found
SampleCommonIncludeDir = None
if SampleDir:
    SampleCommonIncludeDir = os.path.join(SampleDir, "common/inc")
#
# Names of nVidia GPU virtual architectures, for generating code up to the PTX stage.
# Each enum member has its own name as its value.
#
VirtualArch = Utils.enum(**{
    name: name for name in [
        'compute_10',
        'compute_11',
        'compute_12',
        'compute_13',
        'compute_20',
        'compute_30',
        'compute_32',
        'compute_35',
        'compute_50',
    ]
})
#
# Names of nVidia GPU real architectures, for generating final binary images.
# Each enum member has its own name as its value.
#
RealArch = Utils.enum(**{
    name: name for name in [
        'sm_10',
        'sm_11',
        'sm_12',
        'sm_13',
        'sm_20',
        'sm_21',
        'sm_30',
        'sm_32',
        'sm_35',
        'sm_50',
    ]
})
class CUDACompileOptions:

    """
    Options for the CUDA compiler, nvcc.

    Set the attributes as required and then call UpdateCommandLine to turn them into the
    list of command-line arguments stored in self.CommandLine. The command-line is not
    updated automatically when an attribute changes.
    """

    def __init__(self):

        # Set to 'c', 'c++' or 'cu' to explicitly set input language, rather than using extension
        self.Language = None

        # List of normal/system include search paths
        self.IncludePaths = [ ]
        self.SystemIncludePaths = [ ]

        # List of files to include first during preprocessing
        self.IncludeFiles = [ ]

        # List of macros to define/undefine for preprocessor
        self.DefineMacros = [ ]
        self.UndefineMacros = [ ]

        # List of library search paths
        self.LibraryPaths = [ ]

        # List of libraries to link with (specified without the library extension)
        self.Libraries = [ ]

        # Specify 32/64 bit machine target
        self.MachineBits = 32

        # Specify the path in which the compiler host EXE resides (e.g. MSVC, GCC)
        self.HostCompilerPath = None

        # Set to 'none', 'shared' or 'static' to specify runtime library type - default is 'static'
        self.CUDARuntime = None

        # Generate debug information for host/device code
        self.HostDebugLevel = None
        self.DeviceDebug = False

        # GPU architecture and GPUs to generate code for
        self.GPUArch = VirtualArch.compute_10
        self.GPUCode = RealArch.sm_10

        # Math operation behaviour
        self.FlushSingleDenormalsToZero = False
        self.PreciseSingleDivRecip = True
        self.PreciseSingleSqrt = True
        self.FuseMultipleAdds = True
        self.UseFastMath = False

        # Tool options
        self.DisableWarnings = False
        self.SourceInPTX = False
        self.RestrictPointers = False

    def UpdateCommandLine(self):

        """
        Rebuild self.CommandLine, the list of nvcc arguments, from the current option values.
        Options that are None, False or empty lists add nothing, except for the boolean
        math options which are always passed explicitly as true/false.
        """

        cmdline = [ ]

        if self.Language: cmdline += [ '--x=' + self.Language ]
        cmdline += [ '--include-path=' + path for path in self.IncludePaths ]
        cmdline += [ '--system-include=' + path for path in self.SystemIncludePaths ]
        cmdline += [ '--pre-include=' + file for file in self.IncludeFiles ]
        cmdline += [ '--define-macro=' + macro for macro in self.DefineMacros ]
        cmdline += [ '--undefine-macro=' + macro for macro in self.UndefineMacros ]
        cmdline += [ '--library-path=' + lib for lib in self.LibraryPaths ]
        # The '=' is required; without it the library name is fused onto the option name
        cmdline += [ '--library=' + lib for lib in self.Libraries ]
        cmdline += [ '--machine=' + str(self.MachineBits) ]
        if self.HostCompilerPath: cmdline += [ '--compiler-bindir=' + self.HostCompilerPath ]
        if self.CUDARuntime: cmdline += [ '--cudart=' + self.CUDARuntime ]
        # Compare against None explicitly as a debug level of 0 is a valid setting
        if self.HostDebugLevel is not None: cmdline += [ '--debug=' + str(self.HostDebugLevel) ]
        if self.DeviceDebug: cmdline += [ '--device-debug' ]

        cmdline += [ '--gpu-architecture=' + self.GPUArch ]
        cmdline += [ '--gpu-code=' + self.GPUCode ]

        cmdline += [ '--ftz=' + ('true' if self.FlushSingleDenormalsToZero else 'false') ]
        cmdline += [ '--prec-div=' + ('true' if self.PreciseSingleDivRecip else 'false') ]
        cmdline += [ '--prec-sqrt=' + ('true' if self.PreciseSingleSqrt else 'false') ]
        cmdline += [ '--fmad=' + ('true' if self.FuseMultipleAdds else 'false') ]
        if self.UseFastMath: cmdline += [ '--use_fast_math' ]

        if self.DisableWarnings: cmdline += [ '--disable-warnings' ]
        if self.SourceInPTX: cmdline += [ '--source-in-ptx' ]
        if self.RestrictPointers: cmdline += [ '--restrict' ]

        self.CommandLine = cmdline
class BuildPTXNode(BuildSystem.Node):

    """
    Build node that uses nvcc to compile the first output file of its source node to PTX.
    """

    def __init__(self, source):

        super().__init__()
        self.Source = source
        self.Dependencies = [ source ]

    def Build(self, env):

        # Compiler, PTX output mode and the options of the current configuration
        nvcc_path = os.path.join(BinDir, "nvcc.exe")
        cmdline = [ nvcc_path, '--ptx' ] + list(env.CurrentConfig.CUDACompileOptions.CommandLine)

        # The output .ptx file, with the input file last
        ptx_file = self.GetOutputFiles(env)[0]
        cmdline.append('--output-file=' + ptx_file)
        cmdline.append(self.GetInputFile(env))
        Utils.ShowCmdLine(env, cmdline)

        # Run the compiler to completion, showing its output unless that's been disabled
        process = Process.OpenPiped(cmdline)
        output = Process.WaitForPipeOutput(process)
        if not env.NoToolOutput:
            print(output)

        # The build succeeded if the compiler exited cleanly
        return process.returncode == 0

    def GetInputFile(self, env):

        # Input is the first output file of the source node
        return self.Source.GetOutputFiles(env)[0]

    def GetOutputFiles(self, env):

        # Strip the path and extension from the input filename
        # TODO: This only works if this node has another node as input that resides in
        # the same directory as it. Need to evaluate relative path inputs in long chains.
        input_name = os.path.basename(self.GetInputFile(env))
        base_name = os.path.splitext(input_name)[0]

        # The .ptx file of the same name goes in the output path of the current configuration
        return [ os.path.join(env.CurrentConfig.OutputPath, base_name + ".ptx") ]

    def GetTempOutputFiles(self, env):

        # All output files are temporary output files
        return self.GetOutputFiles(env)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build script for the "Kernels" project: runs each .cu file through ComputeBridge for both
# CUDA and OpenCL, compiles the CUDA output to PTX and copies the results to the game data
# directory for loading at runtime.
SetOutputPaths(env, "Kernels")

debug_config = env.Configs["debug"]
release_config = env.Configs["release"]

# Need to add this as an include directory for the generated code to reference
current_dir = os.getcwd()

# Location of kernel data to be loaded at runtime
kernel_data_dir = project_dir + "pub/GameData/Kernels"

# Gather input files
input_files = Utils.Glob(".", "*.cu")
input_file_nodes = [ env.NewFile(cu_file) for cu_file in input_files ]

kernel_include_paths = [
    current_dir,
    project_dir + "src/CppClient",
    project_dir + "extern/ComputeBridge/cbpp/inc",
]

# ComputeBridge uses same options for debug/release
cb_options = ComputeBridgePlatform.Options()
cb_options.IncludePaths = kernel_include_paths
cb_options_map = { "debug": cb_options, "release": cb_options }

# Run ComputeBridge for both CUDA and OpenCL
cuda_cb_files = [ ComputeBridgePlatform.BuildNode(cu_file, "cuda", cb_options_map) for cu_file in input_file_nodes ]
opencl_cb_files = [ ComputeBridgePlatform.BuildNode(cu_file, "opencl", cb_options_map) for cu_file in input_file_nodes ]

# Setup CUDA compile options, shared between debug and release
debug_config.CUDACompileOptions = CUDAPlatform.CUDACompileOptions()
debug_config.CUDACompileOptions.Language = 'cu'
debug_config.CUDACompileOptions.GPUArch = CUDAPlatform.VirtualArch.compute_20
debug_config.CUDACompileOptions.GPUCode = CUDAPlatform.RealArch.sm_20
debug_config.CUDACompileOptions.HostCompilerPath = os.path.join(MSVCPlatform.VCInstallDir, "bin")
debug_config.CUDACompileOptions.UpdateCommandLine()
release_config.CUDACompileOptions = debug_config.CUDACompileOptions

# Build CUDA PTX files
cuda_ptx_files = [ CUDAPlatform.BuildPTXNode(cb_file) for cb_file in cuda_cb_files ]

# Setup OpenCL compile options (it appears the OpenCL compiler already includes cwd)
debug_config.OpenCLCompileOptions = OpenCLPlatform.OpenCLCompileOptions()
debug_config.OpenCLCompileOptions.UpdateCommandLine()
release_config.OpenCLCompileOptions = debug_config.OpenCLCompileOptions

# OpenCL files are compiled on load so just run the precompiler
opencl_out_files = [ OpenCLPlatform.BuildOpenCLNode(cb_file) for cb_file in opencl_cb_files ]

# Copy CUDA output files for load and OpenCL ComputeBridge output for runtime compile
# Output 0 of each node is copied, along with output 1 of the CUDA ComputeBridge nodes
copied_files = [ env.CopyOutputFile(ptx_file, 0, kernel_data_dir) for ptx_file in cuda_ptx_files ]
copied_files += [ env.CopyOutputFile(ckt_file, 1, kernel_data_dir) for ckt_file in cuda_cb_files ]
copied_files += [ env.CopyOutputFile(cl_file, 0, kernel_data_dir) for cl_file in opencl_cb_files ]

env.Build(cuda_ptx_files + opencl_out_files + copied_files, "Kernels")

# Raw string so the backslashes of the Windows path are not parsed as escape sequences
VCGenerateProjectFile(env, "Kernels", input_files + [ "Kernels.pibfile" ], None, targets="Kernels", pibfile = r"..\..\..\pibfile")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment