dwilliamson · May 11, 2026 21:03
diff --git a/Compute.h b/Compute.h

 //
 // TODO: Expose Desc() as public member?
 //       After that's done, why does the Desc struct exist at all?
 //       Use const/const_cast trick to make the members read-only?
 //

 #pragma once


 #include <Core/Core.h>
 #include <Core/PixelFormat.h>


 // Forward declarations. Format is used by cmp::TextureDesc and iCompute::Texture3D_New below.
 // NOTE(review): TextureDesc stores a Format by value, so the full definition presumably comes
 // from <Core/PixelFormat.h> - confirm. mpp::Event is not referenced in the visible part of this header.
 struct Format;
 namespace mpp
 {
 	class Event;
 }


 clcpp_reflect_part(cmp)
 namespace cmp
 {
 	// ------------------------------------------------------------------------------------------------------- //
 	// Handles																								   //
 	// ------------------------------------------------------------------------------------------------------- //


 	// Opaque handle to a compute program, as returned by iCompute::Program_New.
 	// Carries no data of its own at this level.
 	struct clcpp_attr(reflect) HndProgram : public clutl::Object
 	{
 	};
 	// Opaque handle to a single kernel, as returned by iCompute::Program_GetKernel.
 	// The CUDA backend casts it to its own cuda::Kernel type in Queue::RunKernel.
 	struct clcpp_attr(reflect) HndKernel : public clutl::Object
 	{
 	};



 	// ------------------------------------------------------------------------------------------------------- //
 	// Kernel Argument API																					   //
 	// ------------------------------------------------------------------------------------------------------- //


 	class KernelArgs;


 	struct clcpp_attr(reflect_part) KernelArg
 	{
 		KernelArg();

 		// Read the argument's data as a TYPE and hand back a copy of it
 		template <typename TYPE>
 		TYPE Get() const
 		{
 			TYPE* typed_data = static_cast<TYPE*>(Data());
 			return *typed_data;
 		}

 		// Overwrite the argument's data with a copy of the given value
 		template <typename TYPE>
 		void Set(TYPE value)
 		{
 			TYPE* typed_data = static_cast<TYPE*>(Data());
 			*typed_data = value;
 		}

 		// Untyped pointer to the argument's data (defined out of line)
 		void* Data() const;

 		// Reflected type of the stored data
 		const clcpp::Type* data_type;

 		// Number of data bytes; sizeof(KernelArg) itself is not counted
 		u32 data_size;

 		// Index of this argument
 		u32 index;

 		// Offset of the data, in bytes
 		u32 offset;

 		// The argument list this argument belongs to
 		KernelArgs* args_parent;
 	};


 	//
 	// This class allows queueing of kernel arguments so that they can be applied to a kernel at the point of
 	// execution. This allows the Compute API to guarantee safe access to a kernel without it being reloaded
 	// during use. The type helpers are just an added convenience and not the point of this class.
 	//
 	class clcpp_attr(reflect) KernelArgs
 	{
 	public:
 		KernelArgs();

 		// Typed convenience wrapper: looks up the reflected type of the value (with const/pointer
 		// stripped) and forwards the value's address, type and byte size to the untyped Push
 		template <typename TYPE>
 		void Push(TYPE value)
 		{
 			const clcpp::Type* value_type = clcpp::GetType< CORE_STRIP_CONST_POINTER(TYPE) >();
 			Push(&value, value_type, sizeof(TYPE));
 		}

 	private:
 		friend struct KernelArg;
 		friend class KernelArgIterator;

 		// Untyped push used by the template above (defined out of line)
 		void Push(void* data, const clcpp::Type* type, u32 data_size);

 		// Argument storage lives inside the object itself so that instantiation is cheap
 		static const u32 MAX_DATA_SIZE = 1024;
 		u8 m_Data[MAX_DATA_SIZE];
 		u32 m_Position;

 		u32 m_NbArguments;
 	};


 	// Walks the arguments stored in a KernelArgs list one at a time.
 	// Holds a reference to the list, so the list must outlive the iterator.
 	class KernelArgIterator
 	{
 	public:
 		KernelArgIterator(const KernelArgs& args);

 		// Fetch the next argument into arg; callers loop while this returns true
 		// (see cuda::Queue::RunKernel)
 		bool GetNext(KernelArg& arg);

 	private:
 		// List being iterated
 		const KernelArgs& m_KernelArgs;
 		// Current iteration position
 		// NOTE(review): presumably a byte offset into the list's data - confirm in the implementation
 		u32 m_Position;
 	};


 	
 	// ------------------------------------------------------------------------------------------------------- //
 	// Interfaces																							   //
 	// ------------------------------------------------------------------------------------------------------- //



 	//
 	// Specifies read/write access to device memory for kernels
 	//
 	// The CUDA backend creates Texture3D arrays with surface load/store support when the access
 	// is Access_Write or Access_ReadWrite, and without it for Access_Read
 	enum clcpp_attr(reflect) Access
 	{
 		Access_Read,
 		Access_Write,
 		Access_ReadWrite,
 	};


 	class HostMem;
 	class DeviceMem;
 	class Texture3D;
 	struct GfxResource;
 	struct Event;



 	// ------------------------------------------------------------------------------------------------------- //
 	// Concurrent Work Queue Interface                                                                         //
 	// ------------------------------------------------------------------------------------------------------- //


 	// Sentinel block size for Queue::RunKernel: the CUDA backend treats a block_size equal to this
 	// value as a request to choose the block size itself (currently 512, clamped to nb_items)
 	static const int MAX_KERNEL_BLOCK_SIZE = 1;


 	// ARC-NOTE: No clcpp_impl_class as there are pure virtuals (and we don't WANT it to be creatable)
 	// Abstract work queue: kernel launches, memory copies and event operations are submitted
 	// through it. The CUDA backend implements it on top of a CUDA stream and returns true from
 	// the bool functions when the underlying driver call reported no error.
 	struct clcpp_attr(reflect) Queue : public clutl::Object2
 	{
 		virtual ~Queue() { }

 		// Run a kernel over nb_items work items with the queued arguments.
 		// Pass MAX_KERNEL_BLOCK_SIZE as block_size to let the backend choose the block size.
 		virtual bool RunKernel(const HndKernel* h_kernel, const KernelArgs& args, u32 nb_items, int block_size) = 0;

 		// Copies between raw host pointers and device memory/textures; sizes and pitches are in bytes
 		virtual bool CopyHostToDevice(const void* src_data, u32 src_size, DeviceMem* device_mem) = 0;
 		virtual bool CopyDeviceToHost(DeviceMem* device_mem, void* dst_data, u32 dst_size) = 0;
 		virtual bool CopyHostToTexture3D(const void* src_data, u32 src_pitch, u32 src_height, u32 src_depth, Texture3D* h_texture) = 0;
 		virtual bool CopyTexture3DToHost(Texture3D* h_texture, void* dst_data, u32 dst_pitch, u32 dst_height, u32 dst_depth) = 0;

 		// Copies from device memory into a registered graphics API resource
 		virtual bool CopyDeviceToGfxTexture3D(DeviceMem* src_device, GfxResource* dst_gfxres, u32 pitch, u32 height, u32 depth) = 0;
 		virtual bool CopyDeviceToGfxBuffer(DeviceMem* src_device, GfxResource* dst_gfxres, u32 size) = 0;

 		// Synchronise with the queue (the CUDA backend calls cuStreamSynchronize on its stream)
 		virtual bool Sync() = 0;

 		// Record an event in the queue / make the queue wait on an event
 		virtual bool MarkEvent(Event* event) = 0;
 		virtual bool WaitEvent(Event* event) = 0;

 		// Backend-specific resource behind the queue (the CUDA backend returns its stream)
 		virtual void* Resource() = 0;

 		// Helpers to use descriptions embedded in memory objects to simplify the call
 		bool CopyMappedHostToDevice(HostMem* host_mem, DeviceMem* device_mem);
 		bool CopyDeviceToMappedHost(DeviceMem* device_mem, HostMem* host_mem);
 		bool CopyMappedHostToTexture3D(HostMem* host_mem, Texture3D* texture);
 		bool CopyTexture3DToMappedHost(Texture3D* texture, HostMem* host_mem);
 	};



 	// ------------------------------------------------------------------------------------------------------- //
 	// Host Memory Interface                                                                                   //
 	// ------------------------------------------------------------------------------------------------------- //



 	// Description of a host memory allocation
 	struct clcpp_attr(reflect) HostMemDesc
 	{
 		HostMemDesc();
 		// Size of the allocation (the CUDA backend stores the requested size here on success)
 		u32 size;
 		// Pointer to the host-accessible memory (the CUDA backend stores its allocation here)
 		void* mapped_mem;
 	};


 	//
 	// Pinned/page-locked host memory allocated to be the source or destination of transfers to or
 	// from the device.
 	//
 	class clcpp_attr(reflect) HostMem : public clutl::Object2
 	{
 	public:
 		virtual ~HostMem() { }

 		// Map/unmap the memory for host access through the given queue.
 		// The CUDA backend returns Desc().mapped_mem from both MapNow and Map and does nothing in Unmap.
 		// NOTE(review): the intended difference between MapNow and Map is not visible here - confirm
 		virtual void* MapNow(Queue* queue) = 0;
 		virtual void* Map(Queue* queue) = 0;
 		virtual void Unmap(Queue* queue) = 0;

 		// Read-only access to the description
 		const HostMemDesc& Desc() const { return m_Desc; }

 	protected:
 		// Filled in by the backend implementation
 		HostMemDesc m_Desc;
 	};



 	// ------------------------------------------------------------------------------------------------------- //
 	// Device Memory Interface                                                                                  //
 	// ------------------------------------------------------------------------------------------------------- //



 	// Description of a device memory allocation
 	struct clcpp_attr(reflect) DeviceMemDesc
 	{
 		DeviceMemDesc();
 		// Size of the allocation (the CUDA backend stores the requested size here on success)
 		u32 size;
 	};


 	//
 	// A chunk of memory allocated on the device that can't be directly accessed by the host without
 	// copying.
 	//
 	class clcpp_attr(reflect) DeviceMem : public clutl::Object2
 	{
 	public:
 		virtual ~DeviceMem() { }

 		// Read-only access to the description
 		const DeviceMemDesc& Desc() const { return m_Desc; }

 	protected:
 		// Filled in by the backend implementation
 		DeviceMemDesc m_Desc;
 	};



 	// ------------------------------------------------------------------------------------------------------- //
 	// Texture Interface                                                                                    //
 	// ------------------------------------------------------------------------------------------------------- //


 	// TODO: There is a need for some kind of "sampler object" to describe filtering, etc.


 	// Description of a texture. The comments below describe how the CUDA backend fills it in.
 	struct clcpp_attr(reflect) TextureDesc
 	{
 		TextureDesc();
 		// Dimensions passed at creation
 		u32 width;
 		u32 height;
 		u32 depth;
 		// Bytes per row: width multiplied by the format's byte size
 		u32 pitch;
 		// Total bytes: pitch * height * depth
 		u32 size;
 		// Pixel format of the texture
 		Format format;
 	};


 	//
 	// A 3D texture allocated on the device to be sampled in kernels.
 	//
 	class clcpp_attr(reflect) Texture3D : public clutl::Object2
 	{
 	public:
 		virtual ~Texture3D() { }

 		// Read-only access to the description
 		const TextureDesc& Desc() const { return m_Desc; }

 	protected:
 		// Filled in by the backend implementation
 		TextureDesc m_Desc;
 	};


 	// A graphics API resource made available to compute, created with iCompute::GfxResource_New
 	// and used as the destination of Queue::CopyDeviceToGfxTexture3D/CopyDeviceToGfxBuffer
 	struct clcpp_attr(reflect) GfxResource : public clutl::Object2
 	{
 		virtual ~GfxResource() { }
 	};



 	// ------------------------------------------------------------------------------------------------------- //
 	// Event Interface                                                                                         //
 	// ------------------------------------------------------------------------------------------------------- //



 	// An event that can be recorded in a queue with Queue::MarkEvent and waited on with
 	// Queue::WaitEvent; created with iCompute::Event_New
 	struct clcpp_attr(reflect_part) Event : public clutl::Object2
 	{
 		virtual ~Event() { };
 	};




 	// ------------------------------------------------------------------------------------------------------- //
 	// Main Compute Module Interface                                                                           //
 	// ------------------------------------------------------------------------------------------------------- //



 	// Factory interface of the compute subsystem: creates programs, memory objects, textures,
 	// graphics resource wrappers, events and queues
 	struct clcpp_attr(reflect_part) iCompute : public core::iSubsystem
 	{
 		// Create a program from a file, given the names of the kernels it contains
 		virtual const HndProgram* Program_New(const file::Path& filename, const core::String32* kernel_names, u32 nb_kernels) = 0;
 		// Look up a kernel in a program by name
 		virtual const HndKernel* Program_GetKernel(const HndProgram* program, const core::String32& name) = 0;

 		virtual DeviceMem* DeviceMem_New(u32 size, Access access) = 0;

 		virtual HostMem* HostMem_New(u32 size, Access access) = 0;

 		// If read/write is specified then CUDA creates a surface reference
 		// Texture reflection can tell whether a surface needs to be bound or not
 		virtual Texture3D* Texture3D_New(u32 width, u32 height, u32 depth, const Format& fmt, Access access) = 0;

 		// Wrap a graphics API resource (the CUDA backend expects an ID3D11Resource*)
 		virtual GfxResource* GfxResource_New(void* resource) = 0;

 		virtual Event* Event_New() = 0;

 		virtual Queue* Queue_New() = 0;
 	};
 }
diff --git a/CUDA.cpp b/CUDA.cpp
 @@ -1,785 +0,0 @@

 #include "CUDA.h"
 #include "CUDACompute.h"

 #include <Core/File.h>


 // Reflection implementation hooks for the CUDA backend types.
 // NOTE(review): presumably clcpp_impl_class is used for the types created with New<T>() and no
 // arguments (Kernel, Program), and clcpp_impl_destruct for the types whose constructors take
 // arguments or that implement abstract interfaces - confirm against the clReflect documentation.
 clcpp_impl_destruct(cuda::Queue);
 clcpp_impl_class(cuda::Kernel);
 clcpp_impl_class(cuda::Program);
 clcpp_impl_destruct(cuda::DeviceMemory);
 clcpp_impl_destruct(cuda::HostMemory);
 clcpp_impl_destruct(cuda::Texture3D);
 clcpp_impl_destruct(cuda::GfxResource);
 clcpp_impl_destruct(cuda::Event);


 namespace
 {
 	// Marker for "no CUDA array format available" in the map below
 	// FmtType x bit-size matrix mapping a channel type to a CUarray_format; the second index is
 	// 0,1,2 for 8,16,32 bits per channel. Populated by cuda::Init.
 	CUarray_format g_NullArrayFormat = (CUarray_format)0;
 	CUarray_format g_ArrayFormatMap[FmtType_Count][3];
 	// Texture reference flags for each FmtView. Populated by cuda::Init.
 	u32 g_TextureFlags[FmtView_Count];


 	// Return the number of CUDA cores per streaming multiprocessor for a compute capability.
 	//    major, minor - the SM version, e.g. 3 and 5 for SM 3.5
 	// Versions missing from the table log a message and fall back to the last table entry.
 	int MapSMToCores(int major, int minor)
 	{
 		struct GpuArchCoresPerSM_t
 		{
 			int SM;		// 0xMm (hexadecimal notation), M = SM Major version, m = SM minor version
 			int Cores;
 		};

 		// The table never changes, so build it once rather than on every call
 		static const GpuArchCoresPerSM_t GpuArchCoresPerSM[] =
 		{
 			{ 0x10,   8 }, // Tesla Generation (SM 1.0) G80 class
 			{ 0x11,   8 }, // Tesla Generation (SM 1.1) G8x class
 			{ 0x12,   8 }, // Tesla Generation (SM 1.2) G9x class
 			{ 0x13,   8 }, // Tesla Generation (SM 1.3) GT200 class
 			{ 0x20,  32 }, // Fermi Generation (SM 2.0) GF100 class
 			{ 0x21,  48 }, // Fermi Generation (SM 2.1) GF10x class
 			{ 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
 			{ 0x32, 192 }, // Kepler Generation (SM 3.2)
 			{ 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
 			{ 0x37, 192 }, // Kepler Generation (SM 3.7)
 			{ 0x50, 128 }, // Maxwell Generation (SM 5.0)
 			{ 0x52, 128 }, // Maxwell Generation (SM 5.2)
 			{ 0x53, 128 }, // Maxwell Generation (SM 5.3)
 			{ 0x60,  64 }, // Pascal Generation (SM 6.0)
 			{ 0x61, 128 }, // Pascal Generation (SM 6.1)
 			{ 0x62, 128 }, // Pascal Generation (SM 6.2)
 			{ 0x70,  64 }, // Volta Generation (SM 7.0)
 			{ 0x72,  64 }, // Xavier (SM 7.2)
 			{ 0x75,  64 }, // Turing Generation (SM 7.5)
 			{ 0x80,  64 }, // Ampere Generation (SM 8.0)
 			{ 0x86, 128 }, // Ampere Generation (SM 8.6)
 		};

 		// Search for matching version
 		const int sm_version = (major << 4) + minor;
 		const int nb_arch_cores = sizeof(GpuArchCoresPerSM) / sizeof(GpuArchCoresPerSM[0]);
 		for (int i = 0; i < nb_arch_cores; i++)
 		{
 			if (GpuArchCoresPerSM[i].SM == sm_version)
 				return GpuArchCoresPerSM[i].Cores;
 		}

 		// If the version isn't in the table, assume it behaves like the last entry so the caller can still run
 		const GpuArchCoresPerSM_t& last_arch = GpuArchCoresPerSM[nb_arch_cores - 1];
 		core::LogText("MapSMToCores for SM %d.%d is undefined. Default to use %d Cores/SM", major, minor, last_arch.Cores);
 		return last_arch.Cores;
 	}
 }


 // Report a CUDA driver result.
 //    result     - value returned by the driver call
 //    expression - text of the call, for the log
 //    file, line - location of the call, for the log
 // Returns false for a zero result; otherwise logs the error and returns true.
 bool cuda::HandleError(CUresult result, const char* expression, const char* file, int line)
 {
 	// A zero result means there is nothing to report
 	if (!result)
 		return false;

 	// Ask the driver for the symbolic name of the error, with a placeholder if it has none
 	const char* error_name = nullptr;
 	const bool has_name = cuGetErrorName(result, &error_name) == CUDA_SUCCESS;
 	if (!has_name)
 		error_name = "<Unknown>";

 	// Likewise for the human-readable description
 	const char* error_desc = nullptr;
 	const bool has_desc = cuGetErrorString(result, &error_desc) == CUDA_SUCCESS;
 	if (!has_desc)
 		error_desc = "<Unknown>";

 	core::LogText("CUDA: Error at %s:%d (%s) code=%d(%s: %s)", file, line, expression, result, error_name, error_desc);

 	// TODO: device reset/exit?

 	return true;
 }


 int cuda::GetMaxGflopsDeviceId()
 {
 	// Get device count
 	int device_count = 0;
 	cudaGetDeviceCount(&device_count);

 	// Find the best major SM architecture GPU device
 	int best_sm_arch = 0;
 	for (int i = 0; i < device_count; i++)
 	{
 		cudaDeviceProp device_prop;
 		cudaGetDeviceProperties(&device_prop, i);

 		// If this GPU is not running on Compute Mode prohibited then we can add it to the list
 		if (device_prop.computeMode != cudaComputeModeProhibited)
 		{
 			if (device_prop.major > 0 && device_prop.major < 9999)
 				best_sm_arch = max(best_sm_arch, device_prop.major);
 		}
 	}

 	// Find the best CUDA capable GPU device
 	int sm_per_multiproc = 0;
 	int max_compute_perf = 0;
 	int max_perf_device = 0;
 	for (int i = 0; i < device_count; i++)
 	{
 		cudaDeviceProp device_prop;
 		cudaGetDeviceProperties(&device_prop, i);

 		// If this GPU is not running on Compute Mode prohibited then we can add it to the list
 		if (device_prop.major == 9999 && device_prop.minor == 9999)
 			sm_per_multiproc = 1;
 		else
 			sm_per_multiproc = MapSMToCores(device_prop.major, device_prop.minor);

 		int compute_perf = device_prop.multiProcessorCount * sm_per_multiproc * device_prop.clockRate;
 		if (compute_perf > max_compute_perf)
 		{
 			// If we find GPU with SM major >2, search only these
 			if (best_sm_arch > 2)
 			{
 				if (device_prop.major == best_sm_arch)
 				{
 					max_compute_perf = compute_perf;
 					max_perf_device = i;
 				}
 			}

 			else
 			{
 				max_compute_perf = compute_perf;
 				max_perf_device = i;
 			}
 		}
 	}

 	return max_perf_device;
 }


 // Make the given context current on the calling thread unless it already is.
 // If the current context can't be queried, the context is set anyway.
 void cuda::EnsureContext(CUcontext context)
 {
 	// Initialised so that a failed query can never leave an indeterminate value to compare against
 	CUcontext current = nullptr;
 	const bool query_failed = cuCtxGetCurrent(&current) != CUDA_SUCCESS;
 	if (query_failed || current != context)
 		cuCtxSetCurrent(context);
 }


 // Create a queue that remembers the given context and owns a new non-blocking CUDA stream.
 // NOTE(review): the stream is created without calling EnsureContext first, so it is created in
 // whatever context is current on the calling thread - confirm that is intended.
 cuda::Queue::Queue(CUcontext context)
 	: m_Context(context)
 	, m_Stream(nullptr)
 {
 	SetObjectType(this);

 	// Leave the stream null if creation reports an error
 	const bool create_failed = cudaHasError(cuStreamCreate(&m_Stream, CU_STREAM_NON_BLOCKING));
 	if (create_failed)
 		m_Stream = nullptr;
 }


 // Destroy the stream, if one was successfully created
 cuda::Queue::~Queue()
 {
 	if (m_Stream == nullptr)
 		return;

 	cuStreamDestroy(m_Stream);
 }


 // Launch a kernel on this queue's stream.
 //    h_kernel   - kernel handle; must be a cuda::Kernel
 //    args       - queued arguments; device memory and texture arguments are translated to their CUDA
 //                 equivalents, everything else is passed through as raw data
 //    nb_items   - number of work items; zero launches nothing and succeeds
 //    block_size - work items per block, or cmp::MAX_KERNEL_BLOCK_SIZE to use the default of 512;
 //                 must be positive
 // Returns true if the launch was submitted without error.
 bool cuda::Queue::RunKernel(const cmp::HndKernel* h_kernel, const cmp::KernelArgs& args, u32 nb_items, int block_size)
 {
 	static clcpp::uint32 HASH_DeviceMemory= clcpp::GetTypeNameHash<cmp::DeviceMem>();
 	static clcpp::uint32 HASH_HndTexture3D = clcpp::GetTypeNameHash<cmp::Texture3D>();

 	cuda::EnsureContext(m_Context);

 	// Cast to local implementations
 	core::Assert(h_kernel != nullptr);
 	const cuda::Kernel* kernel = h_kernel->Cast<cuda::Kernel>();

 	// With no work items there is nothing to launch; bailing out here also keeps the block count
 	// calculation below from dividing by zero
 	if (nb_items == 0)
 		return true;

 	// Lock the parent program so that it can't be reloaded while in use
 	// TODO: This interferes with multi-thread launching of different kernels in the same program
 	mpp::MutexLock lock(kernel->program->mutex);

 	// Build a list of pointers to each argument
 	static const int MAX_NB_ARGS = 10;
 	void* arg_ptrs[MAX_NB_ARGS] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 	cmp::KernelArg arg;
 	cmp::KernelArgIterator i(args);
 	u32 arg_index = 0;
 	u32 tex_arg_index = 0;
 	while (i.GetNext(arg))
 	{
 		core::Assert(arg.index < MAX_NB_ARGS);
 		core::Assert(arg_index < MAX_NB_ARGS);
 		clcpp::uint32 type_hash = arg.data_type->name.hash;
 		if (type_hash == HASH_DeviceMemory)
 		{
 			// Kernels receive the device pointer, not the wrapper object
 			cuda::DeviceMemory* dev_mem = arg.Get<cuda::DeviceMemory*>();
 			arg_ptrs[arg_index++] = &dev_mem->ptr;
 		}
 		else if (type_hash == HASH_HndTexture3D)
 		{
 			// Textures are bound to references in the module and take no slot in the argument list
 			cuda::Texture3D* texture = arg.Get<cuda::Texture3D*>();

 			// Get the next texture parameter
 			// NOTE(review): tex_arg_index is not checked against the number of texture parameters
 			const cuda::KernelTextureParam& tex_param = kernel->texture_params[tex_arg_index++];
 			if (tex_param.ref_type == 't')
 			{
 				// Validate the reference before any call uses it
 				core::Assert(tex_param.tex_ref != nullptr);

 				// Ensure dimensions/read mode match the texture
 				if (tex_param.read_type == 'u')
 					core::Assert((texture->flags & CU_TRSF_READ_AS_INTEGER) != 0);
 				else
 					core::Assert((texture->flags & CU_TRSF_READ_AS_INTEGER) == 0);

 				// Override texture reference settings with those in the texture
 				cuTexRefSetFilterMode(tex_param.tex_ref, texture->filter_mode);
 				cuTexRefSetAddressMode(tex_param.tex_ref, 0, texture->address_mode);
 				cuTexRefSetAddressMode(tex_param.tex_ref, 1, texture->address_mode);
 				cuTexRefSetAddressMode(tex_param.tex_ref, 2, texture->address_mode);
 				cuTexRefSetFlags(tex_param.tex_ref, texture->flags);

 				// Bind the texture's array data to the reference
 				cuTexRefSetArray(tex_param.tex_ref, texture->array, CU_TRSA_OVERRIDE_FORMAT);
 			}

 			else if (tex_param.ref_type == 's')
 			{
 				// Bind the texture's array data to the surface reference
 				core::Assert(tex_param.surf_ref != nullptr);
 				cuSurfRefSetArray(tex_param.surf_ref, texture->array, 0);
 			}
 		}
 		else
 		{
 			arg_ptrs[arg_index++] = arg.Data();
 		}
 	}

 	// Assign max work items if necessary
 	// TODO: Set correct block size for target hardware!
 	if (block_size == cmp::MAX_KERNEL_BLOCK_SIZE)
 		block_size = 512;

 	// A zero or negative block size can't be launched and would break the block count calculation
 	core::Assert(block_size > 0);
 	if (block_size <= 0)
 		return false;

 	// Never use blocks bigger than the work itself; compared unsigned so large item counts don't wrap negative
 	if ((u32)block_size > nb_items)
 		block_size = (int)nb_items;

 	// Launch the kernel with enough blocks to cover every item, rounding up
 	// TODO: Set correct block size for target hardware!
 	const u32 items_per_block = (u32)block_size;
 	const u32 nb_blocks = nb_items / items_per_block + ((nb_items % items_per_block) != 0 ? 1 : 0);
 	bool has_error = cudaHasError(cuLaunchKernel(kernel->function, nb_blocks, 1, 1, items_per_block, 1, 1, 0, m_Stream, arg_ptrs, nullptr));
 	return !has_error;
 }


 bool cuda::Queue::CopyHostToDevice(const void* src_data, u32 src_size, cmp::DeviceMem* h_dev_mem)
 {
 	cuda::EnsureContext(m_Context);

 	core::Assert(h_dev_mem != nullptr);
 	core::Assert(src_data != nullptr);

 	// Cast to local implementations
 	const cuda::DeviceMemory* dev_mem = (cuda::DeviceMemory*)h_dev_mem;
 	core::Assert(src_size <= dev_mem->Desc().size);

 	// Place write in the stream
 	return !cudaHasError(cuMemcpyHtoDAsync(dev_mem->ptr, src_data, src_size, m_Stream));
 }


 // Queue an asynchronous copy of dst_size bytes from a device allocation into host memory.
 // Returns true if the copy was submitted without error.
 bool cuda::Queue::CopyDeviceToHost(cmp::DeviceMem *h_dev_mem, void *dst_data, u32 dst_size)
 {
 	cuda::EnsureContext(m_Context);

 	core::Assert(dst_data != nullptr);
 	core::Assert(h_dev_mem != nullptr);

 	// The handle is really the CUDA implementation; the read must stay inside its allocation
 	const cuda::DeviceMemory* dev_mem = (cuda::DeviceMemory*)h_dev_mem;
 	const u32 capacity = dev_mem->Desc().size;
 	core::Assert(dst_size <= capacity);

 	// Submit the read on this queue's stream
 	const bool copy_failed = cudaHasError(cuMemcpyDtoHAsync(dst_data, dev_mem->ptr, dst_size, m_Stream));
 	return !copy_failed;
 }


 bool cuda::Queue::CopyHostToTexture3D(const void* src_data, u32 src_pitch, u32 src_height, u32 src_depth, cmp::Texture3D* h_texture)
 {
 	cuda::EnsureContext(m_Context);

 	core::Assert(h_texture != nullptr);
 	core::Assert(src_data != nullptr);

 	// Cast to local implementations
 	const cuda::Texture3D* texture = (cuda::Texture3D*)h_texture;
 	const cmp::TextureDesc& desc = texture->Desc();
 	core::Assert(src_pitch <= desc.pitch);
 	core::Assert(src_height <= desc.height);
 	core::Assert(src_depth <= desc.depth);
 	core::Assert(texture->array != nullptr);

 	// Describe the host to array copy
 	CUDA_MEMCPY3D copy;
 	memset(&copy, 0, sizeof(copy));
 	copy.Depth = src_depth;
 	copy.Height = src_height;
 	copy.WidthInBytes = src_pitch;
 	copy.srcHost = src_data;
 	copy.srcHeight = src_height;
 	copy.srcMemoryType = CU_MEMORYTYPE_HOST;
 	copy.srcPitch = src_pitch;
 	copy.dstArray = texture->array;
 	copy.dstMemoryType = CU_MEMORYTYPE_ARRAY;

 	// Place the copy in the stream
 	return !cudaHasError(cuMemcpy3DAsync(&copy, m_Stream));
 }


 // Queue an asynchronous copy from a 3D texture's array into host memory.
 //    dst_pitch  - bytes per destination row, also used as the copy width in bytes
 //    dst_height - rows per slice
 //    dst_depth  - number of slices
 // Returns true if the copy was submitted without error.
 bool cuda::Queue::CopyTexture3DToHost(cmp::Texture3D* h_texture, void* dst_data, u32 dst_pitch, u32 dst_height, u32 dst_depth)
 {
 	cuda::EnsureContext(m_Context);

 	core::Assert(h_texture != nullptr);
 	core::Assert(dst_data != nullptr);

 	// The handle is really the CUDA implementation; the read must stay inside the texture
 	const cuda::Texture3D* texture = (cuda::Texture3D*)h_texture;
 	const cmp::TextureDesc& tex_desc = texture->Desc();
 	core::Assert(dst_pitch <= tex_desc.pitch);
 	core::Assert(dst_height <= tex_desc.height);
 	core::Assert(dst_depth <= tex_desc.depth);
 	core::Assert(texture->array != nullptr);

 	// Describe the array to host copy, starting from an all-zero description
 	CUDA_MEMCPY3D copy;
 	memset(&copy, 0, sizeof(copy));

 	// Source: the texture's array
 	copy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
 	copy.srcArray = texture->array;

 	// Destination: host memory
 	copy.dstMemoryType = CU_MEMORYTYPE_HOST;
 	copy.dstHost = dst_data;
 	copy.dstPitch = dst_pitch;
 	copy.dstHeight = dst_height;

 	// Extent of the copy
 	copy.WidthInBytes = dst_pitch;
 	copy.Height = dst_height;
 	copy.Depth = dst_depth;

 	// Submit the copy on this queue's stream
 	const bool copy_failed = cudaHasError(cuMemcpy3DAsync(&copy, m_Stream));
 	return !copy_failed;
 }


 // Queue a copy from device memory into a registered graphics texture.
 // The resource is mapped, copied into through its first sub-resource array and unmapped again,
 // all on this queue's stream.
 //    pitch  - bytes per row, also used as the copy width in bytes
 //    height - rows per slice
 //    depth  - number of slices
 // Returns true if every step was submitted without error.
 bool cuda::Queue::CopyDeviceToGfxTexture3D(cmp::DeviceMem* cmp_src_device, cmp::GfxResource* cmp_dst_gfxres, u32 pitch, u32 height, u32 depth)
 {
 	rmt_ScopedCPUSample(CopyDeviceToGfxTexture3D);

 	// Like the other copy functions, make sure the calls below run against this queue's context
 	cuda::EnsureContext(m_Context);

 	core::Assert(cmp_src_device != nullptr);
 	core::Assert(cmp_dst_gfxres != nullptr);

 	cuda::DeviceMemory* devmem_source = (cuda::DeviceMemory*)cmp_src_device;
 	cuda::GfxResource* gfxres_dest = (cuda::GfxResource*)cmp_dst_gfxres;

 	// Map the graphics resource
 	core::Assert(gfxres_dest->graphics_resource != nullptr);
 	if (cudaHasError(cuGraphicsMapResources(1, &gfxres_dest->graphics_resource, m_Stream)))
 		return false;

 	// Get an array for CUDA access to the graphics resource
 	CUarray array = nullptr;
 	if (cudaHasError(cuGraphicsSubResourceGetMappedArray(&array, gfxres_dest->graphics_resource, 0, 0)))
 	{
 		cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream);
 		return false;
 	}

 	// Describe the device to array copy
 	CUDA_MEMCPY3D copy;
 	memset(&copy, 0, sizeof(copy));
 	copy.Depth = depth;
 	copy.Height = height;
 	copy.WidthInBytes = pitch;
 	copy.srcDevice = devmem_source->ptr;
 	copy.srcHeight = height;
 	copy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
 	copy.srcPitch = pitch;
 	copy.dstArray = array;
 	copy.dstMemoryType = CU_MEMORYTYPE_ARRAY;

 	if (cudaHasError(cuMemcpy3DAsync(&copy, m_Stream)))
 	{
 		cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream);
 		return false;
 	}

 	// Clean up with an unmap
 	return !cudaHasError(cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream));
 }


 // Queue a copy of size bytes from device memory into a registered graphics buffer.
 // The resource is mapped, copied into through its mapped device pointer and unmapped again,
 // all on this queue's stream. The copy is refused if it would not fit in the mapped buffer.
 // Returns true if every step was submitted without error.
 bool cuda::Queue::CopyDeviceToGfxBuffer(cmp::DeviceMem* cmp_src_device, cmp::GfxResource* cmp_dst_gfxres, u32 size)
 {
 	rmt_ScopedCPUSample(CopyDeviceToGfxBuffer);

 	// Like the other copy functions, make sure the calls below run against this queue's context
 	cuda::EnsureContext(m_Context);

 	core::Assert(cmp_src_device != nullptr);
 	core::Assert(cmp_dst_gfxres != nullptr);

 	cuda::DeviceMemory* devmem_source = (cuda::DeviceMemory*)cmp_src_device;
 	cuda::GfxResource* gfxres_dest = (cuda::GfxResource*)cmp_dst_gfxres;

 	// The read must stay inside the source allocation
 	core::Assert(size <= devmem_source->Desc().size);

 	// Map the graphics resource
 	core::Assert(gfxres_dest->graphics_resource != nullptr);
 	if (cudaHasError(cuGraphicsMapResources(1, &gfxres_dest->graphics_resource, m_Stream)))
 		return false;

 	// Get a device pointer for CUDA access to the graphics resource
 	CUdeviceptr device_ptr = 0;
 	size_t map_size = 0;
 	if (cudaHasError(cuGraphicsResourceGetMappedPointer(&device_ptr, &map_size, gfxres_dest->graphics_resource)))
 	{
 		cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream);
 		return false;
 	}

 	// Refuse to write past the end of the mapped buffer
 	core::Assert(size <= map_size);
 	if (size > map_size)
 	{
 		cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream);
 		return false;
 	}

 	// Device-device copy
 	if (cudaHasError(cuMemcpyDtoDAsync(device_ptr, devmem_source->ptr, size, m_Stream)))
 	{
 		cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream);
 		return false;
 	}

 	// Clean up with an unmap
 	return !cudaHasError(cuGraphicsUnmapResources(1, &gfxres_dest->graphics_resource, m_Stream));
 }


 // Synchronise with this queue's stream, with CPU and CUDA profiling samples around the call.
 // Returns true if the synchronise call reported no error.
 bool cuda::Queue::Sync()
 {
 	rmt_ScopedCPUSample(cudaQueueSync);
 	rmt_ScopedCUDASample(cudaQueueSync, m_Stream);

 	cuda::EnsureContext(m_Context);

 	const bool sync_failed = cudaHasError(cuStreamSynchronize(m_Stream));
 	return !sync_failed;
 }


 // Record the given event on this queue's stream.
 // Returns true if the record call reported no error.
 bool cuda::Queue::MarkEvent(cmp::Event* cmp_event)
 {
 	core::Assert(cmp_event != nullptr);
 	Event* cuda_event = (Event*)cmp_event;

 	core::Assert(m_Stream != nullptr);
 	const bool record_failed = cudaHasError(cuEventRecord(cuda_event->event, m_Stream));
 	return !record_failed;
 }


 // Make this queue's stream wait on the given event.
 // Returns true if the wait call reported no error.
 bool cuda::Queue::WaitEvent(cmp::Event* cmp_event)
 {
 	core::Assert(cmp_event != nullptr);
 	Event* cuda_event = (Event*)cmp_event;

 	core::Assert(m_Stream != nullptr);
 	const bool wait_failed = cudaHasError(cuStreamWaitEvent(m_Stream, cuda_event->event, 0));
 	return !wait_failed;
 }


 // Backend-neutral access to the underlying resource: the CUDA stream as an untyped pointer
 void* cuda::Queue::Resource()
 {
 	void* stream_handle = m_Stream;
 	return stream_handle;
 }


 // Typed access to the CUDA stream behind this queue (null if stream creation failed)
 CUstream cuda::Queue::Stream()
 {
 	CUstream stream = m_Stream;
 	return stream;
 }


 // Construct a kernel with no owning program and no CUDA function.
 // cuda::NewKernel fills in program, name and function after construction.
 cuda::Kernel::Kernel()
 	: program(nullptr)
 	, function(nullptr)
 {
 }


 // Construct a program with no module, a zero id and no loader.
 // cuda::NewProgram fills in module and id after construction; the loader is assigned elsewhere.
 cuda::Program::Program()
 	: module(nullptr)
 	, id(0)
 	, loader(nullptr)
 {
 }


 // Tear down a program: its kernels first, then its registration with the loader, then the module
 cuda::Program::~Program()
 {
 	// The program owns its kernels, so destroy each of them
 	for (u32 kernel_index = 0; kernel_index < kernels.size(); kernel_index++)
 	{
 		Delete(kernels[kernel_index]);
 	}

 	// Unregister from the loader, if there is one
 	if (loader != nullptr)
 	{
 		loader->RemoveProgram(this);
 	}

 	// Unload the CUDA module, if one was loaded
 	if (module != nullptr)
 	{
 		cuModuleUnload(module);
 	}
 }


 // Allocate size bytes of device memory.
 // On failure the device pointer stays zero and the description keeps its default size.
 cuda::DeviceMemory::DeviceMemory(u32 size)
 	: ptr(0)		// CUdeviceptr is an integer handle rather than a C++ pointer, so "no allocation" is 0
 {
 	SetObjectType(this);

 	if (cudaHasError(cuMemAlloc(&ptr, size)))
 	{
 		// Make sure the destructor never sees a half-written handle
 		ptr = 0;
 		return;
 	}

 	m_Desc.size = size;
 }


 // Free the device allocation, if there is one
 cuda::DeviceMemory::~DeviceMemory()
 {
 	// CUdeviceptr is an integer handle rather than a C++ pointer, so "no allocation" is 0
 	if (ptr != 0)
 		cuMemFree(ptr);
 }


 // Allocate size bytes of host memory through the CUDA driver.
 // The size is only recorded in the description when the allocation succeeds.
 cuda::HostMemory::HostMemory(u32 size)
 {
 	SetObjectType(this);

 	const bool alloc_failed = cudaHasError(cuMemAllocHost(&m_Desc.mapped_mem, size));
 	if (!alloc_failed)
 		m_Desc.size = size;
 }


 // Release the host allocation, if there is one
 cuda::HostMemory::~HostMemory()
 {
 	void* host_mem = m_Desc.mapped_mem;
 	if (host_mem == nullptr)
 		return;

 	cuMemFreeHost(host_mem);
 }


 // Return the host pointer for the allocation; the queue is not used
 void* cuda::HostMemory::MapNow(cmp::Queue*)
 {
 	// The CUDA Driver tracks the virtual memory ranges of the allocated memory, so the pointer
 	// is usable as it stands and no explicit Map call is needed
 	void* host_mem = m_Desc.mapped_mem;
 	return host_mem;
 }


 // Return the host pointer for the allocation; the queue is not used
 void* cuda::HostMemory::Map(cmp::Queue*)
 {
 	// The CUDA Driver tracks the virtual memory ranges of the allocated memory, so the pointer
 	// is usable as it stands and no explicit Map call is needed
 	void* host_mem = m_Desc.mapped_mem;
 	return host_mem;
 }


 // Counterpart to Map/MapNow; the queue is not used
 void cuda::HostMemory::Unmap(cmp::Queue*)
 {
 	// Nothing to do: Map and MapNow only return the existing host pointer, so there is no
 	// mapping to release
 }


 // Create a 3D CUDA array for the given dimensions and format.
 //    format - must have the same bit count in every channel, and that count must be 8, 16 or 32
 //    access - write and read/write textures are created with surface load/store support
 // Textures default to clamped addressing and point filtering. On failure the array stays null
 // and the description keeps its default values.
 cuda::Texture3D::Texture3D(u32 width, u32 height, u32 depth, const Format& format, cmp::Access access)
 	: array(nullptr)
 	, address_mode(CU_TR_ADDRESS_MODE_CLAMP)
 	, filter_mode(CU_TR_FILTER_MODE_POINT)
 	, flags(0)
 {
 	SetObjectType(this);
 	const FormatDesc& fmt_desc = FormatDesc_Get(format.fmt);

 	// Ensure this is a format that has an equal bit-size for each channel
 	if (fmt_desc.g_type != FmtType_None)
 		core::Assert(fmt_desc.r_nb_bits == fmt_desc.g_nb_bits);
 	if (fmt_desc.b_type != FmtType_None)
 		core::Assert(fmt_desc.r_nb_bits == fmt_desc.b_nb_bits);
 	if (fmt_desc.a_type != FmtType_None)
 		core::Assert(fmt_desc.r_nb_bits == fmt_desc.a_nb_bits);

 	// Transform bits counts 8,16,32 into the indices 0,1,2 while checking for unsupported bit counts
 	u32 array_format_index = core::LogBase2(fmt_desc.r_nb_bits) - 3;
 	core::Assert(array_format_index < 3);
 	core::Assert((1 << (array_format_index + 3)) == fmt_desc.r_nb_bits);

 	// Lookup the equivalent array format
 	CUarray_format array_format = g_ArrayFormatMap[fmt_desc.r_type][array_format_index];
 	core::Assert(array_format != g_NullArrayFormat);

 	// https://devtalk.nvidia.com/default/topic/690069/?comment=4124250

 	// Create the memory for the texture
 	const bool needs_surface = access == cmp::Access_Write || access == cmp::Access_ReadWrite;
 	CUDA_ARRAY3D_DESCRIPTOR array_desc;
 	memset(&array_desc, 0, sizeof(array_desc));
 	array_desc.Width = width;
 	array_desc.Height = height;
 	array_desc.Depth = depth;
 	array_desc.Format = array_format;
 	array_desc.NumChannels = fmt_desc.nb_bits / fmt_desc.r_nb_bits;
 	array_desc.Flags = needs_surface ? CUDA_ARRAY3D_SURFACE_LDST : 0;
 	if (cudaHasError(cuArray3DCreate(&array, &array_desc)))
 		return;

 	// Set view flags; addressing and filtering keep the defaults from the initialiser list
 	flags = g_TextureFlags[format.view];	// CU_TRSF_NORMALIZED_COORDINATES ?

 	// Set description, including the format so that Desc() describes the texture completely
 	m_Desc.width = width;
 	m_Desc.height = height;
 	m_Desc.depth = depth;
 	m_Desc.pitch = width * fmt_desc.NbBytes();
 	m_Desc.size = m_Desc.pitch * height * depth;
 	m_Desc.format = format;
 }


 // Destroy the CUDA array, if one was created
 cuda::Texture3D::~Texture3D()
 {
 	if (array == nullptr)
 		return;

 	cuArrayDestroy(array);
 }


 // Wrap a D3D11 resource for use by CUDA.
 //    resource - an ID3D11Resource*, passed untyped
 // If registration fails the CUDA graphics resource stays null.
 cuda::GfxResource::GfxResource(void* resource)
 	: resource(resource)
 	, graphics_resource(nullptr)
 {
 	SetObjectType(this);

 	// Register D3D11 resource with CUDA; without a registration there is nothing more to set up
 	const bool register_failed = cudaHasError(cuGraphicsD3D11RegisterResource(&graphics_resource, (ID3D11Resource*)resource, CU_GRAPHICS_REGISTER_FLAGS_NONE));
 	if (register_failed)
 		return;

 	// Mark the resource as write-discard for mapping; an error here is only logged
 	cudaHasError(cuGraphicsResourceSetMapFlags(graphics_resource, CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD));
 }


 // Unregister the resource from CUDA, if it was registered
 cuda::GfxResource::~GfxResource()
 {
 	if (graphics_resource == nullptr)
 		return;

 	cuGraphicsUnregisterResource(graphics_resource);
 }


 // Create a CUDA event with blocking sync enabled and timing disabled.
 // An error during creation is only logged.
 cuda::Event::Event()
 	: event(nullptr)
 {
 	SetObjectType(this);

 	const unsigned int event_flags = CU_EVENT_BLOCKING_SYNC | CU_EVENT_DISABLE_TIMING;
 	cudaHasError(cuEventCreate(&event, event_flags));
 }


 // Destroy the CUDA event, if one was created
 cuda::Event::~Event()
 {
 	if (event == nullptr)
 		return;

 	cuEventDestroy(event);
 }


 void cuda::Init()
 {
 	// Set default texture flags
 	for (u32 i = 0; i < FmtView_Count; i++)
 		g_TextureFlags[i] = 0;

 	// Set supported texture flags
 	g_TextureFlags[FmtView_UInt] = CU_TRSF_READ_AS_INTEGER;
 	g_TextureFlags[FmtView_SInt] = CU_TRSF_READ_AS_INTEGER;
 	g_TextureFlags[FmtView_UNormSRGB] = CU_TRSF_SRGB;

 	// Set default array formats
 	for (u32 i = 0; i < FmtType_Count; i++)
 	{
 		for (u32 j = 0; j < 3; j++)
 			g_ArrayFormatMap[i][j] = g_NullArrayFormat;
 	}

 	// Set supported array formats
 	g_ArrayFormatMap[FmtType_UInt][0] = CU_AD_FORMAT_UNSIGNED_INT8;
 	g_ArrayFormatMap[FmtType_UInt][1] = CU_AD_FORMAT_UNSIGNED_INT16;
 	g_ArrayFormatMap[FmtType_UInt][2] = CU_AD_FORMAT_UNSIGNED_INT32;
 	g_ArrayFormatMap[FmtType_SInt][0] = CU_AD_FORMAT_SIGNED_INT8;
 	g_ArrayFormatMap[FmtType_SInt][1] = CU_AD_FORMAT_SIGNED_INT16;
 	g_ArrayFormatMap[FmtType_SInt][2] = CU_AD_FORMAT_SIGNED_INT32;
 	g_ArrayFormatMap[FmtType_Float][1] = CU_AD_FORMAT_HALF;
 	g_ArrayFormatMap[FmtType_Float][2] = CU_AD_FORMAT_FLOAT;
 }


 // Load a PTX file from the game path and build it into a CUDA module.
 //    filename - path of the PTX file, relative to the game path
 //    response - receives the driver's build log, whether or not the load succeeds
 // Returns the module, or null if the file can't be opened, is empty, or fails to load.
 CUmodule cuda::LoadPTXModule(const file::Path& filename, core::String256& response)
 {
 	// Load the program from disk
 	file::Path full_path = file::MakeGamePath(filename.c_str());
 	file::File file(full_path.c_str(), "rb");
 	if (!file.IsOpen())
 		return nullptr;
 	u32 program_size = file.GetSize();
 	if (program_size == 0)
 		return nullptr;

 	// The driver expects null-terminated PTX text, so leave room for the terminator
 	char* program_data = new char[program_size + 1];
 	file.Read(program_data, program_size);
 	program_data[program_size] = 0;

 	// Create a build log
 	const u32 log_capacity = 1024;
 	char* build_log = new char[log_capacity];
 	build_log[0] = 0;

 	// Describe build options
 	core::Vector<CUjit_option> options;
 	options.push_back(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES);
 	options.push_back(CU_JIT_INFO_LOG_BUFFER);

 	// Set option values
 	// The buffer size travels in a pointer-sized slot, so widen it through size_t first
 	core::Vector<void*> option_values;
 	option_values.push_back((void*)(size_t)log_capacity);
 	option_values.push_back(build_log);

 	// Load the module
 	// As this is a PTX file, the driver will on-demand build the binary
 	CUmodule module = nullptr;
 	const bool load_failed = cudaHasError(cuModuleLoadDataEx(&module, program_data, options.size(), options.data(), option_values.data()));

 	// Add build log to response, on success and failure alike
 	// The driver writes the number of log bytes it filled back into the size slot; narrow it through
 	// size_t (a direct pointer-to-u32 cast doesn't compile on 64-bit targets) and never trust it
 	// beyond the buffer that was actually allocated
 	u32 log_size = (u32)(size_t)option_values[0];
 	if (log_size > log_capacity)
 		log_size = log_capacity;
 	core::String256 build_log_str(build_log, log_size);
 	response.append(build_log_str);

 	delete [] build_log;
 	delete [] program_data;

 	if (load_failed)
 		return nullptr;
 	return module;
 }


 // Create a program object from a PTX file.
 // Returns null if the module can't be loaded or the program object can't be created.
 // NOTE(review): the build log collected from LoadPTXModule is not used here.
 cuda::Program* cuda::NewProgram(const file::Path& filename)
 {
 	// Without a module there is nothing to wrap
 	core::String256 response;
 	CUmodule module = LoadPTXModule(filename, response);
 	if (module == nullptr)
 		return nullptr;

 	// Wrap the module in a program object, identified by a name ID made from its filename
 	Program* program = New<Program>();
 	if (program != nullptr)
 	{
 		program->module = module;
 		program->id = core::MakeNameID(filename.c_str());
 		return program;
 	}

 	// No program object to take ownership, so release the module again
 	cuModuleUnload(module);
 	return nullptr;
 }


 // Create a kernel object for the named function in a program's module.
 // Returns null if the function can't be found or the kernel object can't be created.
 cuda::Kernel* cuda::NewKernel(Program* program, const core::String32& name)
 {
 	// Look the function up in the module first; no function, no kernel
 	CUfunction function;
 	const bool lookup_failed = cudaHasError(cuModuleGetFunction(&function, program->module, name.c_str()));
 	if (lookup_failed)
 		return nullptr;

 	// Create the kernel object and tie it to its program, name and function
 	Kernel* kernel = New<Kernel>();
 	if (kernel != nullptr)
 	{
 		kernel->program = program;
 		kernel->name = name;
 		kernel->function = function;
 	}

 	return kernel;
 }
diff --git a/CUDA.h b/CUDA.h
 @@ -1,228 +0,0 @@

 #pragma once


 #include <Core/Compute.h>
 #include <Core/JobSystem.h>
 #include <Core/PixelFormat.h>


 class CUDACompute;


 // CUDA APIs currently don't compile with clang, which is what clReflect uses
 // However, it's very similar to GCC so define some needed macros from GCC
 #ifdef __clcpp_parse__

 	#define __noinline__		__attribute__((noinline))	        
 	#define __forceinline__		__inline__ __attribute__((always_inline))
 	#define __align__(n)		__attribute__((aligned(n)))
 	#define __thread__			__thread
 	#define __import__
 	#define __export__
 	#define __cdecl
 	#define __annotate__(a)		__attribute__((a))
 	#define __location__(a)		__annotate__(a)
 	#define CUDARTAPI

 #endif


 struct IDXGIAdapter;
 struct ID3D11Device;
 struct ID3D11Resource;


 #include <cuda.h>
 #include <cudad3d11.h>
 #include <cuda_runtime.h>


 // Forwards the result of a CUDA driver call to cuda::HandleError together with the stringified
 // expression and the file/line of the call site
 #define cudaHasError(result) cuda::HandleError((result), #result, __FILE__, __LINE__)


 clcpp_reflect_part(cuda)
 namespace cuda
 {
 	struct Program;


 	// Returns true if the CUDA function call returned an error, logging the error string and where the
 	// function was called. Usually reached through the cudaHasError macro, which supplies the
 	// expression text, file and line.
 	bool HandleError(CUresult result, const char* expression, const char* file, int line);

 	// Search all attached devices for the one with the highest performance
 	int GetMaxGflopsDeviceId();


 	// Ensure the current thread has the given context active
 	void EnsureContext(CUcontext context);


 	//
 	// Compute queues implemented as CUDA streams
 	//
 	class clcpp_attr(reflect_part) Queue : public cmp::Queue
 	{
 	public:
 		// Takes the CUDA context the queue is created for
 		Queue(CUcontext context);
 		virtual ~Queue();

 		// Interface implementations
 		virtual bool RunKernel(const cmp::HndKernel* h_kernel, const cmp::KernelArgs& args, u32 nb_items, int block_size);
 		virtual bool CopyHostToDevice(const void* src_data, u32 src_size, cmp::DeviceMem* h_dev_mem);
 		virtual bool CopyDeviceToHost(cmp::DeviceMem* h_dev_mem, void* dst_data, u32 dst_size);
 		virtual bool CopyHostToTexture3D(const void* src_data, u32 src_pitch, u32 src_height, u32 src_depth, cmp::Texture3D* h_texture);
 		virtual bool CopyTexture3DToHost(cmp::Texture3D* h_texture, void* dst_data, u32 dst_pitch, u32 dst_height, u32 dst_depth);
 		virtual bool CopyDeviceToGfxTexture3D(cmp::DeviceMem* src_device, cmp::GfxResource* dst_gfxres, u32 pitch, u32 height, u32 depth);
 		virtual bool CopyDeviceToGfxBuffer(cmp::DeviceMem* src_device, cmp::GfxResource* dst_gfxres, u32 size);
 		virtual bool Sync();
 		virtual bool MarkEvent(cmp::Event* event);
 		virtual bool WaitEvent(cmp::Event* event);
 		virtual void* Resource();

 		// The underlying CUDA stream; CUDACompute::Queue_New treats a null stream as a failed queue
 		CUstream Stream();

 	private:
 		// Context passed to the constructor
 		CUcontext m_Context;

 		// Stream backing this queue
 		CUstream m_Stream;
 	};


 	//
 	// Description of one texture/surface parameter of a kernel. The name and type info are read from
 	// the texture reflection file; the references are looked up in the loaded module afterwards.
 	//
 	struct clcpp_attr(reflect_part) KernelTextureParam
 	{
 		// Starts with zeroed type info and null references
 		KernelTextureParam()
 			: ref_type(0)
 			, dimensions(0)
 			, read_type(0)
 			, tex_ref(nullptr)
 			, surf_ref(nullptr)
 		{
 		}

 		// Name of global reference this parameter maps to
 		core::String256 global_name;

 		// Type info
 		// ref_type is 't' for a texture reference and 's' for a surface reference
 		char ref_type;
 		u32 dimensions;
 		char read_type;

 		// Texture or surface reference, dependent upon ref_type
 		CUtexref tex_ref;
 		CUsurfref surf_ref;
 	};


 	//
 	// Kernels are function handles within a module with no resource to manage
 	//
 	struct clcpp_attr(reflect_part) Kernel : public cmp::HndKernel
 	{
 		Kernel();

 		// Parent program
 		Program* program;

 		// Keep name around for reloads
 		core::String32 name;

 		// Handle to function
 		// Re-fetched from the new module when the parent program is reloaded
 		CUfunction function;

 		// Texture/surface parameters of this kernel, stored by the texture reflection loader
 		core::Vector<KernelTextureParam> texture_params;
 	};


 	//
 	// Programs are CUDA modules, controlling a list of Kernel objects
 	//
 	struct clcpp_attr(reflect_part) Program : public cmp::HndProgram
 	{
 		Program();
 		~Program();

 		// Loaded module; replaced with a freshly loaded one when the source file changes
 		CUmodule module;

 		// Filename hash for reloads
 		u32 id;

 		// Pointer to the program loader for removal on destruction
 		CUDACompute* loader;

 		// Allocated kernels owned by this program
 		core::Vector<Kernel*> kernels;

 		// Mutex for reloading programs
 		mpp::Mutex mutex;
 	};


 	//
 	// CUDA implementation of device memory
 	//
 	struct clcpp_attr(reflect_part) DeviceMemory : public cmp::DeviceMem
 	{
 		// NOTE(review): 'size' is presumably in bytes - confirm against the implementation
 		DeviceMemory(u32 size);
 		virtual ~DeviceMemory();
 		
 		// Device pointer; CUDACompute::DeviceMem_New treats a null value as a failed allocation
 		CUdeviceptr ptr;
 	};


 	//
 	// Page-locked host memory that is accessible to the device, useful in small parts for staging areas
 	//
 	struct clcpp_attr(reflect_part) HostMemory : public cmp::HostMem
 	{
 		// CUDACompute::HostMem_New treats a null Desc().mapped_mem after construction as a failed allocation
 		HostMemory(u32 size);
 		virtual ~HostMemory();

 		// Interface implementations
 		virtual void* MapNow(cmp::Queue* queue);
 		virtual void* Map(cmp::Queue* queue);
 		virtual void Unmap(cmp::Queue* queue);
 	};


 	//
 	// CUDA implementation of a 3D texture
 	//
 	struct clcpp_attr(reflect_part) Texture3D : public cmp::Texture3D
 	{
 		Texture3D(u32 width, u32 height, u32 depth, const Format& format, cmp::Access access);
 		virtual ~Texture3D();

 		// Allocated device memory
 		// CUDACompute::Texture3D_New treats a null array as a failed allocation
 		CUarray array;

 		// View flags
 		CUaddress_mode address_mode;
 		CUfilter_mode filter_mode;
 		u32 flags;
 	};


 	//
 	// A graphics API resource made available to CUDA
 	//
 	struct clcpp_attr(reflect_part) GfxResource : public cmp::GfxResource
 	{
 		GfxResource(void* resource);
 		~GfxResource();

 		// The source D3D resource registered for use with CUDA
 		void* resource;

 		// The CUDA interop object
 		// CUDACompute::GfxResource_New treats a null value as a failed registration
 		CUgraphicsResource graphics_resource;
 	};


 	//
 	// CUDA implementation of a compute event
 	//
 	struct clcpp_attr(reflect_part) Event : public cmp::Event
 	{
 		Event();
 		~Event();

 		// Underlying CUDA event; CUDACompute::Event_New treats a null value as a failed creation
 		CUevent event;
 	};


 	// Called by CUDACompute once its context has been created
 	// NOTE(review): the definition is not visible here - confirm what it sets up
 	void Init();

 	// Load a module from the given PTX file, appending the build log to 'response'.
 	// Returns nullptr if the module fails to load.
 	CUmodule LoadPTXModule(const file::Path& filename, core::String256& response);

 	// Load the PTX file as a module and wrap it in a new Program, or return nullptr on failure
 	Program* NewProgram(const file::Path& filename);

 	// Create a Kernel for the named function in the program's module, or return nullptr on failure
 	Kernel* NewKernel(Program* program, const core::String32& name);
 }
diff --git a/CUDACompute.cpp b/CUDACompute.cpp
 @@ -1,401 +0,0 @@

 // TODO: Is it allowed for the same context to be pushed on multiple thread contexts at the same time?

 #include "CUDACompute.h"

 #include <Core/Math.h>
 #include <Core/File.h>


 clcpp_impl_class(CUDACompute)


 // TODO: Got to move these
 // One idea would be to get a new tool to generate a C++ file with all export functions
 #include <Core/CoreReflection.h>
 clcpp_impl_class(core::VectorReadIterator)
 clcpp_impl_class(core::VectorWriteIterator)
 clcpp_impl_class(core::String32)
 clcpp_impl_class(core::String64)
 clcpp_impl_class(core::String256)


 namespace
 {
 	cuda::Kernel* GetKernel(cuda::Program* program, const core::String256& kernel_name)
 	{
 		// Step through the program's kernels in order and stop at the first name match
 		u32 index = 0;
 		while (index < program->kernels.size())
 		{
 			cuda::Kernel* candidate = program->kernels[index];
 			if (candidate->name == kernel_name)
 				return candidate;
 			index++;
 		}

 		// No kernel with that name
 		return nullptr;
 	}


 	bool ReadString(file::File& fp, core::String256& string, u32 length)
 	{
 		// Size the string first so the characters can be read straight into its buffer
 		// NOTE(review): 'length' is not checked against the string's capacity here - confirm set_length copes
 		string.set_length(length);

 		// Succeeds only when every requested character was read
 		const bool read_all = (fp.Read(string.data(), length) == length);
 		return read_all;
 	}


 	bool ReadString(file::File& fp, core::String256& string)
 	{
 		// The character count is stored in the file ahead of the characters themselves
 		u32 nb_chars = 0;
 		if (file::Read(fp, nb_chars))
 			return ReadString(fp, string, nb_chars);

 		return false;
 	}


 	//
 	// Loads the texture parameter description file for a program and attaches the described
 	// texture/surface parameters to the matching kernels.
 	//
 	// The file starts with the 23 character ID "CUDAKernelTextureParams", followed by a function count.
 	// Each function is a length-prefixed name and a parameter count, and each parameter is a
 	// length-prefixed global name, a reference type, a dimension count and a read type.
 	//
 	// Returns false if the file can't be opened, the ID doesn't match or any read fails. Functions
 	// with no matching kernel in the program are skipped.
 	//
 	bool LoadTextureReflection(const file::Path& filename, cuda::Program* program)
 	{
 		core::Assert(program != nullptr);

 		// Open for read
 		file::Path full_path = file::MakeGamePath(filename.c_str());
 		file::File fp(full_path.c_str(), "rb");
 		if (!fp.IsOpen())
 			return false;

 		// Ensure the ID matches
 		core::String256 id;
 		if (!ReadString(fp, id, 23))
 			return false;
 		if (id != core::String256("CUDAKernelTextureParams"))
 			return false;

 		CUmodule module = program->module;

 		// Read info for all functions
 		u32 nb_functions = 0;
 		if (!file::Read(fp, nb_functions))
 			return false;
 		for (u32 i = 0; i < nb_functions; i++)
 		{
 			// Read the function name
 			core::String256 function_name;
 			if (!ReadString(fp, function_name))
 				return false;

 			// Allocate enough space for all parameters in this function
 			u32 nb_params = 0;
 			if (!file::Read(fp, nb_params))
 				return false;
 			core::Vector<cuda::KernelTextureParam> texture_params(nb_params);

 			// Read all texture parameter objects
 			// This happens even for functions without a kernel so the file position stays in sync
 			for (u32 j = 0; j < nb_params; j++)
 			{
 				cuda::KernelTextureParam& param = texture_params[j];
 				if (!ReadString(fp, param.global_name))
 					return false;
 				if (!file::Read(fp, param.ref_type))
 					return false;
 				if (!file::Read(fp, param.dimensions))
 					return false;
 				if (!file::Read(fp, param.read_type))
 					return false;
 			}

 			// Attempt to get a matching kernel
 			cuda::Kernel* kernel = GetKernel(program, function_name);
 			if (kernel == nullptr)
 				continue;

 			// Get global texture/surface references for each parameter
 			// A failed lookup is logged and leaves the reference null instead of aborting the load
 			for (u32 j = 0; j < nb_params; j++)
 			{
 				cuda::KernelTextureParam& param = texture_params[j];
 				if (param.ref_type == 't')
 				{
 					if (cudaHasError(cuModuleGetTexRef(&param.tex_ref, module, param.global_name.c_str())))
 						param.tex_ref = nullptr;
 				}
 				else if (param.ref_type == 's')
 				{
 					if (cudaHasError(cuModuleGetSurfRef(&param.surf_ref, module, param.global_name.c_str())))
 						param.surf_ref = nullptr;
 				}
 			}

 			// Store for runtime use in the kernel
 			kernel->texture_params.copy_from(texture_params);
 		}

 		return true;
 	}
 }


 //
 // Initialises the CUDA driver API, picks the device reported by cuda::GetMaxGflopsDeviceId, creates
 // the main context on it and binds the context to Remotery. A failure in any of the checked steps is
 // logged by cudaHasError and abandons the rest of the initialisation.
 //
 CUDACompute::CUDACompute()
 	: m_DeviceID(-1)
 	, m_Device(-1)
 	, m_Context(nullptr)
 {
 	// Initialise the driver API
 	core::LogText("CUDA: Initialising Driver API");
 	if (cudaHasError(cuInit(0)))
 		return;

 	// Report driver version
 	int driver_version;
 	if (cudaHasError(cuDriverGetVersion(&driver_version)))
 		return;
 	core::LogText("CUDA: Driver version %d", driver_version);

 	// Set the device with the highest gflops/s
 	m_DeviceID = cuda::GetMaxGflopsDeviceId();
 	if (cudaHasError(cuDeviceGet(&m_Device, m_DeviceID)))
 		return;

 	// Report what device is in use
 	// The buffer starts with a placeholder so a failed name query never logs uninitialised memory
 	char name[100] = "<unknown>";
 	cudaHasError(cuDeviceGetName(name, (int)sizeof(name), m_Device));
 	core::LogText("CUDA: Using device [%d]: %s", m_DeviceID, name);

 	// Create the main context and pop it off the stack to allow other CUDA contexts elsewhere
 	core::LogText("CUDA: Creating context");
 	if (cudaHasError(cuCtxCreate(&m_Context, CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST, m_Device)))
 		return;

 	// Pop into a local so that a failed pop is logged and can't overwrite the stored context
 	CUcontext popped_context = nullptr;
 	cudaHasError(cuCtxPopCurrent(&popped_context));

 	cuda::Init();

 	// Bind to remotery
 	rmtCUDABind bind;
 	bind.context = m_Context;
 	bind.CtxSetCurrent = &cuCtxSetCurrent;
 	bind.CtxGetCurrent = &cuCtxGetCurrent;
 	bind.EventCreate = &cuEventCreate;
 	bind.EventDestroy = &cuEventDestroy;
 	bind.EventRecord = &cuEventRecord;
 	bind.EventQuery = &cuEventQuery;
 	bind.EventElapsedTime = &cuEventElapsedTime;
 	rmt_BindCUDA(&bind);
 }


 CUDACompute::~CUDACompute()
 {
 	// Without a context there is nothing to destroy
 	if (m_Context == nullptr)
 		return;

 	cuCtxDestroy(m_Context);
 }


 //
 // Reloads every loaded program whose ID matches one of the changed files. For each match the PTX
 // module is reloaded, the kernel function handles are fetched again from the new module and the
 // texture reflection (.ckt) is reloaded. Progress and build output are appended to 'response'.
 //
 // Returns true if at least one program was completely reloaded.
 //
 bool CUDACompute::FilesChanged(const core::Vector<file::Path>& filenames, core::String256& response)
 {
 	cuda::EnsureContext(m_Context);

 	bool changed = false;
 	for (u32 i = 0; i < filenames.size(); i++)
 	{
 		u32 program_id = core::MakeNameID(filenames[i].c_str());

 		// Search for a matching program ID - note that there may be many programs matching the same ID
 		for (u32 j = 0; j < m_Programs.size(); j++)
 		{
 			cuda::Program* program = m_Programs[j];
 			if (program->id != program_id)
 				continue;

 			// Ensure reloads don't happen while a program/kernel is in use
 			mpp::MutexLock lock(program->mutex);

 			response += core::String256("   Compiling ");
 			response += filenames[i];
 			response += core::String256("\n");

 			// Reload/compile the new program and don't do anything if it fails
 			CUmodule module = cuda::LoadPTXModule(filenames[i], response);
 			if (module == nullptr)
 				break;

 			response += core::String256("   Success - reloading\n");

 			// Release the old program
 			if (program->module != nullptr)
 				cuModuleUnload(program->module);

 			// Set the new program and retrieve an all new set of kernel functions
 			program->module = module;
 			for (size_t k = 0; k < program->kernels.size(); k++)
 			{
 				cuda::Kernel* kernel = program->kernels[k];

 				// The kernel list may hold null entries for kernels that could not be created
 				if (kernel == nullptr)
 					continue;

 				CUfunction function;
 				if (cudaHasError(cuModuleGetFunction(&function, program->module, kernel->name.c_str())))
 					break;
 				kernel->function = function;
 			}

 			// Reload texture reflection
 			file::Path pathless_filename, extension;
 			file::SplitPathExt(filenames[i], pathless_filename, extension);
 			file::Path ckt_filename = pathless_filename + core::String256(".ckt");
 			if (!LoadTextureReflection(ckt_filename, program))
 			{
 				// The program has to stay alive: 'lock' still holds its mutex and clients still hold
 				// the handle returned by Program_New, so deleting it here would leave both dangling
 				response += core::String256("   Failed to reload texture reflection\n");
 				break;
 			}

 			changed = true;
 		}
 	}

 	return changed;
 }


 //
 // Loads "<filename>.ptx" as a program, creates a kernel for each requested name and loads the
 // texture reflection from "<filename>.ckt".
 //
 // Returns nullptr if the program or its texture reflection fails to load. Kernels that can't be
 // created are left out of the program, so Program_GetKernel returns nullptr for their names.
 //
 const cmp::HndProgram* CUDACompute::Program_New(const file::Path& filename, const core::String32* kernel_names, u32 nb_kernels)
 {
 	cuda::EnsureContext(m_Context);

 	// Load the program
 	file::Path ptx_filename = filename + file::Path(".ptx");
 	cuda::Program* program = cuda::NewProgram(file::NormalisePath(ptx_filename));
 	if (program == nullptr)
 		return nullptr;

 	// Retrieve all kernels
 	for (u32 i = 0; i < nb_kernels; i++)
 	{
 		cuda::Kernel* kernel = cuda::NewKernel(program, kernel_names[i]);

 		// Never store a null kernel: every search of the kernel list dereferences its entries
 		if (kernel == nullptr)
 			continue;

 		program->kernels.push_back(kernel);
 	}

 	// Load texture reflection
 	file::Path ckt_filename = filename + core::String256(".ckt");
 	if (!LoadTextureReflection(ckt_filename, program))
 	{
 		Delete(program);
 		return nullptr;
 	}

 	// Record the program in the loader
 	program->loader = this;
 	m_Programs.push_back(program);

 	return program;
 }


 const cmp::HndKernel* CUDACompute::Program_GetKernel(const cmp::HndProgram* h_program, const core::String32& name)
 {
 	cuda::EnsureContext(m_Context);

 	// Recover the CUDA program behind the handle
 	const cuda::Program* cuda_program = h_program->Cast<cuda::Program>();

 	// Scan its kernels in order, stopping at the first one with a matching name
 	const cmp::HndKernel* match = nullptr;
 	for (u32 i = 0; match == nullptr && i < cuda_program->kernels.size(); i++)
 	{
 		cuda::Kernel* candidate = cuda_program->kernels[i];
 		if (candidate->name == name)
 			match = candidate;
 	}

 	return match;
 }


 cmp::DeviceMem* CUDACompute::DeviceMem_New(u32 size, cmp::Access)
 {
 	cuda::EnsureContext(m_Context);

 	// The access flags are not used for CUDA (only required by OpenCL)
 	cuda::DeviceMemory* allocation = new cuda::DeviceMemory(size);
 	if (allocation->ptr != nullptr)
 		return allocation;

 	// No device pointer was produced, so discard the wrapper object
 	delete allocation;
 	return nullptr;
 }


 cmp::HostMem* CUDACompute::HostMem_New(u32 size, cmp::Access access)
 {
 	cuda::EnsureContext(m_Context);

 	// The access flags are not used for CUDA (only required by OpenCL)
 	cuda::HostMemory* allocation = new cuda::HostMemory(size);
 	if (allocation->Desc().mapped_mem != nullptr)
 		return allocation;

 	// No mapped memory was produced, so discard the wrapper object
 	delete allocation;
 	return nullptr;
 }


 cmp::Texture3D* CUDACompute::Texture3D_New(u32 width, u32 height, u32 depth, const Format& fmt, cmp::Access access)
 {
 	cuda::EnsureContext(m_Context);

 	// Hand back the texture only when its array was created
 	cuda::Texture3D* new_texture = new cuda::Texture3D(width, height, depth, fmt, access);
 	if (new_texture->array != nullptr)
 		return new_texture;

 	delete new_texture;
 	return nullptr;
 }


 cmp::GfxResource* CUDACompute::GfxResource_New(void* resource)
 {
 	cuda::EnsureContext(m_Context);

 	// Hand back the wrapper only when the CUDA interop object was created
 	cuda::GfxResource* wrapper = new cuda::GfxResource(resource);
 	if (wrapper->graphics_resource != nullptr)
 		return wrapper;

 	delete wrapper;
 	return nullptr;
 }


 cmp::Event* CUDACompute::Event_New()
 {
 	cuda::EnsureContext(m_Context);

 	// Hand back the wrapper only when the CUDA event was created
 	cuda::Event* new_event = new cuda::Event();
 	if (new_event->event != nullptr)
 		return new_event;

 	delete new_event;
 	return nullptr;
 }


 cmp::Queue* CUDACompute::Queue_New()
 {
 	cuda::EnsureContext(m_Context);

 	// Hand back the queue only when it has a stream
 	cuda::Queue* new_queue = new cuda::Queue(m_Context);
 	if (new_queue->Stream() != nullptr)
 		return new_queue;

 	delete new_queue;
 	return nullptr;
 }


 void CUDACompute::RemoveProgram(cuda::Program* program)
 {
 	// Advance to the first slot holding this program pointer
 	u32 slot = 0;
 	while (slot < m_Programs.size() && m_Programs[slot] != program)
 		slot++;

 	// Remove that one entry; a program that isn't in the list is ignored
 	if (slot < m_Programs.size())
 		m_Programs.remove_unstable(slot);
 }
diff --git a/CUDACompute.h b/CUDACompute.h
 @@ -1,38 +0,0 @@

 #include "CUDA.h"


 //
 // CUDA implementation of the compute interface. Owns the main CUDA context and keeps track of the
 // programs it has loaded so they can be reloaded when their source files change.
 //
 class clcpp_attr(reflect_part) CUDACompute : public cmp::iCompute
 {
 public:
 	// Initialises the driver API and creates the main context; failures are logged
 	CUDACompute();
 	~CUDACompute();

 	// Subsystem implementations
 	// Reloads loaded programs that match the changed files, appending progress to 'response'
 	bool FilesChanged(const core::Vector<file::Path>& filenames, core::String256& response);

 	// Interface implementations
 	// Each of these returns nullptr on failure
 	const cmp::HndProgram* Program_New(const file::Path& filename, const core::String32* kernel_names, u32 nb_kernels);
 	const cmp::HndKernel* Program_GetKernel(const cmp::HndProgram* program, const core::String32& name);
 	cmp::DeviceMem* DeviceMem_New(u32 size, cmp::Access access);
 	cmp::HostMem* HostMem_New(u32 size, cmp::Access access);
 	cmp::Texture3D* Texture3D_New(u32 width, u32 height, u32 depth, const Format& fmt, cmp::Access access);
 	cmp::GfxResource* GfxResource_New(void* resource);
 	cmp::Event* Event_New();
 	cmp::Queue* Queue_New();

 	// Removes the program from the list of loaded programs, if present
 	void RemoveProgram(cuda::Program* program);

 private:
 	// NOTE(review): no definition is visible in CUDACompute.cpp - confirm this is still used
 	void DequeueRaiseEvent();

 	// Currently selected device
 	int m_DeviceID;
 	CUdevice m_Device;

 	// Main context
 	CUcontext m_Context;

 	// List of loaded programs
 	core::Vector<cuda::Program*> m_Programs;
 };
 No newline at end of file
diff --git a/CUDAPlatform.py b/CUDAPlatform.py

 import os
 import Utils
 import Process
 import BuildSystem


 # Installation directories come from the environment; each is None when its variable is not set
 InstallDir = os.environ.get("CUDA_PATH")
 SampleDir = os.environ.get("NVCUDASAMPLES_ROOT")


 # Paths below the installation directory, left as None when there is no installation
 IncludeDir = x86LibDir = x64LibDir = BinDir = None
 if InstallDir:
    IncludeDir = os.path.join(InstallDir, "include")
    x86LibDir = os.path.join(InstallDir, "lib/Win32")
    x64LibDir = os.path.join(InstallDir, "lib/x64")
    BinDir = os.path.join(InstallDir, "bin")


 # Paths below the samples directory, left as None when the samples are not installed
 SampleCommonIncludeDir = None
 if SampleDir:
    SampleCommonIncludeDir = os.path.join(SampleDir, "common/inc")


 #
 # Names of nVidia GPU Virtual Architectures for generating up to the PTX stage
 # CUDACompileOptions passes one of these to nvcc as --gpu-architecture
 #
 VirtualArch = Utils.enum(
    compute_10 = 'compute_10',
    compute_11 = 'compute_11',
    compute_12 = 'compute_12',
    compute_13 = 'compute_13',
    compute_20 = 'compute_20',
    compute_30 = 'compute_30',
    compute_32 = 'compute_32',
    compute_35 = 'compute_35',
    compute_50 = 'compute_50',
 )

 #
 # Names of nVidia GPU Real Architectures for generating final binary images
 # CUDACompileOptions passes one of these to nvcc as --gpu-code
 #
 RealArch = Utils.enum(
    sm_10 = 'sm_10',
    sm_11 = 'sm_11',
    sm_12 = 'sm_12',
    sm_13 = 'sm_13',
    sm_20 = 'sm_20',
    sm_21 = 'sm_21',
    sm_30 = 'sm_30',
    sm_32 = 'sm_32',
    sm_35 = 'sm_35',
    sm_50 = 'sm_50',
 )


 #
 # Collects the nvcc options for one build configuration. Set the attributes as required and then
 # call UpdateCommandLine to generate the CommandLine list that BuildPTXNode passes to nvcc.
 #
 class CUDACompileOptions:

    def __init__(self):

        # Set to 'c', 'c++' or 'cu' to explicitly set input language, rather than using extension
        self.Language = None

        # List of normal/system include search paths
        self.IncludePaths = [ ]
        self.SystemIncludePaths = [ ]

        # List of files to include first during preprocessing
        self.IncludeFiles = [ ]

        # List of macros to define/undefine for preprocessor
        self.DefineMacros = [ ]
        self.UndefineMacros = [ ]

        # List of library search paths
        self.LibraryPaths = [ ]

        # List of libraries to link with (specified without the library extension)
        self.Libraries = [ ]

        # Specify 32/64 bit machine target
        self.MachineBits = 32

        # Specify the path in which the compiler host EXE resides (e.g. MSVC, GCC)
        self.HostCompilerPath = None

        # Set to 'none', 'shared' or 'static' to specify runtime library type - default is 'static'
        self.CUDARuntime = None

        # Generate debug information for host/device code
        self.HostDebugLevel = None
        self.DeviceDebug = False

        # GPU architecture and GPUs to generate code for
        self.GPUArch = VirtualArch.compute_10
        self.GPUCode = RealArch.sm_10

        # Math operation behaviour
        self.FlushSingleDenormalsToZero = False
        self.PreciseSingleDivRecip = True
        self.PreciseSingleSqrt = True
        self.FuseMultipleAdds = True
        self.UseFastMath = False

        # Tool options
        self.DisableWarnings = False
        self.SourceInPTX = False
        self.RestrictPointers = False

    def UpdateCommandLine(self):

        # Rebuilds self.CommandLine from the current attribute values
        # Call this again after changing any attribute
        cmdline = [ ]

        if self.Language: cmdline += [ '--x=' + self.Language ]

        cmdline += [ '--include-path=' + path for path in self.IncludePaths ]
        cmdline += [ '--system-include=' + path for path in self.SystemIncludePaths ]
        cmdline += [ '--pre-include=' + header for header in self.IncludeFiles ]
        cmdline += [ '--define-macro=' + macro for macro in self.DefineMacros ]
        cmdline += [ '--undefine-macro=' + macro for macro in self.UndefineMacros ]

        # The '=' is required here: without it the library name is fused into the option name
        cmdline += [ '--library-path=' + path for path in self.LibraryPaths ]
        cmdline += [ '--library=' + lib for lib in self.Libraries ]

        cmdline += [ '--machine=' + str(self.MachineBits) ]

        if self.HostCompilerPath: cmdline += [ '--compiler-bindir=' + self.HostCompilerPath ]
        if self.CUDARuntime: cmdline += [ '--cudart=' + self.CUDARuntime ]

        # A debug level of 0 is a valid setting, so only None means "not specified"
        if self.HostDebugLevel is not None: cmdline += [ '--debug=' + str(self.HostDebugLevel) ]
        if self.DeviceDebug: cmdline += [ '--device-debug' ]

        cmdline += [ '--gpu-architecture=' + self.GPUArch ]
        cmdline += [ '--gpu-code=' + self.GPUCode ]

        cmdline += [ '--ftz=' + ('true' if self.FlushSingleDenormalsToZero else 'false') ]
        cmdline += [ '--prec-div=' + ('true' if self.PreciseSingleDivRecip else 'false') ]
        cmdline += [ '--prec-sqrt=' + ('true' if self.PreciseSingleSqrt else 'false') ]
        cmdline += [ '--fmad=' + ('true' if self.FuseMultipleAdds else 'false') ]
        if self.UseFastMath: cmdline += [ '--use_fast_math' ]

        if self.DisableWarnings: cmdline += [ '--disable-warnings' ]
        if self.SourceInPTX: cmdline += [ '--source-in-ptx' ]
        if self.RestrictPointers: cmdline += [ '--restrict' ]

        self.CommandLine = cmdline


 #
 # Build node that runs nvcc over the first output file of its source node to produce a .ptx file
 # in the current configuration's output path
 #
 class BuildPTXNode (BuildSystem.Node):

    def __init__(self, source):

        super().__init__()
        self.Dependencies = [ source ]
        self.Source = source

    def Build(self, env):

        # nvcc invocation that stops at the PTX stage, using the current configuration's options
        nvcc_args = [ os.path.join(BinDir, "nvcc.exe"), '--ptx' ]
        nvcc_args.extend(env.CurrentConfig.CUDACompileOptions.CommandLine)

        # The output .ptx file goes first and the input file last
        ptx_file = self.GetOutputFiles(env)[0]
        nvcc_args.append('--output-file=' + ptx_file)
        nvcc_args.append(self.GetInputFile(env))
        Utils.ShowCmdLine(env, nvcc_args)

        # Run the compiler to completion, echoing its output unless that is suppressed
        nvcc = Process.OpenPiped(nvcc_args)
        tool_output = Process.WaitForPipeOutput(nvcc)
        if not env.NoToolOutput:
            print(tool_output)

        # A zero exit code means the build succeeded
        return nvcc.returncode == 0

    def GetInputFile(self, env):

        # The input is whatever the source node lists as its first output
        source_outputs = self.Source.GetOutputFiles(env)
        return source_outputs[0]

    def GetOutputFiles(self, env):

        # Reduce the input to its filename without path or extension
        # TODO: This only works if this node has another node as input that resides in
        # the same directory as it. Need to evaluate relative path inputs in long chains.
        source_path = self.GetInputFile(env)
        base_name = os.path.splitext(os.path.basename(source_path))[0]

        return [ os.path.join(env.CurrentConfig.OutputPath, base_name + ".ptx") ]

    def GetTempOutputFiles(self, env):

        # The temporary outputs are the same files as the final outputs
        return self.GetOutputFiles(env)
diff --git a/Kernels.pibfile b/Kernels.pibfile

 # Build script for the compute kernels: every .cu file is run through ComputeBridge for CUDA and
 # OpenCL, the CUDA output is compiled to PTX and the results are copied to the kernel data directory
 SetOutputPaths(env, "Kernels")
 debug_config = env.Configs["debug"]
 release_config = env.Configs["release"]

 # Need to add this as an include directory for the generated code to reference
 current_dir = os.getcwd()

 # Location of kernel data to be loaded at runtime
 kernel_data_dir = project_dir + "pub/GameData/Kernels"

 # Gather input files
 input_files = Utils.Glob(".", "*.cu")
 input_file_nodes = [ env.NewFile(cu_file) for cu_file in input_files ]

 kernel_include_paths = [
 	current_dir,
 	project_dir + "src/CppClient",
 	project_dir + "extern/ComputeBridge/cbpp/inc",
 ]

 # ComputeBridge uses same options for debug/release
 cb_options = ComputeBridgePlatform.Options()
 cb_options.IncludePaths = kernel_include_paths
 cb_options_map = { "debug": cb_options, "release": cb_options }

 # Run ComputeBridge for both CUDA and OpenCL
 cuda_cb_files = [ ComputeBridgePlatform.BuildNode(cu_file, "cuda", cb_options_map) for cu_file in input_file_nodes ]
 opencl_cb_files = [ ComputeBridgePlatform.BuildNode(cu_file, "opencl", cb_options_map) for cu_file in input_file_nodes ]

 # Setup CUDA compile options (release shares the debug options object)
 debug_config.CUDACompileOptions = CUDAPlatform.CUDACompileOptions()
 debug_config.CUDACompileOptions.Language = 'cu'
 debug_config.CUDACompileOptions.GPUArch = CUDAPlatform.VirtualArch.compute_20
 debug_config.CUDACompileOptions.GPUCode = CUDAPlatform.RealArch.sm_20
 debug_config.CUDACompileOptions.HostCompilerPath = os.path.join(MSVCPlatform.VCInstallDir, "bin")
 debug_config.CUDACompileOptions.UpdateCommandLine()
 release_config.CUDACompileOptions = debug_config.CUDACompileOptions

 # Build CUDA PTX files
 cuda_ptx_files = [ CUDAPlatform.BuildPTXNode(cb_file) for cb_file in cuda_cb_files ]

 # Setup OpenCL compile options (it appears the OpenCL compiler already includes cwd)
 debug_config.OpenCLCompileOptions = OpenCLPlatform.OpenCLCompileOptions()
 debug_config.OpenCLCompileOptions.UpdateCommandLine()
 release_config.OpenCLCompileOptions = debug_config.OpenCLCompileOptions

 # OpenCL files are compiled on load so just run the precompiler
 opencl_out_files = [ OpenCLPlatform.BuildOpenCLNode(cb_file) for cb_file in opencl_cb_files ]

 # Copy CUDA output files for load and OpenCL ComputeBridge output for runtime compile
 # The second output of each CUDA ComputeBridge node is copied as well
 # NOTE(review): presumably that is the .ckt texture reflection file loaded at runtime - confirm
 copied_files = [env.CopyOutputFile(ptx_file, 0, kernel_data_dir) for ptx_file in cuda_ptx_files ]
 copied_files += [ env.CopyOutputFile(ckt_file, 1, kernel_data_dir) for ckt_file in cuda_cb_files ]
 copied_files += [ env.CopyOutputFile(cl_file, 0, kernel_data_dir) for cl_file in opencl_cb_files ]

 env.Build(cuda_ptx_files + opencl_out_files + copied_files, "Kernels")

 # Raw string so the backslashes in the Windows path are not treated as escape sequences
 VCGenerateProjectFile(env, "Kernels", input_files + [ "Kernels.pibfile" ], None, targets="Kernels", pibfile = r"..\..\..\pibfile")

	//
	// TODO: Expose Desc() as public member?
	// After that's done, why does the Desc struct exist at all?
	// Use const/const_cast trick to make the members read-only?
	//

	#pragma once


	#include <Core/Core.h>
	#include <Core/PixelFormat.h>


	struct Format;
	namespace mpp
	{
	class Event;
	}


	clcpp_reflect_part(cmp)
	namespace cmp
	{
	// ------------------------------------------------------------------------------------------------------- //
	// Handles //
	// ------------------------------------------------------------------------------------------------------- //


	// Empty base type for program handles; carries no data of its own
	struct clcpp_attr(reflect) HndProgram : public clutl::Object
	{
	};
	// Empty base type for kernel handles; carries no data of its own
	struct clcpp_attr(reflect) HndKernel : public clutl::Object
	{
	};



	// ------------------------------------------------------------------------------------------------------- //
	// Kernel Argument API //
	// ------------------------------------------------------------------------------------------------------- //


	class KernelArgs;


	//
	// Describes a single queued kernel argument and gives typed access to its data
	//
	struct clcpp_attr(reflect_part) KernelArg
	{
		KernelArg();

		// Return a copy of the argument
		// Data() points at the stored value, so it is read through a TYPE pointer
		template <typename TYPE>
		TYPE Get() const
		{
			return *static_cast<TYPE*>(Data());
		}

		// Set the argument, copying the value
		// The value is written over the data that Data() points at
		template <typename TYPE>
		void Set(TYPE value)
		{
			*static_cast<TYPE*>(Data()) = value;
		}

		// Pointer to the argument's data
		void* Data() const;

		// Reflected data type
		const clcpp::Type* data_type;

		// Size of the data, not including sizeof(KernelArg)
		u32 data_size;

		// Argument index
		u32 index;

		// Byte offset to the data
		u32 offset;

		// Pointer to the argument list that owns this argument
		KernelArgs* args_parent;
	};


	//
	// This class allows queueing of kernel arguments so that they can be applied to a kernel at the point of
	// execution. This allows the Compute API to guarantee safe access to a kernel without it being reloaded
	// during use. The type helpers are just an added convenience and not the point of this class.
	//
	class clcpp_attr(reflect) KernelArgs
	{
	public:
	KernelArgs();

	// Queue a copy of the value along with its reflected type and size
	template <typename TYPE>
	void Push(TYPE value)
	{
	Push(&value, clcpp::GetType< CORE_STRIP_CONST_POINTER(TYPE) >(), sizeof(value));
	}

	private:
	friend struct KernelArg;
	friend class KernelArgIterator;

	// Untyped push used by the template helper above
	void Push(void* data, const clcpp::Type* type, u32 data_size);

	// Store data in the container memory space for cheap instantiation
	static const u32 MAX_DATA_SIZE = 1024;
	u8 m_Data[MAX_DATA_SIZE];
	// NOTE(review): presumably the current write offset into m_Data - confirm against Push
	u32 m_Position;

	// NOTE(review): presumably the number of arguments pushed so far - confirm against Push
	u32 m_NbArguments;
	};


	//
	// Steps through the arguments stored in a KernelArgs object. Holds a reference to the argument
	// list, so the list must outlive the iterator.
	//
	class KernelArgIterator
	{
	public:
	KernelArgIterator(const KernelArgs& args);

	// NOTE(review): presumably fills 'arg' with the next argument and returns false at the end - confirm
	bool GetNext(KernelArg& arg);

	private:
	const KernelArgs& m_KernelArgs;
	u32 m_Position;
	};



	// ------------------------------------------------------------------------------------------------------- //
	// Interfaces //
	// ------------------------------------------------------------------------------------------------------- //



	//
	// Specifies read/write access to device memory for kernels
	// The CUDA backend's DeviceMem_New and HostMem_New ignore this; their comments note only OpenCL requires it
	//
	enum clcpp_attr(reflect) Access
	{
	Access_Read,
	Access_Write,
	Access_ReadWrite,
	};


	class HostMem;
	class DeviceMem;
	class Texture3D;
	struct GfxResource;
	struct Event;



	// ------------------------------------------------------------------------------------------------------- //
	// Concurrent Work Queue Interface //
	// ------------------------------------------------------------------------------------------------------- //


	// NOTE(review): presumably an upper bound for the block_size passed to Queue::RunKernel; a value
	// of 1 looks suspicious for a maximum - confirm against the original header
	static const int MAX_KERNEL_BLOCK_SIZE = 1;


	//
	// Interface for a queue of compute work: kernel launches, memory copies and event operations.
	// Backends implement the pure virtuals.
	//
	// ARC-NOTE: No clcpp_impl_class as there are pure virtuals (and we don't WANT it to be creatable)
	struct clcpp_attr(reflect) Queue : public clutl::Object2
	{
	virtual ~Queue() { }

	// Run a kernel with the given queued arguments
	virtual bool RunKernel(const HndKernel* h_kernel, const KernelArgs& args, u32 nb_items, int block_size) = 0;

	// Copies between host memory and device memory/textures
	virtual bool CopyHostToDevice(const void* src_data, u32 src_size, DeviceMem* device_mem) = 0;
	virtual bool CopyDeviceToHost(DeviceMem* device_mem, void* dst_data, u32 dst_size) = 0;
	virtual bool CopyHostToTexture3D(const void* src_data, u32 src_pitch, u32 src_height, u32 src_depth, Texture3D* h_texture) = 0;
	virtual bool CopyTexture3DToHost(Texture3D* h_texture, void* dst_data, u32 dst_pitch, u32 dst_height, u32 dst_depth) = 0;

	// Copies from device memory into graphics API resources
	virtual bool CopyDeviceToGfxTexture3D(DeviceMem* src_device, GfxResource* dst_gfxres, u32 pitch, u32 height, u32 depth) = 0;
	virtual bool CopyDeviceToGfxBuffer(DeviceMem* src_device, GfxResource* dst_gfxres, u32 size) = 0;

	virtual bool Sync() = 0;

	virtual bool MarkEvent(Event* event) = 0;
	virtual bool WaitEvent(Event* event) = 0;

	// NOTE(review): presumably the backend's native queue object - confirm with the implementations
	virtual void* Resource() = 0;

	// Helpers to use descriptions embedded in memory objects to simplify the call
	bool CopyMappedHostToDevice(HostMem* host_mem, DeviceMem* device_mem);
	bool CopyDeviceToMappedHost(DeviceMem* device_mem, HostMem* host_mem);
	bool CopyMappedHostToTexture3D(HostMem* host_mem, Texture3D* texture);
	bool CopyTexture3DToMappedHost(Texture3D* texture, HostMem* host_mem);
	};



	// ------------------------------------------------------------------------------------------------------- //
	// Host Memory Interface //
	// ------------------------------------------------------------------------------------------------------- //



	// Description of a host memory allocation
	struct clcpp_attr(reflect) HostMemDesc
	{
	HostMemDesc();
	// NOTE(review): presumably the allocation size in bytes - confirm
	u32 size;
	// Pointer to the mapped memory; null is treated as a failed allocation by the CUDA backend
	void* mapped_mem;
	};


	//
	// Pinned/page-locked host memory allocated to be the source or destination of transfers to or
	// from the device.
	//
	class clcpp_attr(reflect) HostMem : public clutl::Object2
	{
	public:
	virtual ~HostMem() { }

	// Mapping interface implemented by each backend
	virtual void* MapNow(Queue* queue) = 0;
	virtual void* Map(Queue* queue) = 0;
	virtual void Unmap(Queue* queue) = 0;

	// Read-only access to the description
	const HostMemDesc& Desc() const { return m_Desc; }

	protected:
	HostMemDesc m_Desc;
	};



	// ------------------------------------------------------------------------------------------------------- //
	// Device Memory Interface //
	// ------------------------------------------------------------------------------------------------------- //



	// Description of a device memory object; returned read-only by DeviceMem::Desc().
	struct clcpp_attr(reflect) DeviceMemDesc
	{
	DeviceMemDesc();
	// Size of the allocation (presumably in bytes - TODO confirm)
	u32 size;
	};


	//
	// A chunk of memory allocated on the device that can't be directly accessed by the host without
	// copying.
	//
	class clcpp_attr(reflect) DeviceMem : public clutl::Object2
	{
	public:
	virtual ~DeviceMem() { }

	// Read-only access to the description filled in by the implementation
	const DeviceMemDesc& Desc() const { return m_Desc; }

	protected:
	DeviceMemDesc m_Desc;
	};



	// ------------------------------------------------------------------------------------------------------- //
	// Texture Interface //
	// ------------------------------------------------------------------------------------------------------- //


	// TODO: There is a need for some kind of "sampler object" to describe filtering, etc.


	// Description of a 3D texture; returned read-only by Texture3D::Desc().
	struct clcpp_attr(reflect) TextureDesc
	{
	TextureDesc();
	// Texture dimensions (presumably in texels - TODO confirm)
	u32 width;
	u32 height;
	u32 depth;
	// NOTE(review): presumably the byte size of one row and of the whole texture - confirm
	u32 pitch;
	u32 size;
	// Pixel format of the texture
	Format format;
	};


	//
	// A 3D texture allocated on the device to be sampled in kernels.
	//
	class clcpp_attr(reflect) Texture3D : public clutl::Object2
	{
	public:
	virtual ~Texture3D() { }

	// Read-only access to the description filled in by the implementation
	const TextureDesc& Desc() const { return m_Desc; }

	protected:
	TextureDesc m_Desc;
	};


	// Base type for graphics API resources made available to compute; created through
	// iCompute::GfxResource_New and used as the destination of the CopyDeviceToGfx* queue calls.
	struct clcpp_attr(reflect) GfxResource : public clutl::Object2
	{
	virtual ~GfxResource() { }
	};



	// ------------------------------------------------------------------------------------------------------- //
	// Event Interface //
	// ------------------------------------------------------------------------------------------------------- //



	// Base type for events; created through iCompute::Event_New and passed to
	// Queue::MarkEvent / Queue::WaitEvent.
	struct clcpp_attr(reflect_part) Event : public clutl::Object2
	{
	virtual ~Event() { };
	};




	// ------------------------------------------------------------------------------------------------------- //
	// Main Compute Module Interface //
	// ------------------------------------------------------------------------------------------------------- //



	// Factory interface for all compute objects, implemented once per backend (e.g. CUDACompute).
	// NOTE(review): the CUDA implementation returns nullptr from each *_New function when
	// creation fails; other backends presumably follow the same convention - confirm.
	struct clcpp_attr(reflect_part) iCompute : public core::iSubsystem
	{
	// Load a program from the given file, creating a kernel for each of the nb_kernels names
	virtual const HndProgram* Program_New(const file::Path& filename, const core::String32* kernel_names, u32 nb_kernels) = 0;
	// Look up a previously created kernel in the program by name
	virtual const HndKernel* Program_GetKernel(const HndProgram* program, const core::String32& name) = 0;

	virtual DeviceMem* DeviceMem_New(u32 size, Access access) = 0;

	virtual HostMem* HostMem_New(u32 size, Access access) = 0;

	// If read/write is specified then CUDA creates a surface reference
	// Texture reflection can tell whether a surface needs to be bound or not
	virtual Texture3D* Texture3D_New(u32 width, u32 height, u32 depth, const Format& fmt, Access access) = 0;

	// Wrap an existing graphics API resource, passed as an opaque pointer
	virtual GfxResource* GfxResource_New(void* resource) = 0;

	virtual Event* Event_New() = 0;

	virtual Queue* Queue_New() = 0;
	};
	}
	@@ -1,228 +0,0 @@

	#pragma once


	#include <Core/Compute.h>
	#include <Core/JobSystem.h>
	#include <Core/PixelFormat.h>


	class CUDACompute;


	// CUDA APIs currently don't compile with clang, which is what clReflect uses
	// However, it's very similar to GCC so define some needed macros from GCC
	// NOTE(review): __clcpp_parse__ is presumably only defined while the clReflect scanner
	// parses this header, leaving normal compiler builds unaffected - confirm.
	#ifdef __clcpp_parse__

	#define __noinline__ __attribute__((noinline))
	#define __forceinline__ __inline__ __attribute__((always_inline))
	#define __align__(n) __attribute__((aligned(n)))
	#define __thread__ __thread
	// These expand to nothing during the reflection parse
	#define __import__
	#define __export__
	#define __cdecl
	#define __annotate__(a) __attribute__((a))
	#define __location__(a) __annotate__(a)
	#define CUDARTAPI

	#endif


	struct IDXGIAdapter;
	struct ID3D11Device;
	struct ID3D11Resource;


	#include <cuda.h>
	#include <cudad3d11.h>
	#include <cuda_runtime.h>


	// Wraps a driver API call: forwards its result to cuda::HandleError along with the stringised
	// expression and the call site. Callers treat a true result as failure, e.g.
	// "if (cudaHasError(cuInit(0))) return;"
	#define cudaHasError(result) cuda::HandleError((result), #result, __FILE__, __LINE__)


	clcpp_reflect_part(cuda)
	namespace cuda
	{
	struct Program;


	// Returns whether a CUDA function call returned an error (callers treat true as failure),
	// logging the error string and where the function was called
	// Normally invoked through the cudaHasError macro, which supplies expression/file/line
	bool HandleError(CUresult result, const char* expression, const char* file, int line);

	// Search all attached devices for the one with the highest performance
	// The result is passed to cuDeviceGet as the device ordinal
	int GetMaxGflopsDeviceId();


	// Ensure the current thread has the given context active
	void EnsureContext(CUcontext context);


	//
	// Compute queues implemented as CUDA streams
	//
	class clcpp_attr(reflect_part) Queue : public cmp::Queue
	{
	public:
	// Takes the context the queue belongs to
	Queue(CUcontext context);
	virtual ~Queue();

	// Interface implementations
	virtual bool RunKernel(const cmp::HndKernel* h_kernel, const cmp::KernelArgs& args, u32 nb_items, int block_size);
	virtual bool CopyHostToDevice(const void* src_data, u32 src_size, cmp::DeviceMem* h_dev_mem);
	virtual bool CopyDeviceToHost(cmp::DeviceMem* h_dev_mem, void* dst_data, u32 dst_size);
	virtual bool CopyHostToTexture3D(const void* src_data, u32 src_pitch, u32 src_height, u32 src_depth, cmp::Texture3D* h_texture);
	virtual bool CopyTexture3DToHost(cmp::Texture3D* h_texture, void* dst_data, u32 dst_pitch, u32 dst_height, u32 dst_depth);
	virtual bool CopyDeviceToGfxTexture3D(cmp::DeviceMem* src_device, cmp::GfxResource* dst_gfxres, u32 pitch, u32 height, u32 depth);
	virtual bool CopyDeviceToGfxBuffer(cmp::DeviceMem* src_device, cmp::GfxResource* dst_gfxres, u32 size);
	virtual bool Sync();
	virtual bool MarkEvent(cmp::Event* event);
	virtual bool WaitEvent(cmp::Event* event);
	virtual void* Resource();

	// Underlying stream handle; CUDACompute::Queue_New treats a null stream as a failed creation
	CUstream Stream();

	private:
	CUcontext m_Context;
	CUstream m_Stream;
	};


	// One texture/surface parameter of a kernel, as read from the kernel texture reflection file
	struct clcpp_attr(reflect_part) KernelTextureParam
	{
	// Zero all type info and start with null references
	KernelTextureParam()
	: ref_type(0)
	, dimensions(0)
	, read_type(0)
	, tex_ref(nullptr)
	, surf_ref(nullptr)
	{
	}

	// Name of global reference this parameter maps to
	core::String256 global_name;

	// Type info
	// ref_type is 't' for a texture reference and 's' for a surface reference
	// NOTE(review): the encoding of read_type is not visible here - confirm against the file writer
	char ref_type;
	u32 dimensions;
	char read_type;

	// Texture or surface reference, dependent upon ref_type
	CUtexref tex_ref;
	CUsurfref surf_ref;
	};


	//
	// Kernels are function handles within a module with no resource to manage
	//
	struct clcpp_attr(reflect_part) Kernel : public cmp::HndKernel
	{
	Kernel();

	// Parent program
	Program* program;

	// Keep name around for reloads
	core::String32 name;

	// Handle to function
	// Replaced with a freshly looked-up handle when the parent program's module is reloaded
	CUfunction function;

	// Texture/surface parameters copied in from the texture reflection file
	core::Vector<KernelTextureParam> texture_params;
	};


	//
	// Programs are CUDA modules, controlling a list of Kernel objects
	//
	struct clcpp_attr(reflect_part) Program : public cmp::HndProgram
	{
	Program();
	~Program();

	// Loaded module; swapped for a new one when the source file is reloaded
	CUmodule module;

	// Filename hash for reloads
	u32 id;

	// Pointer to the program loader for removal on destruction
	CUDACompute* loader;

	// Allocated kernels owned by this program
	core::Vector<Kernel*> kernels;

	// Mutex for reloading programs
	// Held by CUDACompute::FilesChanged for the duration of a reload
	mpp::Mutex mutex;
	};


	// CUDA implementation of a device memory allocation
	struct clcpp_attr(reflect_part) DeviceMemory : public cmp::DeviceMem
	{
	DeviceMemory(u32 size);
	virtual ~DeviceMemory();

	// Device address of the allocation; CUDACompute::DeviceMem_New treats a null value as failure
	CUdeviceptr ptr;
	};


	//
	// Page-locked host memory that is accessible to the device, useful in small parts for staging areas
	//
	struct clcpp_attr(reflect_part) HostMemory : public cmp::HostMem
	{
	// CUDACompute::HostMem_New checks Desc().mapped_mem after construction to detect failure
	HostMemory(u32 size);
	virtual ~HostMemory();

	// Interface implementations
	virtual void* MapNow(cmp::Queue* queue);
	virtual void* Map(cmp::Queue* queue);
	virtual void Unmap(cmp::Queue* queue);
	};


	// CUDA implementation of a 3D texture
	struct clcpp_attr(reflect_part) Texture3D : public cmp::Texture3D
	{
	Texture3D(u32 width, u32 height, u32 depth, const Format& format, cmp::Access access);
	virtual ~Texture3D();

	// Allocated device memory
	// CUDACompute::Texture3D_New treats a null array as a failed allocation
	CUarray array;

	// View flags
	CUaddress_mode address_mode;
	CUfilter_mode filter_mode;
	u32 flags;
	};


	// CUDA view of a graphics API resource
	struct clcpp_attr(reflect_part) GfxResource : public cmp::GfxResource
	{
	GfxResource(void* resource);
	~GfxResource();

	// The source D3D resource registered for use with CUDA
	void* resource;

	// The CUDA interop object
	// CUDACompute::GfxResource_New treats a null value as a failed registration
	CUgraphicsResource graphics_resource;
	};


	// CUDA implementation of an event
	struct clcpp_attr(reflect_part) Event : public cmp::Event
	{
	Event();
	~Event();

	// Driver event handle; CUDACompute::Event_New treats a null value as a failed creation
	CUevent event;
	};


	// Called once by the CUDACompute constructor after the main context has been created
	void Init();

	// Load a module from the given file, appending to the response string
	// Callers treat a null module as a failed load
	CUmodule LoadPTXModule(const file::Path& filename, core::String256& response);

	// Create a program from the given file; callers treat nullptr as failure
	Program* NewProgram(const file::Path& filename);

	// Create the named kernel for the given program
	// NOTE(review): whether this can return nullptr is not visible here - confirm
	Kernel* NewKernel(Program* program, const core::String32& name);
	}
	@@ -1,401 +0,0 @@

	// TODO: Is it allowed for the same context to be pushed on multiple thread contexts at the same time?

	#include "CUDACompute.h"

	#include <Core/Math.h>
	#include <Core/File.h>


	// NOTE(review): clcpp_impl_class presumably emits the clReflect implementation glue for
	// the named type in this translation unit - confirm against the clReflect headers.
	clcpp_impl_class(CUDACompute)


	// TODO: Got to move these
	// One idea would be to get a new tool to generate a C++ file with all export functions
	#include <Core/CoreReflection.h>
	clcpp_impl_class(core::VectorReadIterator)
	clcpp_impl_class(core::VectorWriteIterator)
	clcpp_impl_class(core::String32)
	clcpp_impl_class(core::String64)
	clcpp_impl_class(core::String256)


	namespace
	{
	// Find the first kernel in the program whose name equals kernel_name.
	// Returns nullptr when no kernel matches.
	cuda::Kernel* GetKernel(cuda::Program* program, const core::String256& kernel_name)
	{
		cuda::Kernel* match = nullptr;

		// Walk the kernel list in order, stopping as soon as a match has been recorded
		for (u32 index = 0; match == nullptr && index < program->kernels.size(); ++index)
		{
			cuda::Kernel* candidate = program->kernels[index];
			if (candidate->name == kernel_name)
				match = candidate;
		}

		return match;
	}


	// Read exactly `length` characters from the file into the string.
	// Returns true only when the full count was read.
	// NOTE(review): `length` is not validated here against the string's capacity - confirm
	// that set_length copes with values larger than the string can hold.
	bool ReadString(file::File& fp, core::String256& string, u32 length)
	{
		// Size the destination first so the read can go straight into the string's storage
		string.set_length(length);

		if (fp.Read(string.data(), length) == length)
			return true;
		return false;
	}


	// Read a string stored as a u32 length followed by that many characters.
	// Returns false if either the length or the characters could not be read.
	bool ReadString(file::File& fp, core::String256& string)
	{
		u32 nb_chars = 0;
		if (file::Read(fp, nb_chars))
			return ReadString(fp, string, nb_chars);

		// Couldn't even read the length prefix
		return false;
	}


	//
	// Load the kernel texture reflection file and attach its texture/surface parameters to the
	// matching kernels of the program.
	//
	// File layout, as consumed below:
	//    23 characters   ID, must equal "CUDAKernelTextureParams"
	//    u32             number of functions
	//    per function:   length-prefixed name, u32 parameter count
	//    per parameter:  length-prefixed global name, ref_type, dimensions, read_type
	//
	// Functions with no matching kernel in the program are read and skipped.
	// Returns false if the file can't be opened, the ID doesn't match or any read fails. Kernels
	// processed before a failing read keep the parameters already stored on them.
	//
	bool LoadTextureReflection(const file::Path& filename, cuda::Program* program)
	{
		core::Assert(program != nullptr);

		// Open for read
		file::Path full_path = file::MakeGamePath(filename.c_str());
		file::File fp(full_path.c_str(), "rb");
		if (!fp.IsOpen())
			return false;

		// Ensure the ID matches
		core::String256 id;
		if (!ReadString(fp, id, 23))
			return false;
		if (id != core::String256("CUDAKernelTextureParams"))
			return false;

		CUmodule module = program->module;

		// Read info for all functions
		u32 nb_functions = 0;
		if (!file::Read(fp, nb_functions))
			return false;
		for (u32 i = 0; i < nb_functions; i++)
		{
			// Read the function name
			core::String256 function_name;
			if (!ReadString(fp, function_name))
				return false;

			// Allocate enough space for all parameters in this function
			u32 nb_params = 0;
			if (!file::Read(fp, nb_params))
				return false;
			core::Vector<cuda::KernelTextureParam> texture_params(nb_params);

			// Read all texture parameter objects
			// This happens before the kernel lookup so the file position stays correct for
			// functions that end up being skipped
			for (u32 j = 0; j < nb_params; j++)
			{
				cuda::KernelTextureParam& param = texture_params[j];
				if (!ReadString(fp, param.global_name))
					return false;
				if (!file::Read(fp, param.ref_type))
					return false;
				if (!file::Read(fp, param.dimensions))
					return false;
				if (!file::Read(fp, param.read_type))
					return false;
			}

			// Attempt to get a matching kernel
			cuda::Kernel* kernel = GetKernel(program, function_name);
			if (kernel == nullptr)
				continue;

			// Get global texture/surface references for each parameter
			// A failed lookup is logged through cudaHasError and leaves the reference null; it
			// does not fail the load
			for (u32 j = 0; j < nb_params; j++)
			{
				cuda::KernelTextureParam& param = texture_params[j];
				if (param.ref_type == 't')
				{
					if (cudaHasError(cuModuleGetTexRef(&param.tex_ref, module, param.global_name.c_str())))
						param.tex_ref = nullptr;
				}
				if (param.ref_type == 's')
				{
					if (cudaHasError(cuModuleGetSurfRef(&param.surf_ref, module, param.global_name.c_str())))
						param.surf_ref = nullptr;
				}
			}

			// Store for runtime use in the kernel
			kernel->texture_params.copy_from(texture_params);
		}

		return true;
	}
	}


	//
	// Initialise the driver API, pick a device, create the main context and bind it to Remotery.
	// Any failing step that is checked with cudaHasError is logged and aborts the remaining
	// initialisation by returning early from the constructor.
	//
	CUDACompute::CUDACompute()
		: m_DeviceID(-1)
		, m_Device(-1)
		, m_Context(nullptr)
	{
		// Initialise the driver API
		core::LogText("CUDA: Initialising Driver API");
		if (cudaHasError(cuInit(0)))
			return;

		// Report driver version
		int driver_version = 0;
		if (cudaHasError(cuDriverGetVersion(&driver_version)))
			return;
		core::LogText("CUDA: Driver version %d", driver_version);

		// Set the device with the highest gflops/s
		m_DeviceID = cuda::GetMaxGflopsDeviceId();
		if (cudaHasError(cuDeviceGet(&m_Device, m_DeviceID)))
			return;

		// Report what device is in use
		// The buffer starts empty so that a failed name query logs an empty name rather than
		// uninitialised memory; the failure itself is logged but is not fatal
		char name[100] = { 0 };
		if (cudaHasError(cuDeviceGetName(name, 100, m_Device)))
			name[0] = 0;
		core::LogText("CUDA: Using device [%d]: %s", m_DeviceID, name);

		// Create the main context and pop it off the stack to allow other CUDA contexts elsewhere
		core::LogText("CUDA: Creating context");
		if (cudaHasError(cuCtxCreate(&m_Context, CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST, m_Device)))
			return;
		cuCtxPopCurrent(&m_Context);

		cuda::Init();

		// Bind to remotery
		rmtCUDABind bind;
		bind.context = m_Context;
		bind.CtxSetCurrent = &cuCtxSetCurrent;
		bind.CtxGetCurrent = &cuCtxGetCurrent;
		bind.EventCreate = &cuEventCreate;
		bind.EventDestroy = &cuEventDestroy;
		bind.EventRecord = &cuEventRecord;
		bind.EventQuery = &cuEventQuery;
		bind.EventElapsedTime = &cuEventElapsedTime;
		rmt_BindCUDA(&bind);
	}


	// Destroy the main context, if one is held.
	CUDACompute::~CUDACompute()
	{
		// Nothing to release when no context is held
		if (m_Context == nullptr)
			return;

		cuCtxDestroy(m_Context);
	}


	//
	// Reload every program whose ID matches one of the changed files.
	//
	// For each match the file is loaded as a new module; on success the old module is unloaded,
	// the kernels' function handles are looked up again and the texture reflection is reloaded.
	// Progress text is appended to `response`.
	//
	// Returns true if at least one program was fully reloaded.
	//
	bool CUDACompute::FilesChanged(const core::Vector<file::Path>& filenames, core::String256& response)
	{
		cuda::EnsureContext(m_Context);

		bool changed = false;
		for (u32 i = 0; i < filenames.size(); i++)
		{
			u32 program_id = core::MakeNameID(filenames[i].c_str());

			// Search for a matching program ID - note that there may be many programs matching the same ID
			for (u32 j = 0; j < m_Programs.size(); j++)
			{
				cuda::Program* program = m_Programs[j];
				if (program->id != program_id)
					continue;

				bool module_loaded = false;
				bool reflection_loaded = false;

				// The lock lives in its own scope so that it is released before the program can
				// be deleted below; the mutex is a member of the program being deleted
				{
					// Ensure reloads don't happen while a program/kernel is in use
					mpp::MutexLock lock(program->mutex);

					response += core::String256(" Compiling ");
					response += filenames[i];
					response += core::String256("\n");

					// Reload/compile the new program and don't do anything if it fails
					CUmodule module = cuda::LoadPTXModule(filenames[i], response);
					if (module != nullptr)
					{
						module_loaded = true;
						response += core::String256(" Success - reloading\n");

						// Release the old program
						if (program->module != nullptr)
							cuModuleUnload(program->module);

						// Set the new program and retrieve an all new set of kernel functions
						// A failed lookup stops the refresh of the remaining kernels but not the reload
						program->module = module;
						for (size_t k = 0; k < program->kernels.size(); k++)
						{
							cuda::Kernel* kernel = program->kernels[k];

							CUfunction function;
							if (cudaHasError(cuModuleGetFunction(&function, program->module, kernel->name.c_str())))
								break;
							kernel->function = function;
						}

						// Reload texture reflection
						file::Path pathless_filename, extension;
						file::SplitPathExt(filenames[i], pathless_filename, extension);
						file::Path ckt_filename = pathless_filename + core::String256(".ckt");
						reflection_loaded = LoadTextureReflection(ckt_filename, program);
					}
				}

				// A failed load leaves the program untouched and stops the search for this file
				if (!module_loaded)
					break;

				// NOTE(review): deleting the program presumably also removes it from m_Programs
				// through its loader pointer, so the search is abandoned straight afterwards
				if (!reflection_loaded)
				{
					Delete(program);
					break;
				}

				changed = true;
			}
		}

		return changed;
	}


	//
	// Create a program from "<filename>.ptx", create a kernel for each requested name and load
	// the texture reflection from "<filename>.ckt".
	// Returns nullptr if the program can't be created or the reflection can't be loaded.
	// NOTE(review): kernels are stored without checking the result of cuda::NewKernel - confirm
	// that it can't return nullptr.
	//
	const cmp::HndProgram* CUDACompute::Program_New(const file::Path& filename, const core::String32* kernel_names, u32 nb_kernels)
	{
		cuda::EnsureContext(m_Context);

		// The module is loaded from the normalised .ptx path
		file::Path ptx_filename = filename + file::Path(".ptx");
		cuda::Program* new_program = cuda::NewProgram(file::NormalisePath(ptx_filename));
		if (new_program == nullptr)
			return nullptr;

		// One kernel object per requested name, kept in request order
		u32 kernel_index = 0;
		while (kernel_index < nb_kernels)
		{
			new_program->kernels.push_back(cuda::NewKernel(new_program, kernel_names[kernel_index]));
			++kernel_index;
		}

		// The program is only registered with the loader once its reflection has loaded
		file::Path ckt_filename = filename + core::String256(".ckt");
		if (LoadTextureReflection(ckt_filename, new_program))
		{
			new_program->loader = this;
			m_Programs.push_back(new_program);
			return new_program;
		}

		Delete(new_program);
		return nullptr;
	}


	// Return the first kernel in the program with the given name, or nullptr if there is none.
	const cmp::HndKernel* CUDACompute::Program_GetKernel(const cmp::HndProgram* h_program, const core::String32& name)
	{
		cuda::EnsureContext(m_Context);

		const cuda::Program* cuda_program = h_program->Cast<cuda::Program>();

		// Walk the kernel list in order until a name matches or the list is exhausted
		const cmp::HndKernel* match = nullptr;
		for (u32 index = 0; match == nullptr && index < cuda_program->kernels.size(); ++index)
		{
			cuda::Kernel* candidate = cuda_program->kernels[index];
			if (candidate->name == name)
				match = candidate;
		}

		return match;
	}


	// Allocate device memory of the given size.
	// Returns nullptr if the allocation left the device address unset.
	cmp::DeviceMem* CUDACompute::DeviceMem_New(u32 size, cmp::Access)
	{
		cuda::EnsureContext(m_Context);

		// Allocate device memory, ignoring flags (only required by OpenCL)
		cuda::DeviceMemory* device_mem = new cuda::DeviceMemory(size);

		// CUdeviceptr is an integer handle rather than a pointer, so test it against zero
		if (device_mem->ptr == 0)
		{
			delete device_mem;
			return nullptr;
		}

		return device_mem;
	}


	// Allocate host memory of the given size.
	// Returns nullptr if the new object has no mapped pointer.
	cmp::HostMem* CUDACompute::HostMem_New(u32 size, cmp::Access access)
	{
		cuda::EnsureContext(m_Context);

		// The access flags are not used here (only required by OpenCL)
		cuda::HostMemory* allocation = new cuda::HostMemory(size);
		if (allocation->Desc().mapped_mem != nullptr)
			return allocation;

		// No mapped pointer means the object is unusable
		delete allocation;
		return nullptr;
	}


	// Create a 3D texture with the given dimensions, format and access.
	// Returns nullptr if the new object has no array.
	cmp::Texture3D* CUDACompute::Texture3D_New(u32 width, u32 height, u32 depth, const Format& fmt, cmp::Access access)
	{
		cuda::EnsureContext(m_Context);

		cuda::Texture3D* new_texture = new cuda::Texture3D(width, height, depth, fmt, access);
		if (new_texture->array != nullptr)
			return new_texture;

		// No array means the object is unusable
		delete new_texture;
		return nullptr;
	}


	// Wrap the given graphics resource for use with CUDA.
	// Returns nullptr if the new object has no interop handle.
	cmp::GfxResource* CUDACompute::GfxResource_New(void* resource)
	{
		cuda::EnsureContext(m_Context);

		cuda::GfxResource* wrapped = new cuda::GfxResource(resource);
		if (wrapped->graphics_resource != nullptr)
			return wrapped;

		// No interop handle means the object is unusable
		delete wrapped;
		return nullptr;
	}


	// Create a new event.
	// Returns nullptr if the new object has no event handle.
	cmp::Event* CUDACompute::Event_New()
	{
		cuda::EnsureContext(m_Context);

		cuda::Event* new_event = new cuda::Event();
		if (new_event->event != nullptr)
			return new_event;

		// No event handle means the object is unusable
		delete new_event;
		return nullptr;
	}


	// Create a new queue on the main context.
	// Returns nullptr if the new queue has no stream.
	cmp::Queue* CUDACompute::Queue_New()
	{
		cuda::EnsureContext(m_Context);

		cuda::Queue* new_queue = new cuda::Queue(m_Context);
		if (new_queue->Stream() != nullptr)
			return new_queue;

		// No stream means the queue is unusable
		delete new_queue;
		return nullptr;
	}


	// Remove the first occurrence of the program from the loaded program list.
	// Does nothing if the program isn't in the list. Removal doesn't preserve list order.
	void CUDACompute::RemoveProgram(cuda::Program* program)
	{
		// Advance to the matching entry, or to the end of the list
		u32 index = 0;
		while (index < m_Programs.size() && m_Programs[index] != program)
			++index;

		if (index < m_Programs.size())
			m_Programs.remove_unstable(index);
	}
	@@ -1,38 +0,0 @@

	#include "CUDA.h"


	// CUDA implementation of the compute interface: owns the main context and the list of
	// loaded programs, and creates all CUDA-backed compute objects.
	class clcpp_attr(reflect_part) CUDACompute : public cmp::iCompute
	{
	public:
	CUDACompute();
	~CUDACompute();

	// Subsystem implementations
	// Reloads any loaded program matching one of the changed files; returns true if any was reloaded
	bool FilesChanged(const core::Vector<file::Path>& filenames, core::String256& response);

	// Interface implementations
	// Each *_New function returns nullptr when the object could not be created
	const cmp::HndProgram* Program_New(const file::Path& filename, const core::String32* kernel_names, u32 nb_kernels);
	const cmp::HndKernel* Program_GetKernel(const cmp::HndProgram* program, const core::String32& name);
	cmp::DeviceMem* DeviceMem_New(u32 size, cmp::Access access);
	cmp::HostMem* HostMem_New(u32 size, cmp::Access access);
	cmp::Texture3D* Texture3D_New(u32 width, u32 height, u32 depth, const Format& fmt, cmp::Access access);
	cmp::GfxResource* GfxResource_New(void* resource);
	cmp::Event* Event_New();
	cmp::Queue* Queue_New();

	// Remove the program from the list of loaded programs
	void RemoveProgram(cuda::Program* program);

	private:
	// NOTE(review): no definition is visible alongside the other member functions - confirm it is still used
	void DequeueRaiseEvent();

	// Currently selected device
	int m_DeviceID;
	CUdevice m_Device;

	// Main context
	CUcontext m_Context;

	// List of loaded programs
	core::Vector<cuda::Program*> m_Programs;
	};
	No newline at end of file

	import os
	import Utils
	import Process
	import BuildSystem


	# Installation directories come from the environment; a missing variable gives None
	InstallDir = os.environ.get("CUDA_PATH")
	SampleDir = os.environ.get("NVCUDASAMPLES_ROOT")


	# Paths below the installation directory, left as None when there is no usable install path
	IncludeDir = None
	x86LibDir = None
	x64LibDir = None
	BinDir = None
	if InstallDir:
		IncludeDir = os.path.join(InstallDir, "include")
		x86LibDir = os.path.join(InstallDir, "lib/Win32")
		x64LibDir = os.path.join(InstallDir, "lib/x64")
		BinDir = os.path.join(InstallDir, "bin")


	# Paths below the samples directory, left as None when there is no usable samples path
	SampleCommonIncludeDir = None
	if SampleDir:
		SampleCommonIncludeDir = os.path.join(SampleDir, "common/inc")


	#
	# Names of nVidia GPU Virtual Architectures for generating up to the PTX stage
	# Each member's value is the name string, used for the --gpu-architecture option
	#
	VirtualArch = Utils.enum(
	compute_10 = 'compute_10',
	compute_11 = 'compute_11',
	compute_12 = 'compute_12',
	compute_13 = 'compute_13',
	compute_20 = 'compute_20',
	compute_30 = 'compute_30',
	compute_32 = 'compute_32',
	compute_35 = 'compute_35',
	compute_50 = 'compute_50',
	)

	#
	# Names of nVidia GPU Real Architectures for generating final binary images
	# Each member's value is the name string, used for the --gpu-code option
	#
	RealArch = Utils.enum(
	sm_10 = 'sm_10',
	sm_11 = 'sm_11',
	sm_12 = 'sm_12',
	sm_13 = 'sm_13',
	sm_20 = 'sm_20',
	sm_21 = 'sm_21',
	sm_30 = 'sm_30',
	sm_32 = 'sm_32',
	sm_35 = 'sm_35',
	sm_50 = 'sm_50',
	)


	class CUDACompileOptions:

		"""
		Collection of nvcc options for one build configuration.

		Set the attributes as required, then call UpdateCommandLine to build the
		CommandLine attribute that BuildPTXNode appends to its nvcc invocation.
		"""

		def __init__(self):

			# Set to 'c', 'c++' or 'cu' to explicitly set input language, rather than using extension
			self.Language = None

			# List of normal/system include search paths
			self.IncludePaths = [ ]
			self.SystemIncludePaths = [ ]

			# List of files to include first during preprocessing
			self.IncludeFiles = [ ]

			# List of macros to define/undefine for preprocessor
			self.DefineMacros = [ ]
			self.UndefineMacros = [ ]

			# List of library search paths
			self.LibraryPaths = [ ]

			# List of libraries to link with (specified without the library extension)
			self.Libraries = [ ]

			# Specify 32/64 bit machine target
			self.MachineBits = 32

			# Specify the path in which the compiler host EXE resides (e.g. MSVC, GCC)
			self.HostCompilerPath = None

			# Set to 'none', 'shared' or 'static' to specify runtime library type - default is 'static'
			self.CUDARuntime = None

			# Generate debug information for host/device code
			self.HostDebugLevel = None
			self.DeviceDebug = False

			# GPU architecture and GPUs to generate code for
			self.GPUArch = VirtualArch.compute_10
			self.GPUCode = RealArch.sm_10

			# Math operation behaviour
			self.FlushSingleDenormalsToZero = False
			self.PreciseSingleDivRecip = True
			self.PreciseSingleSqrt = True
			self.FuseMultipleAdds = True
			self.UseFastMath = False

			# Tool options
			self.DisableWarnings = False
			self.SourceInPTX = False
			self.RestrictPointers = False

		def UpdateCommandLine(self):

			"""
			Rebuild self.CommandLine, a list of nvcc arguments, from the current
			attribute values. Must be called again after changing any attribute.
			"""

			cmdline = [ ]

			if self.Language: cmdline += [ '--x=' + self.Language ]

			cmdline += [ '--include-path=' + path for path in self.IncludePaths ]
			cmdline += [ '--system-include=' + path for path in self.SystemIncludePaths ]
			cmdline += [ '--pre-include=' + file for file in self.IncludeFiles ]
			cmdline += [ '--define-macro=' + macro for macro in self.DefineMacros ]
			cmdline += [ '--undefine-macro=' + macro for macro in self.UndefineMacros ]

			# The option name and its value need the '=' separator, as with every other valued option here
			cmdline += [ '--library-path=' + lib for lib in self.LibraryPaths ]
			cmdline += [ '--library=' + lib for lib in self.Libraries ]

			cmdline += [ '--machine=' + str(self.MachineBits) ]

			if self.HostCompilerPath: cmdline += [ '--compiler-bindir=' + self.HostCompilerPath ]
			if self.CUDARuntime: cmdline += [ '--cudart=' + self.CUDARuntime ]

			# A debug level of 0 is a valid setting, so only None means "not specified"
			if self.HostDebugLevel is not None: cmdline += [ '--debug=' + str(self.HostDebugLevel) ]
			if self.DeviceDebug: cmdline += [ '--device-debug' ]

			cmdline += [ '--gpu-architecture=' + self.GPUArch ]
			cmdline += [ '--gpu-code=' + self.GPUCode ]

			cmdline += [ '--ftz=' + ('true' if self.FlushSingleDenormalsToZero else 'false') ]
			cmdline += [ '--prec-div=' + ('true' if self.PreciseSingleDivRecip else 'false') ]
			cmdline += [ '--prec-sqrt=' + ('true' if self.PreciseSingleSqrt else 'false') ]
			cmdline += [ '--fmad=' + ('true' if self.FuseMultipleAdds else 'false') ]
			if self.UseFastMath: cmdline += [ '--use_fast_math' ]

			if self.DisableWarnings: cmdline += [ '--disable-warnings' ]
			if self.SourceInPTX: cmdline += [ '--source-in-ptx' ]
			if self.RestrictPointers: cmdline += [ '--restrict' ]

			self.CommandLine = cmdline


	class BuildPTXNode (BuildSystem.Node):

		"""
		Build node that runs nvcc on the first output file of its source node to
		produce a .ptx file in the current configuration's output path.
		"""

		def __init__(self, source):

			super().__init__()
			self.Source = source
			self.Dependencies = [ source ]

		def Build(self, env):

			"""
			Run nvcc with the current configuration's CUDACompileOptions command-line.
			Returns True if the compiler exits with a return code of zero.
			"""

			# Without an installation path there is no compiler to run; report it as a failed build
			if BinDir is None:
				print("CUDA: CUDA_PATH is not set, unable to locate nvcc.exe")
				return False

			# Build command-line from current configuration
			cmdline = [ os.path.join(BinDir, "nvcc.exe") ]
			cmdline += [ '--ptx' ]
			cmdline += env.CurrentConfig.CUDACompileOptions.CommandLine

			# Add the output .ptx file
			output_files = self.GetOutputFiles(env)
			cmdline += [ '--output-file=' + output_files[0] ]

			# Add input file before finishing
			cmdline += [ self.GetInputFile(env) ]
			Utils.ShowCmdLine(env, cmdline)

			# Launch the compiler and wait for it to finish
			process = Process.OpenPiped(cmdline)
			output = Process.WaitForPipeOutput(process)
			if not env.NoToolOutput:
				print(output)

			return process.returncode == 0

		def GetInputFile(self, env):

			# The input is the first output file of the source node
			return self.Source.GetOutputFiles(env)[0]

		def GetOutputFiles(self, env):

			# Get the filename minus path and extension
			# TODO: This only works if this node has another node as input that resides in
			# the same directory as it. Need to evaluate relative path inputs in long chains.
			input_file = self.GetInputFile(env)
			input_file = os.path.split(input_file)[1]
			input_file = os.path.splitext(input_file)[0]

			ptx_path = os.path.join(env.CurrentConfig.OutputPath, input_file + ".ptx")
			return [ ptx_path ]

		def GetTempOutputFiles(self, env):

			# The only file produced is the .ptx output itself
			return self.GetOutputFiles(env)

	#
	# Build script for the "Kernels" target: runs ComputeBridge over every .cu file for both
	# CUDA and OpenCL, compiles the CUDA output to PTX, runs the OpenCL build node over the
	# OpenCL output, and copies the results to the runtime kernel data directory.
	#
	SetOutputPaths(env, "Kernels")
	debug_config = env.Configs["debug"]
	release_config = env.Configs["release"]

	# Need to add this as an include directory for the generated code to reference
	current_dir = os.getcwd()

	# Location of kernel data to be loaded at runtime
	kernel_data_dir = project_dir + "pub/GameData/Kernels"

	# Gather input files
	input_files = Utils.Glob(".", "*.cu")
	input_file_nodes = [ env.NewFile(cu_file) for cu_file in input_files ]

	kernel_include_paths = [
		current_dir,
		project_dir + "src/CppClient",
		project_dir + "extern/ComputeBridge/cbpp/inc",
	]

	# ComputeBridge uses same options for debug/release
	cb_options = ComputeBridgePlatform.Options()
	cb_options.IncludePaths = kernel_include_paths
	cb_options_map = { "debug": cb_options, "release": cb_options }

	# Run ComputeBridge for both CUDA and OpenCL
	cuda_cb_files = [ ComputeBridgePlatform.BuildNode(cu_file, "cuda", cb_options_map) for cu_file in input_file_nodes ]
	opencl_cb_files = [ ComputeBridgePlatform.BuildNode(cu_file, "opencl", cb_options_map) for cu_file in input_file_nodes ]

	# Setup CUDA compile options
	# Debug and release share the same options object, so a later change to one affects both
	debug_config.CUDACompileOptions = CUDAPlatform.CUDACompileOptions()
	debug_config.CUDACompileOptions.Language = 'cu'
	debug_config.CUDACompileOptions.GPUArch = CUDAPlatform.VirtualArch.compute_20
	debug_config.CUDACompileOptions.GPUCode = CUDAPlatform.RealArch.sm_20
	debug_config.CUDACompileOptions.HostCompilerPath = os.path.join(MSVCPlatform.VCInstallDir, "bin")
	debug_config.CUDACompileOptions.UpdateCommandLine()
	release_config.CUDACompileOptions = debug_config.CUDACompileOptions

	# Build CUDA PTX files
	cuda_ptx_files = [ CUDAPlatform.BuildPTXNode(cb_file) for cb_file in cuda_cb_files ]

	# Setup OpenCL compile options (it appears the OpenCL compiler already includes cwd)
	debug_config.OpenCLCompileOptions = OpenCLPlatform.OpenCLCompileOptions()
	debug_config.OpenCLCompileOptions.UpdateCommandLine()
	release_config.OpenCLCompileOptions = debug_config.OpenCLCompileOptions

	# OpenCL files are compiled on load so just run the precompiler
	opencl_out_files = [ OpenCLPlatform.BuildOpenCLNode(cb_file) for cb_file in opencl_cb_files ]

	# Copy CUDA output files for load and OpenCL ComputeBridge output for runtime compile
	# NOTE(review): the second argument presumably selects which output file of the node to copy,
	# with output 1 of the CUDA ComputeBridge node being the .ckt texture reflection - confirm
	copied_files = [ env.CopyOutputFile(ptx_file, 0, kernel_data_dir) for ptx_file in cuda_ptx_files ]
	copied_files += [ env.CopyOutputFile(ckt_file, 1, kernel_data_dir) for ckt_file in cuda_cb_files ]
	copied_files += [ env.CopyOutputFile(cl_file, 0, kernel_data_dir) for cl_file in opencl_cb_files ]

	env.Build(cuda_ptx_files + opencl_out_files + copied_files, "Kernels")

	# Raw string keeps the backslashes literal without relying on unrecognised escape sequences
	VCGenerateProjectFile(env, "Kernels", input_files + [ "Kernels.pibfile" ], None, targets="Kernels", pibfile = r"..\..\..\pibfile")