Compile a CUDA kernel at runtime with NVRTC
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <nvrtc.h>
#include <vector>
#include <string>
#include <iostream>
#include <iterator>
#include <cstring>    // std::memcpy
// simple kernel to compile and run
const char *cuda_kernel = " \n\
extern \"C\" __global__ void global_test_function (int *a, int *b) \n\
{ \n\
  *a = 11; \n\
  *b = 12; \n\
  return; \n\
} \n";
// compile a source string to ptx
auto compile_string_to_ptx (const char *source_string) -> std::vector<char>
{
  nvrtcResult res;
  nvrtcProgram prog;

  // nvrtc compile options (string literals, hence const char*)
  std::vector<const char*> options
  {
    "--gpu-architecture=compute_30",
    "--fmad=false"
#if _DEBUG
    ,
    "--device-debug",
    "--generate-line-info"
#endif
  };

  res = nvrtcCreateProgram (&prog, source_string, NULL, 0, NULL, NULL);
  res = nvrtcCompileProgram (prog, (int)options.size (), options.data ());

  // on failure, print the compile log and return an empty buffer
  if (res != NVRTC_SUCCESS)
  {
    size_t log_len;
    std::string log;

    res = nvrtcGetProgramLogSize (prog, &log_len);
    log.resize (log_len + 1);
    res = nvrtcGetProgramLog (prog, &log[0]);

    std::cout << log.c_str () << std::endl;
    return std::vector<char>{};
  }

  // get the compiled ptx
  size_t ptx_len = 0;
  std::vector<char> ptx_src;

  res = nvrtcGetPTXSize (prog, &ptx_len);
  ptx_src.resize (ptx_len);
  res = nvrtcGetPTX (prog, ptx_src.data ());
  res = nvrtcDestroyProgram (&prog);

  return ptx_src;
}
// create a module from a ptx string
auto module_from_ptx (const std::vector<char>& ptx) -> CUmodule
{
  CUresult res;
  CUmodule mod = NULL;
  std::string info_log (1024, '\0');
  std::string error_log (1024, '\0');

  // jit options and their values are passed as two parallel arrays
  std::vector<CUjit_option> options
  {
    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
    CU_JIT_INFO_LOG_BUFFER,
    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
    CU_JIT_ERROR_LOG_BUFFER,
    CU_JIT_LOG_VERBOSE,
    CU_JIT_WALL_TIME
  };

  std::vector<void*> values
  {
    (void *)(size_t)info_log.size (),
    (void *)&info_log[0],
    (void *)(size_t)error_log.size (),
    (void *)&error_log[0],
    (void *)(size_t)1,
    (void *)0
  };

  res = cuModuleLoadDataEx (&mod, ptx.data (), (unsigned int)options.size (), options.data (), values.data ());

  if (res != CUDA_SUCCESS)
  {
    std::cout << error_log.c_str () << std::endl;
    return mod;
  }

  // CU_JIT_WALL_TIME overwrites its option value with a float holding milliseconds
  float wall_time = 0.0f;
  std::memcpy (&wall_time, &values[5], sizeof (float));
  std::cout << "compiled in : " << wall_time / 1000.0f << "s" << std::endl;

  return mod;
}
// locate a pre-defined entry point in the module,
// setup device memory and copy some data in,
// execute the kernel,
// copy the contents of device memory back into host memory.
void execute_function (CUmodule module)
{
  CUresult res;
  CUfunction fn = NULL;

  // get the entry point we want to use for this module
  res = cuModuleGetFunction (&fn, module, "global_test_function");

  // allocate some memory on the device and copy data into it
  int h_A = 2;
  int h_B = 3;
  CUdeviceptr d_A, d_B;

  cuMemAlloc (&d_A, sizeof (int));
  cuMemAlloc (&d_B, sizeof (int));
  cuMemcpyHtoD (d_A, &h_A, sizeof (int));
  cuMemcpyHtoD (d_B, &h_B, sizeof (int));

  // kernel arguments are passed as an array of pointers to the parameters
  void *args[] = { &d_A, &d_B };

  // run the kernel on a single thread
  res = cuLaunchKernel (
    fn,
    1, 1, 1,      // grid dimensions
    1, 1, 1,      // block dimensions
    0,            // shared memory bytes
    NULL,         // stream
    args,
    NULL);

  // wait for the kernel
  res = cuCtxSynchronize ();

  // copy the device variables back into host memory
  cuMemcpyDtoH (&h_A, d_A, sizeof (int));
  cuMemcpyDtoH (&h_B, d_B, sizeof (int));

  // the kernel writes 11 and 12 into the two buffers
  std::cout << "h_A : " << h_A << ", h_B : " << h_B << std::endl;

  cuMemFree (d_A);
  cuMemFree (d_B);
}
int main (int argc, char** argv)
{
  CUdevice device;
  CUcontext context;

  cuInit (0);
  cuDeviceGet (&device, 0);
  cuCtxCreate (&context, 0, device);

  // compile source into ptx representation
  auto ptx = compile_string_to_ptx (cuda_kernel);

  // write the ptx to stdout
  std::copy (
    std::begin (ptx),
    std::end (ptx),
    std::ostream_iterator<char> (std::cout, ""));

  // load a cuda module from the ptx source
  auto cuda_module = module_from_ptx (ptx);

  if (cuda_module == NULL)
  {
    std::cout << "ERR: Could not load module" << std::endl;
  }
  else
  {
    // run the test function contained in the module
    execute_function (cuda_module);
  }

  cuCtxDetach (context);
  return 0;
}
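
The example assigns most CUDA driver and NVRTC return codes to res but never inspects them. Below is a minimal sketch of error-checking wrappers that could be added around those calls; the CHECK_CU and CHECK_NVRTC macro names are illustrative choices and not part of the original file, while cuGetErrorString and nvrtcGetErrorString are the standard driver/NVRTC error-string functions.

// hypothetical error-check helpers (not in the original gist)
#include <cuda.h>
#include <nvrtc.h>
#include <cstdio>
#include <cstdlib>

// abort with a readable message if a driver API call fails
#define CHECK_CU(call)                                          \
  do {                                                          \
    CUresult _r = (call);                                       \
    if (_r != CUDA_SUCCESS) {                                   \
      const char *msg = NULL;                                   \
      cuGetErrorString (_r, &msg);                              \
      std::fprintf (stderr, "%s failed: %s\n", #call, msg);     \
      std::exit (EXIT_FAILURE);                                 \
    }                                                           \
  } while (0)

// abort with a readable message if an nvrtc call fails
#define CHECK_NVRTC(call)                                       \
  do {                                                          \
    nvrtcResult _r = (call);                                    \
    if (_r != NVRTC_SUCCESS) {                                  \
      std::fprintf (stderr, "%s failed: %s\n", #call,           \
                    nvrtcGetErrorString (_r));                  \
      std::exit (EXIT_FAILURE);                                 \
    }                                                           \
  } while (0)

// example usage:
//   CHECK_CU (cuMemAlloc (&d_A, sizeof (int)));
//   CHECK_NVRTC (nvrtcGetPTXSize (prog, &ptx_len));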