Compile a CUDA kernel at runtime with NVRTC
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <nvrtc.h>
#include <vector>
#include <string>
#include <iostream>
#include <iterator>
#include <cstring>    // std::memcpy
// simple kernel to compile and run
const char *cuda_kernel = " \n\
extern \"C\" __global__ void global_test_function (int *a, int *b) \n\
{ \n\
  *a = 11; \n\
  *b = 12; \n\
  return; \n\
} \n";
// compile a source string to ptx
auto compile_string_to_ptx (const char *source_string) -> std::vector<char>
{
  nvrtcResult res;
  nvrtcProgram prog;

  // nvrtc compile options (string literals, hence const char*)
  std::vector<const char*> options
  {
    "--gpu-architecture=compute_30",
    "--fmad=false"
#if _DEBUG
    ,
    "--device-debug",
    "--generate-line-info"
#endif
  };

  res = nvrtcCreateProgram (&prog, source_string, NULL, 0, NULL, NULL);
  res = nvrtcCompileProgram (prog, (int)options.size (), options.data ());

  // on failure, print the compile log and return an empty buffer
  if (res != NVRTC_SUCCESS)
  {
    size_t log_len;
    std::string log;

    res = nvrtcGetProgramLogSize (prog, &log_len);
    log.resize (log_len + 1);
    res = nvrtcGetProgramLog (prog, &log[0]);

    std::cout << log.c_str () << std::endl;
    return std::vector<char>{};
  }

  // get the compiled ptx
  size_t ptx_len = 0;
  std::vector<char> ptx_src;

  res = nvrtcGetPTXSize (prog, &ptx_len);
  ptx_src.resize (ptx_len);
  res = nvrtcGetPTX (prog, ptx_src.data ());
  res = nvrtcDestroyProgram (&prog);

  return ptx_src;
}
// create a module from a ptx string
auto module_from_ptx (const std::vector<char>& ptx) -> CUmodule
{
  CUresult res;
  CUmodule mod = NULL;
  std::string info_log (1024, '\0');
  std::string error_log (1024, '\0');

  // jit options and their values are passed as two parallel arrays
  std::vector<CUjit_option> options
  {
    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
    CU_JIT_INFO_LOG_BUFFER,
    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
    CU_JIT_ERROR_LOG_BUFFER,
    CU_JIT_LOG_VERBOSE,
    CU_JIT_WALL_TIME
  };

  std::vector<void*> values
  {
    (void *)(size_t)info_log.size (),
    (void *)&info_log[0],
    (void *)(size_t)error_log.size (),
    (void *)&error_log[0],
    (void *)(size_t)1,
    (void *)0
  };

  res = cuModuleLoadDataEx (&mod, ptx.data (), (unsigned int)options.size (), options.data (), values.data ());

  if (res != CUDA_SUCCESS)
  {
    std::cout << error_log.c_str () << std::endl;
    return mod;
  }

  // CU_JIT_WALL_TIME overwrites its option value with a float holding milliseconds
  float wall_time = 0.0f;
  std::memcpy (&wall_time, &values[5], sizeof (float));
  std::cout << "compiled in : " << wall_time / 1000.0f << "s" << std::endl;

  return mod;
}
// locate a pre-defined entry point in the module,
// setup device memory and copy some data in,
// execute the kernel,
// copy the contents of device memory back into host memory.
void execute_function (CUmodule module)
{
  CUresult res;
  CUfunction fn = NULL;

  // get the entry point we want to use for this module
  res = cuModuleGetFunction (&fn, module, "global_test_function");

  // allocate some memory on the device and copy data into it
  int h_A = 2;
  int h_B = 3;
  CUdeviceptr d_A, d_B;

  cuMemAlloc (&d_A, sizeof (int));
  cuMemAlloc (&d_B, sizeof (int));
  cuMemcpyHtoD (d_A, &h_A, sizeof (int));
  cuMemcpyHtoD (d_B, &h_B, sizeof (int));

  // kernel arguments are passed as an array of pointers to the parameters
  void *args[] = { &d_A, &d_B };

  // run the kernel on a single thread
  res = cuLaunchKernel (
    fn,
    1, 1, 1,      // grid dimensions
    1, 1, 1,      // block dimensions
    0,            // shared memory bytes
    NULL,         // stream
    args,
    NULL);

  // wait for the kernel
  res = cuCtxSynchronize ();

  // copy the device variables back into host memory
  cuMemcpyDtoH (&h_A, d_A, sizeof (int));
  cuMemcpyDtoH (&h_B, d_B, sizeof (int));

  // the kernel writes 11 and 12 into the two buffers
  std::cout << "h_A : " << h_A << ", h_B : " << h_B << std::endl;

  cuMemFree (d_A);
  cuMemFree (d_B);
}
int main (int argc, char** argv)
{
  CUdevice device;
  CUcontext context;

  cuInit (0);
  cuDeviceGet (&device, 0);
  cuCtxCreate (&context, 0, device);

  // compile source into ptx representation
  auto ptx = compile_string_to_ptx (cuda_kernel);

  // write the ptx to stdout
  std::copy (
    std::begin (ptx),
    std::end (ptx),
    std::ostream_iterator<char> (std::cout, ""));

  // load a cuda module from the ptx source
  auto cuda_module = module_from_ptx (ptx);

  if (cuda_module == NULL)
  {
    std::cout << "ERR: Could not load module" << std::endl;
  }
  else
  {
    // run the test function contained in the module
    execute_function (cuda_module);
  }

  cuCtxDetach (context);
  return 0;
}
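
The example assigns most CUDA driver and NVRTC return codes to res but never inspects them. Below is a minimal sketch of error-checking wrappers that could be added around those calls; the CHECK_CU and CHECK_NVRTC macro names are illustrative choices and not part of the original file, while cuGetErrorString and nvrtcGetErrorString are the standard driver/NVRTC error-string functions.

// hypothetical error-check helpers (not in the original gist)
#include <cuda.h>
#include <nvrtc.h>
#include <cstdio>
#include <cstdlib>

// abort with a readable message if a driver API call fails
#define CHECK_CU(call)                                          \
  do {                                                          \
    CUresult _r = (call);                                       \
    if (_r != CUDA_SUCCESS) {                                   \
      const char *msg = NULL;                                   \
      cuGetErrorString (_r, &msg);                              \
      std::fprintf (stderr, "%s failed: %s\n", #call, msg);     \
      std::exit (EXIT_FAILURE);                                 \
    }                                                           \
  } while (0)

// abort with a readable message if an nvrtc call fails
#define CHECK_NVRTC(call)                                       \
  do {                                                          \
    nvrtcResult _r = (call);                                    \
    if (_r != NVRTC_SUCCESS) {                                  \
      std::fprintf (stderr, "%s failed: %s\n", #call,           \
                    nvrtcGetErrorString (_r));                  \
      std::exit (EXIT_FAILURE);                                 \
    }                                                           \
  } while (0)

// example usage:
//   CHECK_CU (cuMemAlloc (&d_A, sizeof (int)));
//   CHECK_NVRTC (nvrtcGetPTXSize (prog, &ptx_len));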