Skip to content

Instantly share code, notes, and snippets.

@msaroufim
Last active September 18, 2025 22:52
Show Gist options
  • Save msaroufim/9d90fbaea5eb3bcdf98880ca85140759 to your computer and use it in GitHub Desktop.
// Minimal NVRTC CUBIN generation example
// Compile: g++ -std=c++11 minimal_nvrtc_cubin.cpp -lnvrtc -lcuda -lcudart
// String -> cubin via nvrtc -> handle via the cuda driver API
#include <cstdlib>
#include <iostream>
#include <vector>

#include <nvrtc.h>
#include <cuda.h>
const char* kernelSource = R"(
extern "C" __global__ void vectorAdd(float* a, float* b, float* c, int n) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) {
c[tid] = a[tid] + b[tid];
}
}
)";
int main() {
cuInit(0);
CUdevice device;
CUcontext context;
cuDeviceGet(&device, 0);
cuCtxCreate(&context, 0, device);
nvrtcProgram prog;
nvrtcCreateProgram(&prog, kernelSource, "kernel.cu", 0, NULL, NULL);
const char* options[] = {"--gpu-architecture=sm_80"}; // Adjust for your GPU
nvrtcCompileProgram(prog, 1, options);
size_t cubinSize;
nvrtcGetCUBINSize(prog, &cubinSize);
std::vector<char> cubin(cubinSize);
nvrtcGetCUBIN(prog, cubin.data());
std::cout << "CUBIN generated: " << cubinSize << " bytes" << std::endl;
CUmodule module;
CUfunction kernel;
cuModuleLoadDataEx(&module, cubin.data(), 0, 0, 0);
cuModuleGetFunction(&kernel, module, "vectorAdd");
const int N = 1000;
CUdeviceptr d_a, d_b, d_c;
cuMemAlloc(&d_a, N * sizeof(float));
cuMemAlloc(&d_b, N * sizeof(float));
cuMemAlloc(&d_c, N * sizeof(float));
int n = N;
void* args[] = {&d_a, &d_b, &d_c, &n};
cuLaunchKernel(kernel, (N+255)/256, 1, 1, 256, 1, 1, 0, 0, args, 0);
cuCtxSynchronize();
std::cout << "Kernel executed successfully!" << std::endl;
cuMemFree(d_a);
cuMemFree(d_b);
cuMemFree(d_c);
cuModuleUnload(module);
nvrtcDestroyProgram(&prog);
cuCtxDestroy(context);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment