Skip to content

Instantly share code, notes, and snippets.

@qpwo
Created August 15, 2025 17:49
Show Gist options
  • Save qpwo/c277616a302fa6b9a792a747abb759ec to your computer and use it in GitHub Desktop.
Save qpwo/c277616a302fa6b9a792a747abb759ec to your computer and use it in GitHub Desktop.
example python cuda single-file / inline kernel
#!/usr/bin/env python3
import torch
from torch.utils.cpp_extension import load_inline
# C++ source handed to load_inline as `cpp_sources`. It only forward-declares
# add_tensors (the name listed in `functions=` of the load_inline call); the
# definition of add_tensors lives in the `cuda_source` string.
cpp_source = '''
#include <torch/extension.h>
torch::Tensor add_tensors(torch::Tensor a, torch::Tensor b);
'''
# CUDA source handed to load_inline as `cuda_sources`: the element-wise add
# kernel plus the host wrapper `add_tensors`, which validates its inputs and
# launches the kernel on PyTorch's current CUDA stream.
cuda_source = '''
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>
#include <cstdint>
#include <limits>

// One thread per element: c[i] = a[i] + b[i] for every i in [0, n).
// The index is 64-bit so very large tensors cannot overflow the arithmetic.
__global__ void add_kernel(const float* a, const float* b, float* c, int64_t n) {
    const int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

// Returns a new contiguous float32 CUDA tensor holding a + b (element-wise).
// Both inputs must be float32 CUDA tensors on the same device with the same
// shape; anything else raises through TORCH_CHECK instead of reading memory
// out of bounds or reinterpreting data of another dtype.
torch::Tensor add_tensors(torch::Tensor a, torch::Tensor b) {
    TORCH_CHECK(a.is_cuda() && b.is_cuda(),
                "add_tensors: both inputs must be CUDA tensors");
    TORCH_CHECK(a.device() == b.device(),
                "add_tensors: inputs must be on the same device");
    TORCH_CHECK(a.scalar_type() == torch::kFloat32 && b.scalar_type() == torch::kFloat32,
                "add_tensors: both inputs must be float32");
    TORCH_CHECK(a.sizes().equals(b.sizes()),
                "add_tensors: inputs must have the same shape");

    // Launch on the device that owns the inputs, not whichever is current.
    const c10::cuda::CUDAGuard device_guard(a.device());

    // The kernel indexes flat memory, so strided inputs must be packed first.
    const auto a_contig = a.contiguous();
    const auto b_contig = b.contiguous();

    // Every element is overwritten by the kernel, so no zero-fill is needed.
    auto c = torch::empty(a_contig.sizes(), a_contig.options());

    const int64_t n = a_contig.numel();
    if (n == 0) {
        // A grid with zero blocks is an invalid launch configuration.
        return c;
    }

    const int threads = 256;
    const int64_t blocks = (n + threads - 1) / threads;
    TORCH_CHECK(blocks <= std::numeric_limits<int>::max(),
                "add_tensors: tensor is too large for a one-dimensional grid");

    // Use PyTorch's current stream so the launch is ordered with other ops.
    add_kernel<<<static_cast<unsigned int>(blocks), threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        a_contig.data_ptr<float>(), b_contig.data_ptr<float>(), c.data_ptr<float>(), n);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return c;
}
'''
# JIT-build the inline extension: `functions` names the C++ entry point to
# expose on the returned module, and the two source strings supply its
# declaration (C++) and definition (CUDA).
module = load_inline(
    name='add_cuda',
    functions=['add_tensors'],
    cpp_sources=cpp_source,
    cuda_sources=cuda_source,
    verbose=True,
)
# Demo: draw two random vectors on the GPU (a first, then b), add them with
# the compiled extension, and compare against PyTorch's own addition.
a, b = (torch.randn(1_000_000, device='cuda') for _ in range(2))
c = module.add_tensors(a, b)

# Show the first few elements of each tensor, then the agreement check.
print(f"a[:5] = {a[:5]}")
print(f"b[:5] = {b[:5]}")
print(f"c[:5] = {c[:5]}")
print(f"torch.allclose(c, a+b) = {torch.allclose(c, a+b)}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment