qpwo · August 15, 2025 17:49
diff --git a/inline-cuda-kernel.py b/inline-cuda-kernel.py
 #!/usr/bin/env python3
 import torch
 from torch.utils.cpp_extension import load_inline

 # C++ source passed to load_inline as cpp_sources. It only forward-declares
 # add_tensors; the definition is in cuda_source. The declaration must keep the
 # exact signature used there.
 # NOTE(review): presumably the bindings generated for functions=['add_tensors']
 # are emitted into this C++ unit, which is why the declaration is needed here —
 # confirm against the load_inline documentation.
 cpp_source = '''
 #include <torch/extension.h>
 torch::Tensor add_tensors(torch::Tensor a, torch::Tensor b);
 '''

 # CUDA source passed to load_inline as cuda_sources: defines the elementwise
 # add kernel and the host wrapper add_tensors that validates inputs and
 # launches it. Python treats this purely as a string.
 cuda_source = '''
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAException.h>
 #include <c10/cuda/CUDAGuard.h>

 #include <cstdint>
 #include <limits>

 // Writes c[i] = a[i] + b[i] for every i in [0, n), one thread per element.
 // All three pointers must address at least n densely packed floats.
 __global__ void add_kernel(const float* a, const float* b, float* c, int64_t n) {
     // Widen before multiplying so the flat index cannot wrap at 2^31.
     const int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
     if (idx < n) {  // the last block may be only partially used
         c[idx] = a[idx] + b[idx];
     }
 }

 // Returns a new tensor holding a + b, computed by add_kernel.
 // Both inputs must be float32 CUDA tensors of identical shape on the same
 // device; anything else raises through TORCH_CHECK instead of letting the
 // kernel read out of bounds or reinterpret memory.
 torch::Tensor add_tensors(torch::Tensor a, torch::Tensor b) {
     TORCH_CHECK(a.is_cuda() && b.is_cuda(),
                 "add_tensors: both inputs must be CUDA tensors");
     TORCH_CHECK(a.device() == b.device(),
                 "add_tensors: inputs must be on the same device, got ",
                 a.device(), " and ", b.device());
     TORCH_CHECK(a.scalar_type() == torch::kFloat32 && b.scalar_type() == torch::kFloat32,
                 "add_tensors: inputs must be float32, got ",
                 a.scalar_type(), " and ", b.scalar_type());
     TORCH_CHECK(a.sizes() == b.sizes(),
                 "add_tensors: inputs must have the same shape, got ",
                 a.sizes(), " and ", b.sizes());

     // Make the device of the inputs current for the allocation and the launch.
     const c10::cuda::CUDAGuard device_guard(a.device());

     // The kernel walks raw storage linearly, so strided views must be packed.
     a = a.contiguous();
     b = b.contiguous();

     // Every element is overwritten by the kernel; no need to zero-fill first.
     auto c = torch::empty_like(a);

     const int64_t n = a.numel();
     if (n == 0) {
         return c;  // a zero-block grid is an invalid launch configuration
     }

     const int threads = 256;
     const int64_t blocks = (n + threads - 1) / threads;
     TORCH_CHECK(blocks <= std::numeric_limits<int>::max(),
                 "add_tensors: ", n, " elements do not fit in a 1-D grid");

     // Launch on the stream PyTorch is currently using so the kernel is ordered
     // with the ops that produced a and b and those that will consume c.
     const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     add_kernel<<<static_cast<unsigned int>(blocks), threads, 0, stream>>>(
         a.data_ptr<float>(), b.data_ptr<float>(), c.data_ptr<float>(), n);
     C10_CUDA_KERNEL_LAUNCH_CHECK();  // surface bad launch configs immediately
     return c;
 }
 '''

 # Build the inline extension from the two source strings and expose
 # add_tensors on the returned module. verbose=True is passed through to
 # load_inline unchanged.
 build_options = {
     'name': 'add_cuda',
     'cpp_sources': cpp_source,
     'cuda_sources': cuda_source,
     'functions': ['add_tensors'],
     'verbose': True,
 }
 module = load_inline(**build_options)

 # Two random operands of one million elements each, created on the GPU
 # (a is drawn first, then b).
 device = torch.device('cuda')
 a, b = (torch.randn(1000000, device=device) for _ in range(2))

 # Run the custom kernel, print a five-element sample of each tensor, and
 # compare the full result against PyTorch's own addition.
 c = module.add_tensors(a, b)
 print(f"a[:5] = {a[:5]}")
 print(f"b[:5] = {b[:5]}")
 print(f"c[:5] = {c[:5]}")
 print(f"torch.allclose(c, a+b) = {torch.allclose(c, a+b)}")
	#!/usr/bin/env python3
	import torch
	from torch.utils.cpp_extension import load_inline

	# C++ source passed to load_inline as cpp_sources. It only forward-declares
	# add_tensors; the definition is in cuda_source. The declaration must keep the
	# exact signature used there.
	# NOTE(review): presumably the bindings generated for functions=['add_tensors']
	# are emitted into this C++ unit, which is why the declaration is needed here —
	# confirm against the load_inline documentation.
	cpp_source = '''
	#include <torch/extension.h>
	torch::Tensor add_tensors(torch::Tensor a, torch::Tensor b);
	'''

	# CUDA source passed to load_inline as cuda_sources: defines the elementwise
	# add kernel and the host wrapper add_tensors that validates inputs and
	# launches it. Python treats this purely as a string.
	cuda_source = '''
	#include <torch/extension.h>
	#include <ATen/cuda/CUDAContext.h>
	#include <c10/cuda/CUDAException.h>
	#include <c10/cuda/CUDAGuard.h>

	#include <cstdint>
	#include <limits>

	// Writes c[i] = a[i] + b[i] for every i in [0, n), one thread per element.
	// All three pointers must address at least n densely packed floats.
	__global__ void add_kernel(const float* a, const float* b, float* c, int64_t n) {
	    // Widen before multiplying so the flat index cannot wrap at 2^31.
	    const int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
	    if (idx < n) {  // the last block may be only partially used
	        c[idx] = a[idx] + b[idx];
	    }
	}

	// Returns a new tensor holding a + b, computed by add_kernel.
	// Both inputs must be float32 CUDA tensors of identical shape on the same
	// device; anything else raises through TORCH_CHECK instead of letting the
	// kernel read out of bounds or reinterpret memory.
	torch::Tensor add_tensors(torch::Tensor a, torch::Tensor b) {
	    TORCH_CHECK(a.is_cuda() && b.is_cuda(),
	                "add_tensors: both inputs must be CUDA tensors");
	    TORCH_CHECK(a.device() == b.device(),
	                "add_tensors: inputs must be on the same device, got ",
	                a.device(), " and ", b.device());
	    TORCH_CHECK(a.scalar_type() == torch::kFloat32 && b.scalar_type() == torch::kFloat32,
	                "add_tensors: inputs must be float32, got ",
	                a.scalar_type(), " and ", b.scalar_type());
	    TORCH_CHECK(a.sizes() == b.sizes(),
	                "add_tensors: inputs must have the same shape, got ",
	                a.sizes(), " and ", b.sizes());

	    // Make the device of the inputs current for the allocation and the launch.
	    const c10::cuda::CUDAGuard device_guard(a.device());

	    // The kernel walks raw storage linearly, so strided views must be packed.
	    a = a.contiguous();
	    b = b.contiguous();

	    // Every element is overwritten by the kernel; no need to zero-fill first.
	    auto c = torch::empty_like(a);

	    const int64_t n = a.numel();
	    if (n == 0) {
	        return c;  // a zero-block grid is an invalid launch configuration
	    }

	    const int threads = 256;
	    const int64_t blocks = (n + threads - 1) / threads;
	    TORCH_CHECK(blocks <= std::numeric_limits<int>::max(),
	                "add_tensors: ", n, " elements do not fit in a 1-D grid");

	    // Launch on the stream PyTorch is currently using so the kernel is ordered
	    // with the ops that produced a and b and those that will consume c.
	    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
	    add_kernel<<<static_cast<unsigned int>(blocks), threads, 0, stream>>>(
	        a.data_ptr<float>(), b.data_ptr<float>(), c.data_ptr<float>(), n);
	    C10_CUDA_KERNEL_LAUNCH_CHECK();  // surface bad launch configs immediately
	    return c;
	}
	'''

	# Build the inline extension from the two source strings and expose
	# add_tensors on the returned module. verbose=True is passed through to
	# load_inline unchanged.
	build_options = {
	    'name': 'add_cuda',
	    'cpp_sources': cpp_source,
	    'cuda_sources': cuda_source,
	    'functions': ['add_tensors'],
	    'verbose': True,
	}
	module = load_inline(**build_options)

	# Two random operands of one million elements each, created on the GPU
	# (a is drawn first, then b).
	device = torch.device('cuda')
	a, b = (torch.randn(1000000, device=device) for _ in range(2))

	# Run the custom kernel, print a five-element sample of each tensor, and
	# compare the full result against PyTorch's own addition.
	c = module.add_tensors(a, b)
	print(f"a[:5] = {a[:5]}")
	print(f"b[:5] = {b[:5]}")
	print(f"c[:5] = {c[:5]}")
	print(f"torch.allclose(c, a+b) = {torch.allclose(c, a+b)}")