curl -fsSL https://ollama.com/install.sh | sh
ollama pull qwen3:32b

// Minimal NVRTC CUBIN generation example
// Compile: g++ -std=c++11 minimal_nvrtc_cubin.cpp -lnvrtc -lcuda -lcudart
// String -> cubin via NVRTC -> handle via the CUDA driver API
#include <iostream>
#include <vector>
#include <nvrtc.h>
#include <cuda.h>

const char* kernelSource = R"(
import torch

# CUDA kernel with inline PTX
kernel_source = """
__global__ void vector_add(const float* a, const float* b, float* c, int n) {
    int idx;
    asm("mov.u32 %0, %%ctaid.x;" : "=r"(idx));
    int tid;
    asm("mov.u32 %0, %%tid.x;" : "=r"(tid));
    int ntid;
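    asm("mov.u32 %0, %%ntid.x;" : "=r"(ntid));
    // NOTE: assumed continuation -- the original excerpt is truncated above.
    // The three PTX-read special registers combine into a global element
    // index, equivalent to blockIdx.x * blockDim.x + threadIdx.x.
    int i = idx * ntid + tid;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""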
pytorch_operator,base_name,overload,folder_name,is_mapped
aten._adaptive_avg_pool2d.default,_adaptive_avg_pool2d,default,_adaptive_avg_pool2d,True
aten._adaptive_avg_pool2d_backward.default,_adaptive_avg_pool2d_backward,default,_adaptive_avg_pool2d_backward,True
aten._cudnn_rnn.default,_cudnn_rnn,default,_cudnn_rnn,True
aten._log_softmax.default,_log_softmax,default,_log_softmax,True
aten._log_softmax_backward_data.default,_log_softmax_backward_data,default,_log_softmax_backward_data,True
aten._softmax.default,_softmax,default,_softmax,True
aten._softmax_backward_data.default,_softmax_backward_data,default,_softmax_backward_data,True
aten._sparse_coo_tensor_with_dims_and_tensors.default,_sparse_coo_tensor_with_dims_and_tensors,default,_sparse_coo_tensor_with_dims_and_tensors,True
aten._to_copy.default,_to_copy,default,_to_copy,True
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 938cb7dd97a..d3ac1369e6a 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -7443,6 +7443,57 @@ def reference_inputs_clone_contiguous(op, device, dtype, requires_grad, **kwargs
     yield SampleInput(a, kwargs={'memory_format': torch.channels_last_3d})
+def sample_inputs_copy(op_info, device, dtype, requires_grad, **kwargs):
+    """Sample inputs for copy and copy_ operations.
| op_name | is_core | is_in_opinfo | is_in_torchbench |
|---|---|---|---|
| `__and__` | No | No | No |
| `__iand__` | No | No | No |
| `__ilshift__` | No | No | No |
| `__ior__` | No | No | No |
| `__irshift__` | No | No | No |
| `__ixor__` | No | No | No |
| `__lshift__` | No | No | No |
| `__or__` | No | No | No |
| `__rshift__` | No | No | No |
import torch
from torch import nn
from torch.distributed.tensor.placement_types import Replicate, Shard
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor
from torch.distributed.tensor.parallel import parallelize_module

def dist_print(*args, **kwargs):
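The excerpt ends at the helper definition. As a minimal sketch of what these imports are typically used for (my assumption, relying on the imports above and a torchrun launch; not code from the original):

# Hedged sketch: shard a local tensor over a 1-D device mesh, then
# redistribute it to a replicated layout. Assumes torchrun set up the
# process-group environment variables.
dist.init_process_group("nccl")
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
mesh = init_device_mesh("cuda", (dist.get_world_size(),))

local = torch.randn(4, 8, device="cuda")
sharded = DTensor.from_local(local, mesh, [Shard(0)])    # dim 0 split across ranks
replicated = sharded.redistribute(mesh, [Replicate()])   # all-gather back to every rank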
import torch
from torch.utils.cpp_extension import _get_cuda_arch_flags

def test_fix():
    print("Testing CUDA arch flags fix...")
    user_arch_flags = ['-gencode=arch=compute_86,code=sm_86']
    result = _get_cuda_arch_flags(user_arch_flags)
    print(f"User provided: {user_arch_flags}")
# Stop all GPU monitoring services that block ncu
sudo systemctl stop nvidia-dcgm.service dynologd.service
# Verify they're stopped
sudo systemctl list-units --state=active | grep -E "(nvidia|dynolog)"
# Check GPU is clear
sudo lsof /dev/nvidia7 | grep -v python
# Now run ncu
| """ | |
| Limitations | |
| 1. Cannot do heavy templating, cannot use thrust for reductions | |
| 2. Cannot import any host includes | |
| Thank you @malfet! | |
| """ | |
| import ctypes | |
| import torch |
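A hedged guess at what the ctypes import is for, given the driver-API theme of the NVRTC example above (an assumption, not code from the original file): calling into the CUDA driver library directly.

# Assumed sketch: load the CUDA driver library and query its version via ctypes.
libcuda = ctypes.CDLL("libcuda.so.1")       # NVIDIA driver library on Linux
CUDA_SUCCESS = 0
assert libcuda.cuInit(0) == CUDA_SUCCESS    # cuInit must precede other driver calls
version = ctypes.c_int(0)
assert libcuda.cuDriverGetVersion(ctypes.byref(version)) == CUDA_SUCCESS
print(f"CUDA driver version: {version.value}")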