msaroufim / llm-setup-gist.md
Created September 25, 2025 23:31
Local LLM Service Setup with Qwen3, Ollama, Open WebUI, and Tailscale

Local LLM Service Setup

1. Install Ollama

curl -fsSL https://ollama.com/install.sh | sh

2. Download Model

ollama pull qwen3:32b
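
The gist preview cuts off before the Open WebUI and Tailscale steps, but once the model is pulled the service can be smoke-tested against Ollama's local HTTP API. A minimal sketch in Python, assuming the default port 11434 and the qwen3:32b tag from step 2:

import json
import urllib.request

# Ask the local Ollama server for one non-streamed completion.
payload = {
    "model": "qwen3:32b",
    "prompt": "Reply with one short sentence.",
    "stream": False,  # one JSON object instead of a token stream
}
req = urllib.request.Request(
    "http://localhost:11434/api/generate",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["response"])
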
// Minimal NVRTC CUBIN generation example
// Compile: g++ -std=c++11 minimal_nvrtc_cubin.cpp -lnvrtc -lcuda -lcudart
// String -> cubin via NVRTC -> module handle via the CUDA driver API
#include <iostream>
#include <vector>
#include <nvrtc.h>
#include <cuda.h>
const char* kernelSource = R"(
import torch
# CUDA kernel with inline PTX
kernel_source = """
__global__ void vector_add(const float* a, const float* b, float* c, int n) {
    int idx;
    asm("mov.u32 %0, %%ctaid.x;" : "=r"(idx));
    int tid;
    asm("mov.u32 %0, %%tid.x;" : "=r"(tid));
    int ntid;
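    // (Preview cuts off here; a plausible completion: read blockDim.x into
    // ntid, form the usual global index, and do a guarded elementwise add.)
    asm("mov.u32 %0, %%ntid.x;" : "=r"(ntid));
    int i = idx * ntid + tid;
    if (i < n) c[i] = a[i] + b[i];
}
"""
# (The host-side code that compiles and launches this string is also
# truncated in the preview.)
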
pytorch_operator,base_name,overload,folder_name,is_mapped
aten._adaptive_avg_pool2d.default,_adaptive_avg_pool2d,default,_adaptive_avg_pool2d,True
aten._adaptive_avg_pool2d_backward.default,_adaptive_avg_pool2d_backward,default,_adaptive_avg_pool2d_backward,True
aten._cudnn_rnn.default,_cudnn_rnn,default,_cudnn_rnn,True
aten._log_softmax.default,_log_softmax,default,_log_softmax,True
aten._log_softmax_backward_data.default,_log_softmax_backward_data,default,_log_softmax_backward_data,True
aten._softmax.default,_softmax,default,_softmax,True
aten._softmax_backward_data.default,_softmax_backward_data,default,_softmax_backward_data,True
aten._sparse_coo_tensor_with_dims_and_tensors.default,_sparse_coo_tensor_with_dims_and_tensors,default,_sparse_coo_tensor_with_dims_and_tensors,True
aten._to_copy.default,_to_copy,default,_to_copy,True
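
A mapping like this is easy to consume with the csv module; a small sketch, with "aten_op_mapping.csv" as a placeholder filename for the data above:

import csv

# Load the operator -> folder mapping and report coverage.
with open("aten_op_mapping.csv") as f:
    rows = list(csv.DictReader(f))

unmapped = [r["pytorch_operator"] for r in rows if r["is_mapped"] != "True"]
print(f"{len(rows) - len(unmapped)}/{len(rows)} operators mapped")
for name in unmapped:
    print("not mapped:", name)
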
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 938cb7dd97a..d3ac1369e6a 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -7443,6 +7443,57 @@ def reference_inputs_clone_contiguous(op, device, dtype, requires_grad, **kwargs
     yield SampleInput(a, kwargs={'memory_format': torch.channels_last_3d})
+def sample_inputs_copy(op_info, device, dtype, requires_grad, **kwargs):
+    """Sample inputs for copy and copy_ operations.
op_name        is_core   is_in_opinfo   is_in_torchbench
__and__        No        No             No
__iand__       No        No             No
__ilshift__    No        No             No
__ior__        No        No             No
__irshift__    No        No             No
__ixor__       No        No             No
__lshift__     No        No             No
__or__         No        No             No
__rshift__     No        No             No
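
A sketch of how the is_in_opinfo column could be recomputed from a PyTorch checkout, using the op_db registry that common_methods_invocations.py exposes:

from torch.testing._internal.common_methods_invocations import op_db

# Recompute is_in_opinfo for the dunder ops listed above.
opinfo_names = {op.name for op in op_db}
for name in ["__and__", "__iand__", "__ilshift__", "__ior__", "__irshift__",
             "__ixor__", "__lshift__", "__or__", "__rshift__"]:
    print(f"{name}\t{'Yes' if name in opinfo_names else 'No'}")
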
import torch
from torch import nn
from torch.distributed.tensor.placement_types import Replicate, Shard
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor
from torch.distributed.tensor.parallel import parallelize_module
def dist_print(*args, **kwargs):
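    # (Preview cuts off here; a plausible body: tag each line with the rank
    # so interleaved multi-process output stays readable.)
    rank = dist.get_rank() if dist.is_initialized() else 0
    print(f"[rank {rank}]", *args, **kwargs)
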
import torch
from torch.utils.cpp_extension import _get_cuda_arch_flags
def test_fix():
print("Testing CUDA arch flags fix...")
user_arch_flags = ['-gencode=arch=compute_86,code=sm_86']
result = _get_cuda_arch_flags(user_arch_flags)
print(f"User provided: {user_arch_flags}")
# Stop all GPU monitoring services that block ncu
sudo systemctl stop nvidia-dcgm.service dynologd.service
# Verify they're stopped
sudo systemctl list-units --state=active | grep -E "(nvidia|dynolog)"
# Check GPU is clear
sudo lsof /dev/nvidia7 | grep -v python
# Now run ncu
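# (Preview stops before the actual invocation; a typical one, with
# "python train.py" standing in for whatever workload is being profiled.)
sudo ncu --set full --target-processes all -o profile_report python train.py
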
"""
Limitations
1. Cannot do heavy templating and cannot use Thrust for reductions
2. Cannot include any host headers
Thank you @malfet!
"""
import ctypes
import torch
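
The preview stops at the imports. Given the limitations listed (no Thrust, no host headers, both classic NVRTC restrictions), here is a sketch of the kind of ctypes-driven NVRTC flow these imports suggest; everything below is illustrative, not the gist's actual code:

libnvrtc = ctypes.CDLL("libnvrtc.so")

source = b"""
extern "C" __global__ void fill(float* out, float val, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = val;
}
"""

# Compile the string to PTX with the raw NVRTC C API (error checks omitted).
prog = ctypes.c_void_p()
libnvrtc.nvrtcCreateProgram(ctypes.byref(prog), source, b"fill.cu", 0, None, None)
opts = (ctypes.c_char_p * 1)(b"--gpu-architecture=compute_80")  # arch is an assumption
libnvrtc.nvrtcCompileProgram(prog, 1, opts)
size = ctypes.c_size_t()
libnvrtc.nvrtcGetPTXSize(prog, ctypes.byref(size))
ptx = ctypes.create_string_buffer(size.value)
libnvrtc.nvrtcGetPTX(prog, ptx)
libnvrtc.nvrtcDestroyProgram(ctypes.byref(prog))
print(ptx.value.decode()[:120])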