import torch

torch._inductor.config.combo_kernels = True   # fuse independent pointwise ops into one combo kernel
torch._inductor.config.fx_graph_cache = False  # disable the FX graph cache so kernels are regenerated each run

@torch.compile
def f(x, y):
    return x + 1, y * 2

x = torch.randn(1024, device="cuda")
y = torch.randn(1024, device="cuda")  # y's shape is not shown in the gist; 1024 is an assumption
f(x, y)
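# A hedged sketch (my addition, not part of the gist): with combo kernels
# enabled, the two independent pointwise ops in f should be served by a
# single fused Triton launch; counting CUDA kernels with the profiler is
# one way to check.
from torch.profiler import profile, ProfilerActivity

with profile(activities=[ProfilerActivity.CUDA]) as prof:
    f(x, y)  # f was already compiled by the call above, so only the generated kernel(s) run
print(prof.key_averages().table(sort_by="cuda_time_total"))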
import triton
import triton.language as tl
from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_raw_stream
import torch
@triton_heuristics.pointwise(
    size_hints={'x': 67108864}, tile_hint=TileHint.DEFAULT,
import triton
import triton.language as tl
from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_raw_stream
import torch
@triton_heuristics.pointwise(
    size_hints={'x': 16777216}, tile_hint=TileHint.DEFAULT,
import triton
import triton.language as tl
from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_raw_stream
import torch
@triton_heuristics.pointwise(
    size_hints={'x': 67108864}, tile_hint=TileHint.DEFAULT,
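# Gloss (my addition, not from the gist): the kernel prologues above differ
# only in their size hints, which Inductor rounds up to the next power of two
# of the kernel's element count when picking autotuning configs.
assert 67108864 == 2 ** 26
assert 16777216 == 2 ** 24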
import triton
import triton.language as tl
from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()
from torch._dynamo.testing import rand_strided
# AOT ID: ['0_inference']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from cmath import nanj
from torch._inductor.hooks import run_intermediate_hooks
s36 = 3  # concrete value bound to the symbolic batch size Sym(s36) used in the graph below
import os
os.environ['VLLM_TORCH_PROFILER_DIR'] = '/tmp/myprofile'  # turn on vLLM's torch profiler; traces are written here
os.environ['TORCH_TRACE'] = '/tmp/tlp'  # structured compile logs, consumable by tlparse
os.environ['INDUCTOR_PROVENANCE'] = '1'  # track which source ops each generated kernel comes from
os.environ['TORCHINDUCTOR_CACHE_DIR'] = '/tmp/torchinductor_shunting/'  # where Inductor writes generated files
os.environ['TORCHINDUCTOR_BENCHMARK_KERNEL'] = '1'  # emit a standalone benchmark harness per generated kernel
os.environ['TORCH_LOGS_FORMAT'] = '%(levelname)s: %(message)s'  # terse TORCH_LOGS lines
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "Sym(s36)", arg1_1: "bf16[s36, 128256][128256, 1]cuda:0", arg2_1: "f32[s36][1]cuda:0"):
        # File: /home/shunting/ws/vllm/vllm/v1/sample/sampler.py:87 in forward, code: logits = logits.to(torch.float32)
        convert_element_type: "f32[s36, 128256][128256, 1]cuda:0" = torch.ops.prims.convert_element_type.default(arg1_1, torch.float32); arg1_1 = None

        # File: /home/shunting/ws/vllm/vllm/v1/sample/sampler.py:138 in apply_temperature, code: return logits.div_(temp.unsqueeze(dim=1))
        unsqueeze: "f32[s36, 1][1, 1]cuda:0" = torch.ops.aten.unsqueeze.default(arg2_1, 1); arg2_1 = None

        # No stacktrace found for following nodes
        ge_scalar: "b8[s36, 1][1, 1]cuda:0" = torch.ops.aten.ge.Scalar(unsqueeze, 0)
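# A hedged eager-mode reading of the traced graph above (function name and
# test shapes are mine, not from the gist): upcast the bf16 logits to fp32,
# then divide each row in place by its sequence's temperature; the trailing
# ge.Scalar node records temp >= 0 as a boolean mask and has no stacktrace,
# likely because it was produced during a later decomposition.
import torch

def apply_temperature(logits: torch.Tensor, temp: torch.Tensor) -> torch.Tensor:
    logits = logits.to(torch.float32)          # prims.convert_element_type
    return logits.div_(temp.unsqueeze(dim=1))  # aten.unsqueeze + in-place div

logits = torch.randn(3, 128256, dtype=torch.bfloat16)  # s36 = 3 rows
temp = torch.rand(3) + 0.5
print(apply_temperature(logits, temp).shape)  # torch.Size([3, 128256])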
class Solution {
public:
    int numberOfArithmeticSlices(vector<int>& nums) {
        int out = 0;
        int s = 0;
        while (s + 1 < (int)nums.size()) {
            int dif = nums[s + 1] - nums[s];
            int e = s + 2;
            while (e < (int)nums.size() && nums[e] - nums[e - 1] == dif) {
                ++e;
            }
            // a maximal run of length n has (n - 1) * (n - 2) / 2 arithmetic slices
            int n = e - s;
            if (n >= 3) out += (n - 1) * (n - 2) / 2;
            s = e - 1;  // the next run may start at this run's last element
        }
        return out;
    }
};
"""
Compile-time auto-tuning block:
import torch
from torch._dynamo.testing import rand_strided
from torch._dynamo.utils import preserve_rng_state
from torch._inductor.select_algorithm import AlgorithmSelectorCache
from torch._inductor.async_compile import AsyncCompile
async_compile = AsyncCompile()
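# A hedged sketch (shapes use s36 = 3 from above; names mirror the graph's
# placeholders): the benchmark harness builds inputs with rand_strided, which
# allocates a tensor with an explicit size and stride, matching annotations
# like "bf16[s36, 128256][128256, 1]cuda:0".
arg1_1 = rand_strided((3, 128256), (128256, 1), device='cuda:0', dtype=torch.bfloat16)
arg2_1 = rand_strided((3,), (1,), device='cuda:0', dtype=torch.float32)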