import torch

torch._inductor.config.combo_kernels = True   # fuse independent pointwise ops into one combo kernel
torch._inductor.config.fx_graph_cache = False  # disable the FX graph cache so kernels are regenerated each run

@torch.compile
def f(x, y):
    return x + 1, y * 2

x = torch.randn(1024, device="cuda")
y = torch.randn(1024, device="cuda")  # y's shape is not shown in the gist; 1024 is an assumption
f(x, y)
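# A hedged sketch (my addition, not part of the gist): with combo kernels
# enabled, the two independent pointwise ops in f should be served by a
# single fused Triton launch; counting CUDA kernels with the profiler is
# one way to check.
from torch.profiler import profile, ProfilerActivity

with profile(activities=[ProfilerActivity.CUDA]) as prof:
    f(x, y)  # f was already compiled by the call above, so only the generated kernel(s) run
print(prof.key_averages().table(sort_by="cuda_time_total"))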
import triton
import triton.language as tl
from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_raw_stream
import torch
@triton_heuristics.pointwise(
    size_hints={'x': 67108864}, tile_hint=TileHint.DEFAULT,
import triton
import triton.language as tl
from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_raw_stream
import torch
@triton_heuristics.pointwise(
    size_hints={'x': 16777216}, tile_hint=TileHint.DEFAULT,
import triton
import triton.language as tl
from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
from torch._dynamo.testing import rand_strided
from torch._C import _cuda_getCurrentRawStream as get_raw_stream
import torch
@triton_heuristics.pointwise(
    size_hints={'x': 67108864}, tile_hint=TileHint.DEFAULT,
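# Gloss (my addition, not from the gist): the kernel prologues above differ
# only in their size hints, which Inductor rounds up to the next power of two
# of the kernel's element count when picking autotuning configs.
assert 67108864 == 2 ** 26
assert 16777216 == 2 ** 24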
import triton
import triton.language as tl
from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()
from torch._dynamo.testing import rand_strided
# AOT ID: ['0_inference']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from cmath import nanj
from torch._inductor.hooks import run_intermediate_hooks
s36 = 3  # concrete value bound to the symbolic batch size Sym(s36) used in the graph below
import os
os.environ['VLLM_TORCH_PROFILER_DIR'] = '/tmp/myprofile'  # turn on vLLM's torch profiler; traces are written here
os.environ['TORCH_TRACE'] = '/tmp/tlp'  # structured compile logs, consumable by tlparse
os.environ['INDUCTOR_PROVENANCE'] = '1'  # track which source ops each generated kernel comes from
os.environ['TORCHINDUCTOR_CACHE_DIR'] = '/tmp/torchinductor_shunting/'  # where Inductor writes generated files
os.environ['TORCHINDUCTOR_BENCHMARK_KERNEL'] = '1'  # emit a standalone benchmark harness per generated kernel
os.environ['TORCH_LOGS_FORMAT'] = '%(levelname)s: %(message)s'  # terse TORCH_LOGS lines
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "Sym(s36)", arg1_1: "bf16[s36, 128256][128256, 1]cuda:0", arg2_1: "f32[s36][1]cuda:0"):
        # File: /home/shunting/ws/vllm/vllm/v1/sample/sampler.py:87 in forward, code: logits = logits.to(torch.float32)
        convert_element_type: "f32[s36, 128256][128256, 1]cuda:0" = torch.ops.prims.convert_element_type.default(arg1_1, torch.float32); arg1_1 = None

        # File: /home/shunting/ws/vllm/vllm/v1/sample/sampler.py:138 in apply_temperature, code: return logits.div_(temp.unsqueeze(dim=1))
        unsqueeze: "f32[s36, 1][1, 1]cuda:0" = torch.ops.aten.unsqueeze.default(arg2_1, 1); arg2_1 = None

        # No stacktrace found for following nodes
        ge_scalar: "b8[s36, 1][1, 1]cuda:0" = torch.ops.aten.ge.Scalar(unsqueeze, 0)
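# A hedged eager-mode reading of the traced graph above (function name and
# test shapes are mine, not from the gist): upcast the bf16 logits to fp32,
# then divide each row in place by its sequence's temperature; the trailing
# ge.Scalar node records temp >= 0 as a boolean mask and has no stacktrace,
# likely because it was produced during a later decomposition.
import torch

def apply_temperature(logits: torch.Tensor, temp: torch.Tensor) -> torch.Tensor:
    logits = logits.to(torch.float32)          # prims.convert_element_type
    return logits.div_(temp.unsqueeze(dim=1))  # aten.unsqueeze + in-place div

logits = torch.randn(3, 128256, dtype=torch.bfloat16)  # s36 = 3 rows
temp = torch.rand(3) + 0.5
print(apply_temperature(logits, temp).shape)  # torch.Size([3, 128256])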
class Solution {
public:
    int numberOfArithmeticSlices(vector<int>& nums) {
        int out = 0;
        int s = 0;
        while (s + 1 < (int)nums.size()) {
            int dif = nums[s + 1] - nums[s];
            int e = s + 2;
            while (e < (int)nums.size() && nums[e] - nums[e - 1] == dif) {
                ++e;
            }
            // a maximal run of length n has (n - 1) * (n - 2) / 2 arithmetic slices
            int n = e - s;
            if (n >= 3) out += (n - 1) * (n - 2) / 2;
            s = e - 1;  // the next run may start at this run's last element
        }
        return out;
    }
};
"""
Compile-time auto-tuning block:
import torch
from torch._dynamo.testing import rand_strided
from torch._dynamo.utils import preserve_rng_state
from torch._inductor.select_algorithm import AlgorithmSelectorCache
from torch._inductor.async_compile import AsyncCompile
async_compile = AsyncCompile()
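# A hedged sketch (shapes use s36 = 3 from above; names mirror the graph's
# placeholders): the benchmark harness builds inputs with rand_strided, which
# allocates a tensor with an explicit size and stride, matching annotations
# like "bf16[s36, 128256][128256, 1]cuda:0".
arg1_1 = rand_strided((3, 128256), (128256, 1), device='cuda:0', dtype=torch.bfloat16)
arg2_1 = rand_strided((3,), (1,), device='cuda:0', dtype=torch.float32)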