Skip to content

Instantly share code, notes, and snippets.

################## RESULTS
# ######### GROUPED_MM #######
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant noquant --compile
# Average tokens/sec: 24.15
# Average tokens/sec including batches 193.18
# Memory used: 95.25 GB
# model size: 93.62
# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --batch_size 8 --moe_quant noquant --compile --compile_mode "max-autotune"
# Average tokens/sec: 23.97
@HDCharles
HDCharles / log.log
Created July 29, 2025 16:45
output from repro
/data/users/hdcharles/pytorch/torch/backends/cuda/__init__.py:131: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /data/users/hdcharles/pytorch/aten/src/ATen/Context.cpp:80.)
return torch._C._get_cublas_allow_tf32()
/tmp/torchinductor_hdcharles/ah/cahrqbokro3llqbhea5qjqikmqu6ncya53rrk5ia2mdrxlmeglxs.py:105: UserWarning: Logical operators 'and' and 'or' are deprecated for non-scalar tensors; please use '&' or '|' instead
mask = offs_am[:, None] < m_size and offs_bn[None, :] < n_size
UserWarning: Enable tracemalloc to get the object allocation traceback
/tmp/torchind
_________________ TestAutoQuant.test_autoquant_compile_12_cuda _________________
a = (<test_integration.TestAutoQuant testMethod=test_autoquant_compile_12_cuda>,)
kw = {}
@wraps(func)
def standalone_func(*a, **kw):
> return func(*(a + p.args), **p.kwargs, **kw)
/opt/conda/envs/venv/lib/python3.9/site-packages/parameterized/parameterized.py:620:
This file has been truncated, but you can view the full file.
/home/cdhernandez/.conda/envs/pytorch-3.12/lib/python3.12/contextlib.py:105: FutureWarning: `torch.backends.cuda.sdp_kernel()` is deprecated. In the future, this context manager will be removed. Please see `torch.nn.attention.sdpa_kernel()` for the new context manager, with updated signature.
self.gen = func(*args, **kwds)
V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] Output code:
V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # AOT ID: ['0_inference']
V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from ctypes import c_void_p, c_long, c_int
V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import torch
V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import math
V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1
V0320 11:46:30.704000 18434 site-packages/torch/_dynamo/utils.py:1782] {"chromium_event": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "6dda4945313dbc76cddf217f3df965aa"}
{
"name": "dynamo",
"ts": 1742496390704270.5,
"args": {
"compile_id": "0/0"
},
"ph": "B",
"cat": "dynamo_timed",
"tid": 0,
@HDCharles
HDCharles / small_moe_repro.py
Created March 20, 2025 18:03
user warning?
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import functional as F
from dataclasses import dataclass
torch.manual_seed(0)
# T tokens
@HDCharles
HDCharles / moe_compile_issue.py
Last active March 19, 2025 22:41
this shows a place where moe doesn't work with compile
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import functional as F
from dataclasses import dataclass
torch.manual_seed(0)
# T tokens
@HDCharles
HDCharles / test_moe_compile.py
Last active March 19, 2025 04:08
code testing moe implementations with compile
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import functional as F
from dataclasses import dataclass
torch.manual_seed(0)
# T tokens
@HDCharles
HDCharles / bsr_bench.sh
Created February 27, 2025 18:14
BSR benchmark script
# BSR benchmarks
export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder
export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result bsr_bench_results.txt
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --sparsity semi-structured --precision float16 --write_result bsr_bench_results.txt
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --sparsity semi-structured --precision float16 --write_result bsr_bench_results.txt
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result bsr_bench_results.txt --sparsity bsr-0.8-32
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --write_result bsr_bench_results.txt --sparsity bsr-0.8-64
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/mod
W1008 09:22:11.858000 1289935 torch/_higher_order_ops/triton_kernel_wrap.py:503] [0/0] ValueError: Incorrect number of arguments passed to kernel
W1008 09:22:11.870000 1289935 torch/_higher_order_ops/triton_kernel_wrap.py:503] [0/0] Encountered an exception in identify_mutated_tensors, assuming every input is mutated
W1008 09:22:11.870000 1289935 torch/_higher_order_ops/triton_kernel_wrap.py:503] [0/0] Traceback (most recent call last):
W1008 09:22:11.870000 1289935 torch/_higher_order_ops/triton_kernel_wrap.py:503] [0/0] File "/home/cdhernandez/local/pytorch/torch/_higher_order_ops/triton_kernel_wrap.py", line 482, in identify_mutated_tensors
W1008 09:22:11.870000 1289935 torch/_higher_order_ops/triton_kernel_wrap.py:503] [0/0] ttir_module, ordered_tensor_names = generate_ttir(kernel, kwargs)
W1008 09:22:11.870000 1289935 torch/_higher_order_ops/triton_kernel_wrap.py:503] [0/0] File "/home/cdhernandez/local/pytorch/torch/_higher_order_ops/triton_kernel_wrap.py", line 139, in generate_ttir
W1008 09:22