import torch
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TorchAoConfig
from PIL import Image
import requests
import torch.utils.benchmark as benchmark
from torchao.utils import benchmark_model

def benchmark_fn(f, *args, **kwargs):
    # Manual warmup
    for _ in range(2):
        f(*args, **kwargs)
    # Time with torch.utils.benchmark and report the mean latency
    t0 = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"args": args, "kwargs": kwargs, "f": f},
        num_threads=torch.get_num_threads(),
    )
    return f"{t0.blocked_autorange().mean:.3f} s"
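A hypothetical usage sketch for benchmark_fn; the checkpoint id, prompt, and token budget below are illustrative assumptions, not taken from the gist:

# Assumption: any Gemma 3 checkpoint with a processor works here.
model_id = "google/gemma-3-4b-it"
processor = AutoProcessor.from_pretrained(model_id)
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)
inputs = processor(text="Describe the weather today.", return_tensors="pt").to(model.device)
print(benchmark_fn(model.generate, **inputs, max_new_tokens=32))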
jerryzh168 / gist:0c22c06bc0a1c07e128a1763bc72b3e4
Created February 27, 2025 21:37
Quantizing and Uploading the Quantized Model in Hugging Face transformers
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Meta-Llama-3-8B"
quantization_config = TorchAoConfig("int8_weight_only")
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
output = quantized_model.generate(**input_ids, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))

# Push the quantized weights to the Hub; torchao tensor subclasses are not
# safetensors-serializable, so safe_serialization must be False.
hub_repo = "..."  # YOUR HUB REPO ID
tokenizer.push_to_hub(hub_repo)
quantized_model.push_to_hub(hub_repo, safe_serialization=False)
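For completeness, a minimal load-back sketch; this assumes transformers records the TorchAoConfig in the pushed config.json and re-applies it on load, and "loaded_model" is an illustrative name:

# Assumption: hub_repo points at the repo pushed above and is accessible.
loaded_model = AutoModelForCausalLM.from_pretrained(hub_repo, torch_dtype="auto", device_map="auto")
loaded_tokenizer = AutoTokenizer.from_pretrained(hub_repo)
out = loaded_model.generate(**loaded_tokenizer(input_text, return_tensors="pt").to("cuda"), max_new_tokens=10)
print(loaded_tokenizer.decode(out[0], skip_special_tokens=True))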
import torch
import torchao
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
# benchmark the performance
import torch.utils.benchmark as benchmark

def benchmark_fn(f, *args, **kwargs):
    # Manual warmup
    for _ in range(5):
        f(*args, **kwargs)
    # Time with torch.utils.benchmark and report the mean latency
    t0 = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"args": args, "kwargs": kwargs, "f": f},
        num_threads=torch.get_num_threads(),
    )
    return f"{t0.blocked_autorange().mean:.3f} s"
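An illustrative way to use benchmark_fn to compare the quantized model against a bf16 baseline; the baseline load and token budget are assumptions, not shown in the gist:

# Assumption: model_name, quantized_model, and input_ids come from the
# quantization snippet above; the bf16 baseline is added for comparison.
bf16_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
print("int8 weight-only:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=128))
print("bf16 baseline:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=128))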
{"GEMV": {}, "GEMV_REVSPLITK": {"(1, 1536, 4096, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "A_load_order": 0, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 2, "num_ctas": 1, "num_stages": 1}, "(1, 4096, 1024, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 8, "A_load_order": 1, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 2, "num_ctas": 1, "num_stages": 1}, "(1, 7168, 4096, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 512, "BLOCK_SIZE_K": 8, "A_load_order": 0, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 4, "num_ctas": 1, "num_stages": 1}, "(1, 4096, 3584, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 16, "A_load_order": 0, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 2, "num_ctas": 1, "num_stages": 1}, "(1, 6144, 4096, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 8, "A_l
model.safetensors.index.json: 100% 23.9k/23.9k [00:00<00:00, 125MB/s]
model-00001-of-00004.safetensors: 100% 4.98G/4.98G [01:58<00:00, 42.0MB/s]
model-00002-of-00004.safetensors: 100% 5.00G/5.00G [01:58<00:00, 42.2MB/s]
model-00003-of-00004.safetensors: 100% 4.92G/4.92G [01:56<00:00, 42.0MB/s]
model-00004-of-00004.safetensors: 100%
Processed prompts: 100% 1/1 [00:05<00:00, 5.77s/it, est. speed input: 1.39 toks/s, output: 177.49 toks/s]
1
Processed prompts: 100% 1/1 [00:05<00:00, 5.57s/it, est. speed input: 2.16 toks/s, output: 183.93 toks/s]
4
Processed prompts: 100% 4/4 [00:05<00:00, 1.44s/it, est. speed input: 8.35 toks/s, output: 518.36 toks/s]
8
Processed prompts: 100% 8/8 [00:06<00:00, 1.33it/s, est. speed input: 18.29 toks/s, output: 1052.92 toks/s]
16
Processed prompts: 100% 16/16 [00:06<00:00, 2.33it
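These look like progress lines from a vLLM batch-size sweep, with the standalone number being the batch size printed before each run. A minimal sketch of a driver loop that would produce output of this shape; the checkpoint id and prompt are assumptions, since the actual driver script is not shown:

from vllm import LLM, SamplingParams

# Assumptions: the model id and prompt are illustrative only.
llm = LLM(model="meta-llama/Meta-Llama-3-8B")
sampling_params = SamplingParams(max_tokens=1024)
for batch_size in [1, 4, 8, 16]:
    print(batch_size)
    # vLLM prints the "Processed prompts" progress bar during generate
    outputs = llm.generate(["What are we having for dinner?"] * batch_size, sampling_params)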
{"GEMV": {}, "GEMV_REVSPLITK": {"(1, 6144, 4096, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 8, "A_load_order": 0, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 2, "num_ctas": 1, "num_stages": 1}, "(1, 4096, 4096, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 16, "A_load_order": 1, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 2, "num_ctas": 1, "num_stages": 2}, "(1, 28672, 4096, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 8, "A_load_order": 1, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 2, "num_ctas": 1, "num_stages": 2}, "(1, 4096, 14336, 64, 8)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 512, "BLOCK_SIZE_K": 8, "A_load_order": 0, "meta_evict_policy": "", "atomic_mode": "relaxed", "dot_prod_mode": 0, "num_warps": 4, "num_ctas": 1, "num_stages": 2}, "(1, 6144, 4096, 64, 2)": {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "A
[2024-12-19 10:59:34 TP5] Scheduler hit an exception: Traceback (most recent call last):
  File "/data/users/jerryzh/sglang/python/sglang/srt/managers/scheduler.py", line 1528, in run_scheduler_process
    scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank)
  File "/data/users/jerryzh/sglang/python/sglang/srt/managers/scheduler.py", line 192, in __init__
    self.tp_worker = TpWorkerClass(
  File "/data/users/jerryzh/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 62, in __init__
    self.worker = TpModelWorker(server_args, gpu_id, tp_rank, dp_rank, nccl_port)
  File "/data/users/jerryzh/sglang/python/sglang/srt/managers/tp_worker.py", line 62, in __init__
    self.model_runner = ModelRunner(
  File "/data/users/jerryzh/sglang/python/sglang/srt/model_executor/model_runner.py", line 158, in __init__
[2024-12-17 15:09:47 TP0] Decode batch. #running-req: 968, #token: 511530, token usage: 0.11, gen throughput (token/s): 1554.84, #queue-req: 0
[2024-12-17 15:09:48 TP0] Decode batch. #running-req: 922, #token: 523719, token usage: 0.11, gen throughput (token/s): 47852.88, #queue-req: 0
[2024-12-17 15:09:49 TP0] Decode batch. #running-req: 883, #token: 535168, token usage: 0.11, gen throughput (token/s): 46588.76, #queue-req: 0
[2024-12-17 15:09:50 TP0] Decode batch. #running-req: 847, #token: 548080, token usage: 0.11, gen throughput (token/s): 44284.98, #queue-req: 0
[2024-12-17 15:09:50 TP0] Decode batch. #running-req: 799, #token: 545397, token usage: 0.11, gen throughput (token/s): 42336.33, #queue-req: 0
[2024-12-17 15:09:51 TP0] Decode batch. #running-req: 767, #token: 556549, token usage: 0.12, gen throughput (token/s): 41241.09, #queue-req: 0
[2024-12-17 15:09:52 TP0] Decode batch. #running-req: 730, #token: 558371, token usage: 0.12, gen throughput (token/s): 39677.03, #queue-req: 0
[2024-12-17 15:09