diff --git a/python/pyproject.toml b/python/pyproject.toml
index d9749e1..fbcc0fd 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -20,7 +20,7 @@ runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hu
"orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
"torchao", "uvicorn", "uvloop", "zmq",
"outlines>=0.0.44", "modelscope"]
-srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
+srt = ["sglang[runtime_common]", "torch", "vllm"]
import torch
from diffusers import FluxPipeline, FluxTransformer2DModel
import torch.utils.benchmark as benchmark
from functools import partial


def get_example_inputs():
    # Representative Flux transformer inputs for benchmarking.
    example_inputs = {
        "hidden_states": torch.randn(1, 4096, 64, dtype=torch.bfloat16, device="cuda"),
        "encoder_hidden_states": torch.randn(1, 512, 4096, dtype=torch.bfloat16, device="cuda"),
        "pooled_projections": torch.randn(1, 768, dtype=torch.bfloat16, device="cuda"),
        # ... (remaining inputs truncated)
    }
    return example_inputs
from torchvision import models
import torch
## compilation configs
torch._dynamo.config.automatic_dynamic_shapes = False
torch._inductor.config.force_fuse_int_mm_with_mul = True
torch._inductor.config.use_mixed_mm = True
## compilation configs end
# temporary workaround to recover the perf with quantized model under torch.compile
torch.backends.mha.set_fastpath_enabled(False)
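These dynamo/inductor flags relate to how inductor handles int8 matmuls under torch.compile; below is a minimal sketch of the kind of quantized torchvision benchmark they would typically precede (the model choice and the torchao calls are assumptions, not part of the original fragment):

from torchao.quantization import quantize_, int8_weight_only

# Assumed example: int8 weight-only quantize a torchvision ResNet, then compile it.
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT).eval().cuda()
quantize_(model, int8_weight_only())
model = torch.compile(model, mode="max-autotune")

x = torch.randn(1, 3, 224, 224, device="cuda")
with torch.no_grad():
    out = model(x)  # first call triggers compilation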
diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py
index b63aaf1..9c268ab 100644
--- a/python/sglang/srt/models/llama.py
+++ b/python/sglang/srt/models/llama.py
@@ -18,6 +18,7 @@ limitations under the License.
"""Inference-only LLaMA model compatible with HuggingFace weights."""
from typing import Any, Dict, Iterable, Optional, Tuple
+from torch.nn.parameter import Parameter
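The only change shown here is the Parameter import; presumably it is used to re-wrap preprocessed (e.g. torchao-quantized) weights so they can be assigned back onto the model's linear layers. A hypothetical illustration of that pattern, not the actual patch:

# Hypothetical: nn.Module parameter slots only accept Parameter instances, so a
# replacement (quantized) weight has to be re-wrapped before assignment.
def replace_weight(linear: torch.nn.Linear, new_weight: torch.Tensor) -> None:
    linear.weight = Parameter(new_weight, requires_grad=False)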
baseline (no tensor parallelism):
python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 1 --input 128 --output 8
[15:07:14 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=79.41 GB
[15:07:14 TP0] Memory pool end. avail mem=11.16 GB
[15:07:14 TP0] Capture cuda graph begin. This can take up to several minutes.
max_total_num_tokens=557684
Warmup ...
Prefill. latency: 0.03870 s, throughput: 3307.61 token/s
[rank0]: run_once()
[rank0]: File "/data/users/jerryzh/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 199, in run_once
[rank0]: return forward(input_ids, input_metadata.positions, input_metadata)
[rank0]: File "/home/jerryzh/anaconda3/envs/sglang/lib/python3.10/site-packages/torch-2.4.0-py3.10-linux-x86_64.egg/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/data/users/jerryzh/sglang/python/sglang/srt/models/llama.py", line 320, in forward
[rank0]: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
[rank0]: File "/home/jerryzh/anaconda3/envs/sglang/lib/python3.10/site-packages/torch-2.4.0-py3.10-linux-x86_64.egg/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/home/jerryzh/anaconda3/envs/sglang/lib/python3.10/site-packages/torch-2.4.0-py3.10-linux-x86_64.egg/torch/nn/modules/module.py", line
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
import torch.utils.benchmark as benchmark


def benchmark_fn(f, *args, **kwargs):
    # Time f(*args, **kwargs) and return the mean runtime in seconds as a string.
    t0 = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"args": args, "kwargs": kwargs, "f": f},
        num_threads=torch.get_num_threads(),
    )
    return f"{(t0.blocked_autorange().mean):.3f}"
+ @common_utils.parametrize("device", COMMON_DEVICES)
+ @common_utils.parametrize("dtype", COMMON_DTYPES)
+ def test_linear_compile(self, device, dtype):
+ hp_tensor = torch.randn(4, 128, device=device, dtype=dtype)
+ lp_tensor = self.FACTORY_FN(hp_tensor, **self.kwargs)
+
+ hp_act_tensor = torch.randn(32, 128, device=device, dtype=dtype)
+ hp_res = torch.nn.functional.linear(hp_act_tensor, hp_tensor)
+ l = torch.nn.Linear(128, 4, bias=False, device=device, dtype=dtype)
+ l.weight = torch.nn.Parameter(lp_tensor)
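The added test is cut off above; a hedged, standalone sketch of the check such a compile test typically performs (the quantization call, the compute_error SQNR helper, and the threshold are assumptions, not the actual test body):

import torch
from torchao.quantization import quantize_, int8_weight_only
from torchao.quantization.utils import compute_error

# Standalone version of the pattern: quantized linear under torch.compile vs. bf16 reference.
linear = torch.nn.Linear(128, 4, bias=False, device="cuda", dtype=torch.bfloat16)
x = torch.randn(32, 128, device="cuda", dtype=torch.bfloat16)
ref = linear(x)                                  # high-precision reference

quantize_(linear, int8_weight_only())            # swap in a quantized weight subclass
compiled = torch.compile(linear, fullgraph=True)
assert compute_error(ref, compiled(x)) > 20      # SQNR threshold is illustrative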
...........frames [('total', 1), ('ok', 1)]
inductor [('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
inline_call []
stats [('calls_captured', 1), ('unique_graphs', 1)]
aot_autograd [('total', 1), ('ok', 1)]
.frames [('total', 1), ('ok', 1)]
inductor [('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
inline_call []
stats [('calls_captured', 1), ('unique_graphs', 1)]
aot_autograd [('total', 1), ('ok', 1)]
from diffusers import FluxTransformer2DModel
from torchao.quantization import quantize_, int8_weight_only
import torch
from torchao import autoquant
ckpt_id = "black-forest-labs/FLUX.1-schnell"
transformer = FluxTransformer2DModel.from_pretrained(
    ckpt_id, subfolder="transformer", torch_dtype=torch.bfloat16
)
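The fragment stops after loading the transformer; a minimal sketch of how the imported quantize_/int8_weight_only (or autoquant) would typically be applied and the quantized transformer handed back to the pipeline (the pipeline wiring and prompt are assumptions, not part of the original snippet):

from diffusers import FluxPipeline

# Assumed continuation: quantize the transformer weights in place, then build the pipeline.
quantize_(transformer, int8_weight_only())
# Alternatively: transformer = autoquant(transformer)

pipe = FluxPipeline.from_pretrained(
    ckpt_id, transformer=transformer, torch_dtype=torch.bfloat16
).to("cuda")
image = pipe(
    "a photo of a cat", guidance_scale=0.0, num_inference_steps=4, max_sequence_length=256
).images[0]
image.save("flux_int8.png")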