W1202 13:59:56.994000 2658530 site-packages/torch/_logging/_internal.py:1084] [1/0] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
Traceback (most recent call last):
  File "/data/users/jerryzh/ao/examples/sam2_amg_server/server.py", line 709, in <module>
    fire.Fire(main)
  File "/home/jerryzh/.conda/envs/ao/lib/python3.10/site-packages/fire/core.py", line 135, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
  File "/home/jerryzh/.conda/envs/ao/lib/python3.10/site-packages/fire/core.py", line 468, in _Fire
    component, remaining_args = _CallAndUpdateTrace(
  File "/home/jerryzh/.conda/envs/ao/lib/python3.10/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
diff --git a/python/pyproject.toml b/python/pyproject.toml
index d9749e1..fbcc0fd 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -20,7 +20,7 @@ runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hu
     "orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
     "torchao", "uvicorn", "uvloop", "zmq",
     "outlines>=0.0.44", "modelscope"]
-srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
+srt = ["sglang[runtime_common]", "torch", "vllm"]
import torch
from diffusers import FluxPipeline, FluxTransformer2DModel
import torch.utils.benchmark as benchmark
from functools import partial

def get_example_inputs():
    # bf16 CUDA tensors shaped to match Flux's transformer forward signature
    example_inputs = {
        "hidden_states": torch.randn(1, 4096, 64, dtype=torch.bfloat16, device="cuda"),
        "encoder_hidden_states": torch.randn(1, 512, 4096, dtype=torch.bfloat16, device="cuda"),
        "pooled_projections": torch.randn(1, 768, dtype=torch.bfloat16, device="cuda"),
from torchvision import models
import torch

## compilation configs
torch._dynamo.config.automatic_dynamic_shapes = False
torch._inductor.config.force_fuse_int_mm_with_mul = True
torch._inductor.config.use_mixed_mm = True
## compilation configs end

# temporary workaround to recover the perf of the quantized model under torch.compile
torch.backends.mha.set_fastpath_enabled(False)
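
These inductor flags only pay off once the model actually runs int8 matmuls. A minimal sketch of how such a script might continue, assuming torchao's quantize_ API; the choice of vit_b_16 is an assumption, not from the original:

from torchao.quantization import quantize_, int8_dynamic_activation_int8_weight

# Hypothetical continuation: swap linear layers to int8, then compile
model = models.vit_b_16(weights=models.ViT_B_16_Weights.DEFAULT).cuda().eval()
quantize_(model, int8_dynamic_activation_int8_weight())
model = torch.compile(model, mode="max-autotune")

x = torch.randn(1, 3, 224, 224, device="cuda")
with torch.no_grad():
    model(x)  # first call triggers compilation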
diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py
index b63aaf1..9c268ab 100644
--- a/python/sglang/srt/models/llama.py
+++ b/python/sglang/srt/models/llama.py
@@ -18,6 +18,7 @@ limitations under the License.
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 from typing import Any, Dict, Iterable, Optional, Tuple
+from torch.nn.parameter import Parameter
baseline (no tp):
python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 1 --input 128 --output 8
[15:07:14 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=79.41 GB
[15:07:14 TP0] Memory pool end. avail mem=11.16 GB
[15:07:14 TP0] Capture cuda graph begin. This can take up to several minutes.
max_total_num_tokens=557684
Warmup ...
Prefill. latency: 0.03870 s, throughput: 3307.61 token/s
[rank0]:     run_once()
[rank0]:   File "/data/users/jerryzh/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 199, in run_once
[rank0]:     return forward(input_ids, input_metadata.positions, input_metadata)
[rank0]:   File "/home/jerryzh/anaconda3/envs/sglang/lib/python3.10/site-packages/torch-2.4.0-py3.10-linux-x86_64.egg/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]:     return func(*args, **kwargs)
[rank0]:   File "/data/users/jerryzh/sglang/python/sglang/srt/models/llama.py", line 320, in forward
[rank0]:     hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
[rank0]:   File "/home/jerryzh/anaconda3/envs/sglang/lib/python3.10/site-packages/torch-2.4.0-py3.10-linux-x86_64.egg/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:   File "/home/jerryzh/anaconda3/envs/sglang/lib/python3.10/site-packages/torch-2.4.0-py3.10-linux-x86_64.egg/torch/nn/modules/module.py", line
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
import torch.utils.benchmark as benchmark

def benchmark_fn(f, *args, **kwargs):
    # Time f(*args, **kwargs) with torch.utils.benchmark and return the mean
    # wall-clock time per call, formatted as a string of seconds.
    t0 = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"args": args, "kwargs": kwargs, "f": f},
        num_threads=torch.get_num_threads(),
    )
    return f"{(t0.blocked_autorange().mean):.3f}"
+    @common_utils.parametrize("device", COMMON_DEVICES)
+    @common_utils.parametrize("dtype", COMMON_DTYPES)
+    def test_linear_compile(self, device, dtype):
+        hp_tensor = torch.randn(4, 128, device=device, dtype=dtype)
+        lp_tensor = self.FACTORY_FN(hp_tensor, **self.kwargs)
+
+        hp_act_tensor = torch.randn(32, 128, device=device, dtype=dtype)
+        hp_res = torch.nn.functional.linear(hp_act_tensor, hp_tensor)
+        l = torch.nn.Linear(128, 4, bias=False, device=device, dtype=dtype)
+        l.weight = torch.nn.Parameter(lp_tensor)
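
The diff is truncated here. A plausible completion of the test body, compiling the module and checking the low-precision result against the high-precision reference; the compute_error (SQNR) helper and the threshold of 20 are assumptions about the surrounding test utilities:

# Hypothetical completion, not part of the actual diff
l = torch.compile(l, fullgraph=True)
lp_res = l(hp_act_tensor)
self.assertGreater(compute_error(hp_res, lp_res), 20)  # SQNR in dB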
...........frames [('total', 1), ('ok', 1)]
inductor [('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
inline_call []
stats [('calls_captured', 1), ('unique_graphs', 1)]
aot_autograd [('total', 1), ('ok', 1)]
.frames [('total', 1), ('ok', 1)]
inductor [('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
inline_call []
stats [('calls_captured', 1), ('unique_graphs', 1)]
aot_autograd [('total', 1), ('ok', 1)]