(.venv) ➜ shark-ai git:(ad958230) ✗ git checkout b58783029e0cc3e1890b22339d470c394a66dcb4
M	requirements-iree-pinned.txt
Previous HEAD position was ad958230 Add ops.mean for SplitPrimitiveTensor (#1308)
HEAD is now at b5878302 Add perplexity calculation for Tensor and Pipeline parallized Llama models (#1279)
(.venv) ➜ shark-ai git:(b5878302) ✗ pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --iree-device=hip://4 -k testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128
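
The selected test drives ExportArtifacts.export_to_mlir, which shells out to sharktank.examples.export_paged_llm_v1. The exact export command it builds is reproduced verbatim in the failure output below and can be rerun standalone to reproduce the error without pytest:

  cd /home/chi/src/shark-ai && python3 -m sharktank.examples.export_paged_llm_v1 \
    --irpa-file=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa \
    --output-mlir=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir \
    --output-config=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.json \
    --bs-prefill=4 --bs-decode=4 --block-seq-stride=32 \
    --attention-dtype=float16 --activation-dtype=float16 \
    --tensor-parallelism-size=8 --pipeline-parallelism-size=1 \
    --attention-kernel=torch
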
================================================= test session starts ==================================================
platform linux -- Python 3.12.9, pytest-8.0.0, pluggy-1.5.0 -- /home/chi/src/shark-ai/.venv/bin/python
cachedir: .pytest_cache
metadata: {'Python': '3.12.9', 'Platform': 'Linux-6.8.0-52-generic-x86_64-with-glibc2.35', 'Packages': {'pytest': '8.0.0', 'pluggy': '1.5.0'}, 'Plugins': {'timeout': '2.3.1', 'anyio': '4.9.0', 'metadata': '3.1.1', 'html': '4.1.1', 'asyncio': '0.23.8', 'xdist': '3.5.0'}}
rootdir: /home/chi/src/shark-ai/sharktank
configfile: pyproject.toml
plugins: timeout-2.3.1, anyio-4.9.0, metadata-3.1.1, html-4.1.1, asyncio-0.23.8, xdist-3.5.0
asyncio: mode=Mode.STRICT
collected 13 items / 12 deselected / 1 selected

sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_405B::testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 FAILED
======================================================= FAILURES =======================================================
____________________ BenchmarkLlama3_1_405B.testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 _____________________

self = <tests.models.llama.benchmark_amdgpu_test.BenchmarkLlama3_1_405B testMethod=testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128>

    @pytest.mark.xfail(
        reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
    )
    def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128(self):
        output_file_name = self.dir_path_405b / "f16_torch_128"
        output_mlir = self.llama405b_f16_torch_sdpa_artifacts.create_file(
            suffix=".mlir", prefix=output_file_name
        )
        output_json = self.llama405b_f16_torch_sdpa_artifacts.create_file(
            suffix=".json", prefix=output_file_name
        )
        output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file(
            suffix=".vmfb", prefix=output_file_name
        )
        output_shard_file_name = (
            self.artifacts_dir
            / f"tp8/llama3_405b_instruct_fp16_tp{self.tensor_parallelism_size}.irpa"
        )
        if output_shard_file_name.exists():
            self.llama405b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name
>       export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir(
            output_mlir=output_mlir,
            output_config=output_json,
        )

sharktank/tests/models/llama/benchmark_amdgpu_test.py:929:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
sharktank/sharktank/utils/export_artifacts.py:126: in wrapper
    result = func(*args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <sharktank.utils.export_artifacts.ExportArtifacts object at 0x7e3c9a5c2480>

    @timeit
    def export_to_mlir(
        self,
        *,
        output_mlir: str,
        output_config: str,
        skip_decode: Optional[bool] = None,
    ):
        export_args = [
            "python3",
            "-m",
            "sharktank.examples.export_paged_llm_v1",
            f"--irpa-file={self.irpa_path}",
            f"--output-mlir={output_mlir}",
            f"--output-config={output_config}",
            f"--bs-prefill={str(self.batch_size)}",
            f"--bs-decode={str(self.batch_size)}",
            f"--block-seq-stride={self.block_seq_stride}",
            f"--attention-dtype={self.attention_dtype}",
            f"--activation-dtype={self.activation_dtype}",
            f"--tensor-parallelism-size={self.tensor_parallelism_size}",
            f"--pipeline-parallelism-size={self.pipeline_parallelism_size}",
        ]
        assert self.attention_kernel in [
            "decomposed",
            "torch",
            "sharktank",
        ], "Only torch (sdpa), decomposed or sharktank --attention-kernel types are supported"
        export_args.append(f"--attention-kernel={self.attention_kernel}")
        if self.kv_cache_dtype is not None:
            export_args.append(f"--kv-cache-dtype={self.kv_cache_dtype}")
        if skip_decode:
            export_args.append("--skip-decode")
        if self.use_attention_mask:
            export_args.append("--use-attention-mask")
        if self.use_hf:
            export_args.append("--use-hf")
        cwd = self.sharktank_dir
        cmd = subprocess.list2cmdline(export_args)
        logger.info(f" Exporting mlir:\n" f"cd {cwd} && {cmd}")
        proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd, text=True)
        if proc.returncode != 0:
>           raise ExportMlirException(proc, cwd)
E   sharktank.utils.export_artifacts.ExportMlirException: Error invoking export_paged_llama_v1.py
E   Error code: 1
E   Stderr diagnostics:
E   /home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)
E     return torch.from_numpy(wrapper)
E   Traceback (most recent call last):
E     File "<frozen runpy>", line 198, in _run_module_as_main
E     File "<frozen runpy>", line 88, in _run_code
E     File "/home/chi/src/shark-ai/sharktank/sharktank/examples/export_paged_llm_v1.py", line 447, in <module>
E       main()
E     File "/home/chi/src/shark-ai/sharktank/sharktank/examples/export_paged_llm_v1.py", line 425, in main
E       generate_batch_prefill(bs)
E     File "/home/chi/src/shark-ai/sharktank/sharktank/examples/export_paged_llm_v1.py", line 224, in generate_batch_prefill
E       @fxb.export_program(
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/iree/turbine/aot/fx_programs.py", line 239, in export_program
E       program = torch.export.export(
E       ^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/__init__.py", line 368, in export
E       return _export(
E       ^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1035, in wrapper
E       raise e
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1008, in wrapper
E       ep = fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/exported_program.py", line 128, in wrapper
E       return fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1970, in _export
E       return _export_for_training(
E       ^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1035, in wrapper
E       raise e
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1008, in wrapper
E       ep = fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/exported_program.py", line 128, in wrapper
E       return fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1834, in _export_for_training
E       export_artifact = export_func( # type: ignore[operator]
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1772, in _non_strict_export
E       aten_export_artifact = _to_aten_func( # type: ignore[operator]
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1564, in _export_to_aten_ir_make_fx
E       gm, graph_signature = transform(_make_fx_helper)(
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1702, in _aot_export_non_strict
E       gm, sig = aot_export(wrapped_mod, args, kwargs=kwargs, **flags)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1485, in _make_fx_helper
E       gm = make_fx(
E       ^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 2196, in wrapped
E       return make_fx_tracer.trace(f, *args)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 2134, in trace
E       return self._trace_inner(f, *args)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 2105, in _trace_inner
E       t = dispatch_trace(
E       ^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/_compile.py", line 32, in inner
E       return disable_fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 745, in _fn
E       return fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1138, in dispatch_trace
E       graph = tracer.trace(root, concrete_args) # type: ignore[arg-type]
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1694, in trace
E       res = super().trace(root, concrete_args)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 843, in trace
E       (self.create_arg(fn(*args)),),
E       ^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1193, in wrapped
E       out = f(*tensors) # type:ignore[call-arg]
E       ^^^^^^^^^^^
E     File "<string>", line 1, in <lambda>
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1469, in wrapped_fn
E       return tuple(flat_fn(*args))
E       ^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/_functorch/_aot_autograd/utils.py", line 184, in flat_fn
E       tree_out = fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/_functorch/_aot_autograd/traced_function_transforms.py", line 879, in functional_call
E       out = mod(*args[params_len:], **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 821, in module_call_wrapper
E       return self.call_module(mod, forward, args, kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1764, in call_module
E       return Tracer.call_module(self, m, forward, args, kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 539, in call_module
E       ret_val = forward(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 814, in forward
E       return _orig_module_call(mod, *args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
E       return self._call_impl(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
E       return forward_call(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1689, in forward
E       tree_out = mod(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 821, in module_call_wrapper
E       return self.call_module(mod, forward, args, kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1764, in call_module
E       return Tracer.call_module(self, m, forward, args, kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 539, in call_module
E       ret_val = forward(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 814, in forward
E       return _orig_module_call(mod, *args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
E       return self._call_impl(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
E       return forward_call(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/iree/turbine/aot/fx_programs.py", line 226, in new_forward
E       return f(self.root, *forward_args, **forward_kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/sharktank/sharktank/examples/export_paged_llm_v1.py", line 277, in _
E       logits = model.prefill(
E       ^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/sharktank/sharktank/models/llm/llm.py", line 153, in prefill
E       h = self.token_embedding(tokens)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 821, in module_call_wrapper
E       return self.call_module(mod, forward, args, kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1764, in call_module
E       return Tracer.call_module(self, m, forward, args, kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 539, in call_module
E       ret_val = forward(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 814, in forward
E       return _orig_module_call(mod, *args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
E       return self._call_impl(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
E       return forward_call(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/sharktank/sharktank/layers/token_embedding.py", line 28, in forward
E       return ops.embedding_lookup(input, self.weight, dtype=self.dtype)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/sharktank/sharktank/ops/_registry.py", line 197, in __call__
E       selected_override, *results = trampoline(self, *args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/sharktank/sharktank/ops/signatures.py", line 268, in _embedding_lookup_trampoline
E       d.fail(tensors)
E     File "/home/chi/src/shark-ai/sharktank/sharktank/ops/_registry.py", line 246, in fail
E       raise NotImplementedError(
E   NotImplementedError: Overridable operator sharktank.ops.signatures.embedding_lookup does not have an implementation for argument types: [<class 'sharktank.types.tensors.ReplicatedTensor'>, <class 'sharktank.types.tensors.DefaultPrimitiveTensor'>]
E
E
E   Invoked with:
E     cd /home/chi/src/shark-ai && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa --output-mlir=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir --output-config=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.json --bs-prefill=4 --bs-decode=4 --block-seq-stride=32 --attention-dtype=float16 --activation-dtype=float16 --tensor-parallelism-size=8 --pipeline-parallelism-size=1 --attention-kernel=torch

sharktank/sharktank/utils/export_artifacts.py:230: ExportMlirException
-------------------------------------------------- Captured log call ---------------------------------------------------
INFO     eval:export_artifacts.py:226  Exporting mlir:
cd /home/chi/src/shark-ai && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa --output-mlir=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir --output-config=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.json --bs-prefill=4 --bs-decode=4 --block-seq-stride=32 --attention-dtype=float16 --activation-dtype=float16 --tensor-parallelism-size=8 --pipeline-parallelism-size=1 --attention-kernel=torch
=============================================== short test summary info ================================================
FAILED sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_405B::testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 - sharktank.utils.export_artifacts.ExportMlirException: Error invoking export_paged_llama_v1.py
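
Analysis: the export fails inside token_embedding because ops.embedding_lookup has no registered override for the argument-type pair (ReplicatedTensor, DefaultPrimitiveTensor). Note that the invoked command points --irpa-file at the unsharded llama3.1_405b_fp16.irpa, which suggests the tp8 shard file the test checks for (tp8/llama3_405b_instruct_fp16_tp8.irpa) was not found; the exporter was then run with --tensor-parallelism-size=8 on unsharded weights, so the token tensor becomes a ReplicatedTensor while the embedding weight stays a DefaultPrimitiveTensor, a combination the op registry cannot dispatch.

Below is a minimal, self-contained sketch of the type-keyed dispatch pattern that produces this error. It is illustrative only: the real registry lives in sharktank/sharktank/ops/_registry.py and sharktank/sharktank/ops/signatures.py, and every name here apart from the error's shape is an assumption, not sharktank's actual API.

# Hypothetical stand-ins for the two tensor types named in the error.
class DefaultPrimitiveTensor: ...
class ReplicatedTensor: ...

# Maps a tuple of argument types to a registered implementation.
_overrides: dict = {}

def override(*arg_types):
    """Register an embedding_lookup implementation for these argument types."""
    def decorator(fn):
        _overrides[arg_types] = fn
        return fn
    return decorator

def embedding_lookup(input, weight, *, dtype=None):
    """Dispatch on the runtime types of (input, weight)."""
    key = (type(input), type(weight))
    impl = _overrides.get(key)
    if impl is None:
        # This is the failure path hit in the log: nothing is registered
        # for (ReplicatedTensor, DefaultPrimitiveTensor).
        raise NotImplementedError(
            f"Overridable operator embedding_lookup does not have an "
            f"implementation for argument types: {list(key)}"
        )
    return impl(input, weight, dtype=dtype)

# A fix along these lines would register the missing pair, e.g. by first
# replicating the unsharded weight to match the replicated input
# (hypothetical helper, body elided):
@override(ReplicatedTensor, DefaultPrimitiveTensor)
def _embedding_lookup_replicated_input(input, weight, *, dtype=None):
    ...  # replicate `weight` across the same devices as `input`, then look up per shard

Alternatively, generating the tp8-sharded irpa so the test's output_shard_file_name.exists() check passes would avoid exporting unsharded weights under tensor parallelism in the first place.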