(.venv) ➜ shark-ai git:(b5878302) ✗ git checkout 3a73dc3f233bec037275dc75841a9f6112a32a45
M	requirements-iree-pinned.txt
Previous HEAD position was b5878302 Add perplexity calculation for Tensor and Pipeline parallized Llama models (#1279)
HEAD is now at 3a73dc3f [tuner][NFC] remove walk function over input module (#1307)
(.venv) ➜ shark-ai git:(3a73dc3f) ✗ pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --iree-device=hip://4 -k testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128
================================================= test session starts ==================================================
platform linux -- Python 3.12.9, pytest-8.0.0, pluggy-1.5.0 -- /home/chi/src/shark-ai/.venv/bin/python
cachedir: .pytest_cache
metadata: {'Python': '3.12.9', 'Platform': 'Linux-6.8.0-52-generic-x86_64-with-glibc2.35', 'Packages': {'pytest': '8.0.0', 'pluggy': '1.5.0'}, 'Plugins': {'timeout': '2.3.1', 'anyio': '4.9.0', 'metadata': '3.1.1', 'html': '4.1.1', 'asyncio': '0.23.8', 'xdist': '3.5.0'}}
rootdir: /home/chi/src/shark-ai/sharktank
configfile: pyproject.toml
plugins: timeout-2.3.1, anyio-4.9.0, metadata-3.1.1, html-4.1.1, asyncio-0.23.8, xdist-3.5.0
asyncio: mode=Mode.STRICT
collected 13 items / 12 deselected / 1 selected

sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_405B::testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 XFAIL

====================================================== XFAILURES =======================================================
____________________ BenchmarkLlama3_1_405B.testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 _____________________

self = <tests.models.llama.benchmark_amdgpu_test.BenchmarkLlama3_1_405B testMethod=testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128>
    @pytest.mark.xfail(
        reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
    )
    def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128(self):
        output_file_name = self.dir_path_405b / "f16_torch_128"
        output_mlir = self.llama405b_f16_torch_sdpa_artifacts.create_file(
            suffix=".mlir", prefix=output_file_name
        )
        output_json = self.llama405b_f16_torch_sdpa_artifacts.create_file(
            suffix=".json", prefix=output_file_name
        )
        output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file(
            suffix=".vmfb", prefix=output_file_name
        )
        output_shard_file_name = (
            self.artifacts_dir
            / f"tp8/llama3_405b_instruct_fp16_tp{self.tensor_parallelism_size}.irpa"
        )
        if output_shard_file_name.exists():
            self.llama405b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name
        export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir(
            output_mlir=output_mlir,
            output_config=output_json,
        )
        self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb(
            output_mlir=str(output_mlir),
            output_vmfb=output_vmfb,
            hal_dump_path=output_file_name,
            cwd=self.repo_root,
            args=self.compile_args,
        )
        # benchmark prefill
>       self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_prefill_nondecomposed_args_128_tp8_fp16,
            cwd=self.repo_root,
        )
sharktank/tests/models/llama/benchmark_amdgpu_test.py:931:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <sharktank.utils.export_artifacts.ExportArtifacts object at 0x721e9d8d1f10>

    def iree_benchmark_vmfb(
        self,
        *,
        hip_device_id: str,
        vmfb_name: str,
        irpa_path: str,
        benchmark_filename: Optional[Path] = None,
        args: List[str],
        cwd: str | Path,
    ):
        """Runs a compiled program with the given args using `iree-benchmark-module`.

        This assumes that the `iree-benchmark-module` command is available (usually via PATH).

        Args:
            hip_device_id: HIP device URI (e.g. "hip://4"), used when tensor parallelism is 1.
            vmfb_name: Name of the .vmfb file (relative to `cwd`).
            irpa_path: Path to the .irpa parameter file (base path for sharded runs).
            benchmark_filename: Optional file that benchmark output is redirected to.
            args: List of arguments to pass to `iree-benchmark-module`.
            cwd: Working directory to run the command within (either string or Path works).

        Raises:
            IreeBenchmarkException: if `iree-benchmark-module` exits with a nonzero return code.
        """
        benchmark_args = []
        if self.tensor_parallelism_size > 1:
            base_irpa_path, _ = os.path.splitext(irpa_path)
            rocr_visible_devices = [
                f"ROCR_VISIBLE_DEVICES={','.join(str(i) for i in range(self.tensor_parallelism_size))}"
            ]
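            # Sharded runs load the base .irpa plus one .rank{i}.irpa parameter
            # file per tensor-parallel shard, and bind one HIP device per shard.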
            params = [f"--parameters=model={base_irpa_path}.irpa"]
            params += [
                f"--parameters=model={base_irpa_path}.rank{i}.irpa"
                for i in range(self.tensor_parallelism_size)
            ]
            devices = [
                f"--device=hip://{i}" for i in range(self.tensor_parallelism_size)
            ]
        else:
            hip_device_arg = int(hip_device_id.split("://")[1])
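            # Expose devices 0..hip_device_arg so the index in the hip://<id>
            # URI still resolves to the intended physical device.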
            rocr_visible_devices = [
                f"ROCR_VISIBLE_DEVICES={','.join(str(i) for i in range(hip_device_arg + 1))}"
            ]
            params = [f"--parameters=model={irpa_path}"]
            devices = [f"--device={hip_device_id}"]
        benchmark_args += rocr_visible_devices
        benchmark_args += [
            "iree-benchmark-module",
            "--hip_use_streams=true",
            f"--module={vmfb_name}",
        ]
        benchmark_args += params
        benchmark_args += devices
        benchmark_args += args
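        # str(benchmark_filename) is appended unconditionally; with the default
        # benchmark_filename=None this produces the literal trailing "None"
        # (after the ">>" redirect passed in via `args`) seen in the run
        # command below.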
        benchmark_args += [str(benchmark_filename)]
        cmd = subprocess.list2cmdline(benchmark_args)
        logger.info(f" Launching run command:\n" f"cd {cwd} && {cmd}")
        proc = subprocess.run(cmd, shell=True, stdout=sys.stdout, cwd=cwd)
        return_code = proc.returncode
        if return_code != 0:
>           raise IreeBenchmarkException(proc, cwd)
E           sharktank.utils.export_artifacts.IreeBenchmarkException: Error invoking iree-benchmark-module
E           Error code: 5
E           Stderr diagnostics:
E           None
E           Stdout diagnostics:
E           None
E           Run with:
E           cd /home/chi/src/shark-ai && ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 iree-benchmark-module --hip_use_streams=true --module=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.vmfb --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank0.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank1.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank2.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank3.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank4.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank5.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank6.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank7.irpa --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 --function=prefill_bs4 --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/tokens.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/seq_lens.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/seq_block_ids.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_0.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_1.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_2.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_3.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_4.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_5.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_6.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_7.npy --benchmark_repetitions=3 >> None

sharktank/sharktank/utils/export_artifacts.py:335: IreeBenchmarkException
-------------------------------------------------- Captured log call ---------------------------------------------------
INFO eval:export_artifacts.py:219 Exporting mlir:
cd /home/chi/src/shark-ai && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa --output-mlir=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir --output-config=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.json --bs-prefill=4 --bs-decode=4 --block-seq-stride=32 --attention-dtype=float16 --activation-dtype=float16 --attention-kernel=torch
INFO eval:export_artifacts.py:225 Exported to mlir successfully:
Exporting prefill_bs4
Exporting decode_bs4
GENERATED!
Exporting
Saving to '/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir'
INFO eval:export_artifacts.py:137 export_to_mlir: 00 hrs : 03 mins : 30.66 secs
INFO eval:export_artifacts.py:274 Launching compile command:
cd /home/chi/src/shark-ai && iree-compile /home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir --iree-hip-target=gfx942 -o=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.vmfb --iree-hal-target-device=hip[0] --iree-hal-target-device=hip[1] --iree-hal-target-device=hip[2] --iree-hal-target-device=hip[3] --iree-hal-target-device=hip[4] --iree-hal-target-device=hip[5] --iree-hal-target-device=hip[6] --iree-hal-target-device=hip[7] --iree-hal-dump-executable-files-to=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128/files --iree-opt-level=O3 --iree-hal-indirect-command-buffers=true --iree-stream-resource-memory-model=discrete --iree-hal-memoization=true
INFO eval:export_artifacts.py:137 compile_to_vmfb: 45.13 secs
INFO eval:export_artifacts.py:331 Launching run command:
cd /home/chi/src/shark-ai && ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 iree-benchmark-module --hip_use_streams=true --module=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.vmfb --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank0.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank1.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank2.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank3.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank4.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank5.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank6.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank7.irpa --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 --function=prefill_bs4 --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/tokens.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/seq_lens.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/seq_block_ids.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_0.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_1.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_2.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_3.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_4.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_5.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_6.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_7.npy --benchmark_repetitions=3 >> None
=============================================== short test summary info ================================================
XFAIL sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_405B::testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 - Benchmarking Error
==================================== 12 deselected, 1 xfailed in 258.22s (0:04:18) =====================================
(.venv) ➜ shark-ai git:(3a73dc3f) ✗
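
For reference, a minimal standalone repro sketch that re-issues the failing prefill benchmark with stdout/stderr captured in Python, since the harness above reported "Stderr diagnostics: None". Flags, paths, and devices are copied from the logged run command; the script itself (variable names, capture strategy) is illustrative, not shark-ai code. Capturing output in-process also sidesteps the ">> None" shell redirect noted above.

import os
import subprocess

TP = 8
BASE = "/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16"
PREFILL = "/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8"

# Rebuild the exact command from the log: module, base + per-rank parameters,
# one HIP device per shard, and the prefill inputs.
cmd = [
    "iree-benchmark-module",
    "--hip_use_streams=true",
    "--module=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.vmfb",
    f"--parameters=model={BASE}.irpa",
]
cmd += [f"--parameters=model={BASE}.rank{i}.irpa" for i in range(TP)]
cmd += [f"--device=hip://{i}" for i in range(TP)]
cmd += ["--function=prefill_bs4"]
cmd += [f"--input=@{PREFILL}/{name}.npy" for name in ("tokens", "seq_lens", "seq_block_ids")]
cmd += [f"--input=@{PREFILL}/cs_f16_shard_{i}.npy" for i in range(TP)]
cmd += ["--benchmark_repetitions=3"]

# Capture output directly instead of redirecting through the shell, so a
# nonzero exit (error code 5 above) leaves usable diagnostics behind.
env = dict(os.environ, ROCR_VISIBLE_DEVICES=",".join(str(i) for i in range(TP)))
proc = subprocess.run(cmd, env=env, cwd="/home/chi/src/shark-ai",
                      capture_output=True, text=True)
print(proc.stdout)
print(proc.stderr)
print("exit code:", proc.returncode)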