Skip to content

Instantly share code, notes, and snippets.

@AmosLewis
Created May 1, 2025 16:26
Show Gist options
  • Save AmosLewis/8b0f6f6aabafda9bdd359a9ddad5f6bf to your computer and use it in GitHub Desktop.
(.venv) ➜ shark-ai git:(b5878302) ✗ git checkout 3a73dc3f233bec037275dc75841a9f6112a32a45
M requirements-iree-pinned.txt
Previous HEAD position was b5878302 Add perplexity calculation for Tensor and Pipeline parallized Llama models (#1279)
HEAD is now at 3a73dc3f [tuner][NFC] remove walk function over input module (#1307)
(.venv) ➜ shark-ai git:(3a73dc3f) ✗ pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --iree-device=hip://4 -k testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128
================================================= test session starts ==================================================
platform linux -- Python 3.12.9, pytest-8.0.0, pluggy-1.5.0 -- /home/chi/src/shark-ai/.venv/bin/python
cachedir: .pytest_cache
metadata: {'Python': '3.12.9', 'Platform': 'Linux-6.8.0-52-generic-x86_64-with-glibc2.35', 'Packages': {'pytest': '8.0.0', 'pluggy': '1.5.0'}, 'Plugins': {'timeout': '2.3.1', 'anyio': '4.9.0', 'metadata': '3.1.1', 'html': '4.1.1', 'asyncio': '0.23.8', 'xdist': '3.5.0'}}
rootdir: /home/chi/src/shark-ai/sharktank
configfile: pyproject.toml
plugins: timeout-2.3.1, anyio-4.9.0, metadata-3.1.1, html-4.1.1, asyncio-0.23.8, xdist-3.5.0
asyncio: mode=Mode.STRICT
collected 13 items / 12 deselected / 1 selected
sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_405B::testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 XFAIL
====================================================== XFAILURES =======================================================
____________________ BenchmarkLlama3_1_405B.testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 _____________________
self = <tests.models.llama.benchmark_amdgpu_test.BenchmarkLlama3_1_405B testMethod=testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128>
@pytest.mark.xfail(
reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
)
def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128(self):
output_file_name = self.dir_path_405b / "f16_torch_128"
output_mlir = self.llama405b_f16_torch_sdpa_artifacts.create_file(
suffix=".mlir", prefix=output_file_name
)
output_json = self.llama405b_f16_torch_sdpa_artifacts.create_file(
suffix=".json", prefix=output_file_name
)
output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file(
suffix=".vmfb", prefix=output_file_name
)
output_shard_file_name = (
self.artifacts_dir
/ f"tp8/llama3_405b_instruct_fp16_tp{self.tensor_parallelism_size}.irpa"
)
if output_shard_file_name.exists():
self.llama405b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name
export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir(
output_mlir=output_mlir,
output_config=output_json,
)
self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb(
output_mlir=str(output_mlir),
output_vmfb=output_vmfb,
hal_dump_path=output_file_name,
cwd=self.repo_root,
args=self.compile_args,
)
# benchmark prefill
> self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_prefill_nondecomposed_args_128_tp8_fp16,
cwd=self.repo_root,
)
sharktank/tests/models/llama/benchmark_amdgpu_test.py:931:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <sharktank.utils.export_artifacts.ExportArtifacts object at 0x721e9d8d1f10>
def iree_benchmark_vmfb(
self,
*,
hip_device_id: str,
vmfb_name: str,
irpa_path: str,
benchmark_filename: Optional[Path] = None,
args: List[str],
cwd: str | Path,
):
"""Runs a compiled program with the given args using `iree-benchmark-module`.
This assumes that the `iree-benchmark-module` command is available (usually via PATH).
Args:
vmfb_name: Name of the .vmfb file (relative to `cwd`).
args: List of arguments to pass to `iree-benchmark-module`.
cwd: Working directory to run the command within. (either string or Path works)
compile_cmd: Command used to compile the program, for inclusion in error messages.
Raises Exception if running fails for some reason.
"""
benchmark_args = []
if self.tensor_parallelism_size > 1:
base_irpa_path, _ = os.path.splitext(irpa_path)
rocr_visible_devices = [
f"ROCR_VISIBLE_DEVICES={','.join(str(i) for i in range(self.tensor_parallelism_size))}"
]
params = [f"--parameters=model={base_irpa_path}.irpa"]
params += [
f"--parameters=model={base_irpa_path}.rank{i}.irpa"
for i in range(self.tensor_parallelism_size)
]
devices = [
f"--device=hip://{i}" for i in range(self.tensor_parallelism_size)
]
else:
hip_device_arg = int(hip_device_id.split("://")[1])
rocr_visible_devices = [
f"ROCR_VISIBLE_DEVICES={','.join(str(i) for i in range(hip_device_arg + 1))}"
]
params = [f"--parameters=model={irpa_path}"]
devices = [f"--device={hip_device_id}"]
benchmark_args += rocr_visible_devices
benchmark_args += [
"iree-benchmark-module",
"--hip_use_streams=true",
f"--module={vmfb_name}",
]
benchmark_args += params
benchmark_args += devices
benchmark_args += args
benchmark_args += [str(benchmark_filename)]
cmd = subprocess.list2cmdline(benchmark_args)
logger.info(f" Launching run command:\n" f"cd {cwd} && {cmd}")
proc = subprocess.run(cmd, shell=True, stdout=sys.stdout, cwd=cwd)
return_code = proc.returncode
if return_code != 0:
> raise IreeBenchmarkException(proc, cwd)
E sharktank.utils.export_artifacts.IreeBenchmarkException: Error invoking iree-benchmark-module
E Error code: 5
E Stderr diagnostics:
E None
E Stdout diagnostics:
E None
E Run with:
E cd /home/chi/src/shark-ai && ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 iree-benchmark-module --hip_use_streams=true --module=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.vmfb --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank0.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank1.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank2.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank3.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank4.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank5.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank6.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank7.irpa --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 --function=prefill_bs4 --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/tokens.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/seq_lens.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/seq_block_ids.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_0.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_1.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_2.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_3.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_4.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_5.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_6.npy 
--input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_7.npy --benchmark_repetitions=3 >> None
sharktank/sharktank/utils/export_artifacts.py:335: IreeBenchmarkException
-------------------------------------------------- Captured log call ---------------------------------------------------
INFO eval:export_artifacts.py:219 Exporting mlir:
cd /home/chi/src/shark-ai && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa --output-mlir=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir --output-config=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.json --bs-prefill=4 --bs-decode=4 --block-seq-stride=32 --attention-dtype=float16 --activation-dtype=float16 --attention-kernel=torch
INFO eval:export_artifacts.py:225 Exported to mlir successfully:
Exporting prefill_bs4
Exporting decode_bs4
GENERATED!
Exporting
Saving to '/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir'
INFO eval:export_artifacts.py:137 export_to_mlir: 00 hrs : 03 mins : 30.66 secs
INFO eval:export_artifacts.py:274 Launching compile command:
cd /home/chi/src/shark-ai && iree-compile /home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir --iree-hip-target=gfx942 -o=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.vmfb --iree-hal-target-device=hip[0] --iree-hal-target-device=hip[1] --iree-hal-target-device=hip[2] --iree-hal-target-device=hip[3] --iree-hal-target-device=hip[4] --iree-hal-target-device=hip[5] --iree-hal-target-device=hip[6] --iree-hal-target-device=hip[7] --iree-hal-dump-executable-files-to=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128/files --iree-opt-level=O3 --iree-hal-indirect-command-buffers=true --iree-stream-resource-memory-model=discrete --iree-hal-memoization=true
INFO eval:export_artifacts.py:137 compile_to_vmfb: 45.13 secs
INFO eval:export_artifacts.py:331 Launching run command:
cd /home/chi/src/shark-ai && ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 iree-benchmark-module --hip_use_streams=true --module=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.vmfb --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank0.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank1.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank2.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank3.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank4.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank5.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank6.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank7.irpa --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 --function=prefill_bs4 --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/tokens.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/seq_lens.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/seq_block_ids.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_0.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_1.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_2.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_3.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_4.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_5.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_6.npy 
--input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_7.npy --benchmark_repetitions=3 >> None
=============================================== short test summary info ================================================
XFAIL sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_405B::testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 - Benchmarking Error
==================================== 12 deselected, 1 xfailed in 258.22s (0:04:18) =====================================
(.venv) ➜ shark-ai git:(3a73dc3f) ✗
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment