(.venv) ➜ shark-ai git:(b5878302) ✗ git checkout 3a73dc3f233bec037275dc75841a9f6112a32a45
M	requirements-iree-pinned.txt
Previous HEAD position was b5878302 Add perplexity calculation for Tensor and Pipeline parallized Llama models (#1279)
HEAD is now at 3a73dc3f [tuner][NFC] remove walk function over input module (#1307)
(.venv) ➜ shark-ai git:(3a73dc3f) ✗ pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --iree-device=hip://4 -k testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128
================================================= test session starts ==================================================
platform linux -- Python 3.12.9, pytest-8.0.0, pluggy-1.5.0 -- /home/chi/src/shark-ai/.venv/bin/python
cachedir: .pytest_cache
metadata: {'Python': '3.12.9', 'Platform': 'Linux-6.8.0-52-generic-x86_64-with-glibc2.35', 'Packages': {'pytest': '8.0.0', 'pluggy': '1.5.0'}, 'Plugins': {'timeout': '2.3.1', 'anyio': '4.9.0', 'metadata': '3.1.1', 'html': '4.1.1', 'asyncio': '0.23.8', 'xdist': '3.5.0'}}
rootdir: /home/chi/src/shark-ai/sharktank
configfile: pyproject.toml
plugins: timeout-2.3.1, anyio-4.9.0, metadata-3.1.1, html-4.1.1, asyncio-0.23.8, xdist-3.5.0
asyncio: mode=Mode.STRICT
collected 13 items / 12 deselected / 1 selected

sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_405B::testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 XFAIL

====================================================== XFAILURES =======================================================
____________________ BenchmarkLlama3_1_405B.testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 _____________________

self = <tests.models.llama.benchmark_amdgpu_test.BenchmarkLlama3_1_405B testMethod=testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128>
    @pytest.mark.xfail(
        reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
    )
    def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128(self):
        output_file_name = self.dir_path_405b / "f16_torch_128"
        output_mlir = self.llama405b_f16_torch_sdpa_artifacts.create_file(
            suffix=".mlir", prefix=output_file_name
        )
        output_json = self.llama405b_f16_torch_sdpa_artifacts.create_file(
            suffix=".json", prefix=output_file_name
        )
        output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file(
            suffix=".vmfb", prefix=output_file_name
        )
        output_shard_file_name = (
            self.artifacts_dir
            / f"tp8/llama3_405b_instruct_fp16_tp{self.tensor_parallelism_size}.irpa"
        )
        if output_shard_file_name.exists():
            self.llama405b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name
        export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir(
            output_mlir=output_mlir,
            output_config=output_json,
        )
        self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb(
            output_mlir=str(output_mlir),
            output_vmfb=output_vmfb,
            hal_dump_path=output_file_name,
            cwd=self.repo_root,
            args=self.compile_args,
        )
        # benchmark prefill
>       self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_prefill_nondecomposed_args_128_tp8_fp16,
            cwd=self.repo_root,
        )
sharktank/tests/models/llama/benchmark_amdgpu_test.py:931:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <sharktank.utils.export_artifacts.ExportArtifacts object at 0x721e9d8d1f10>

    def iree_benchmark_vmfb(
        self,
        *,
        hip_device_id: str,
        vmfb_name: str,
        irpa_path: str,
        benchmark_filename: Optional[Path] = None,
        args: List[str],
        cwd: str | Path,
    ):
        """Runs a compiled program with the given args using `iree-benchmark-module`.

        This assumes that the `iree-benchmark-module` command is available (usually via PATH).

        Args:
            hip_device_id: HIP device URI (e.g. "hip://4"), used when tensor parallelism is 1.
            vmfb_name: Name of the .vmfb file (relative to `cwd`).
            irpa_path: Path to the .irpa parameter file (base path for sharded runs).
            benchmark_filename: Optional file that benchmark output is redirected to.
            args: List of arguments to pass to `iree-benchmark-module`.
            cwd: Working directory to run the command within (either string or Path works).

        Raises:
            IreeBenchmarkException: if `iree-benchmark-module` exits with a nonzero return code.
        """
        benchmark_args = []
        if self.tensor_parallelism_size > 1:
            base_irpa_path, _ = os.path.splitext(irpa_path)
            rocr_visible_devices = [
                f"ROCR_VISIBLE_DEVICES={','.join(str(i) for i in range(self.tensor_parallelism_size))}"
            ]
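            # Sharded runs load the base .irpa plus one .rank{i}.irpa parameter
            # file per tensor-parallel shard, and bind one HIP device per shard.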
            params = [f"--parameters=model={base_irpa_path}.irpa"]
            params += [
                f"--parameters=model={base_irpa_path}.rank{i}.irpa"
                for i in range(self.tensor_parallelism_size)
            ]
            devices = [
                f"--device=hip://{i}" for i in range(self.tensor_parallelism_size)
            ]
        else:
            hip_device_arg = int(hip_device_id.split("://")[1])
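            # Expose devices 0..hip_device_arg so the index in the hip://<id>
            # URI still resolves to the intended physical device.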
            rocr_visible_devices = [
                f"ROCR_VISIBLE_DEVICES={','.join(str(i) for i in range(hip_device_arg + 1))}"
            ]
            params = [f"--parameters=model={irpa_path}"]
            devices = [f"--device={hip_device_id}"]
        benchmark_args += rocr_visible_devices
        benchmark_args += [
            "iree-benchmark-module",
            "--hip_use_streams=true",
            f"--module={vmfb_name}",
        ]
        benchmark_args += params
        benchmark_args += devices
        benchmark_args += args
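        # str(benchmark_filename) is appended unconditionally; with the default
        # benchmark_filename=None this produces the literal trailing "None"
        # (after the ">>" redirect passed in via `args`) seen in the run
        # command below.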
        benchmark_args += [str(benchmark_filename)]
        cmd = subprocess.list2cmdline(benchmark_args)
        logger.info(f" Launching run command:\n" f"cd {cwd} && {cmd}")
        proc = subprocess.run(cmd, shell=True, stdout=sys.stdout, cwd=cwd)
        return_code = proc.returncode
        if return_code != 0:
>           raise IreeBenchmarkException(proc, cwd)
E           sharktank.utils.export_artifacts.IreeBenchmarkException: Error invoking iree-benchmark-module
E           Error code: 5
E           Stderr diagnostics:
E           None
E           Stdout diagnostics:
E           None
E           Run with:
E           cd /home/chi/src/shark-ai && ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 iree-benchmark-module --hip_use_streams=true --module=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.vmfb --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank0.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank1.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank2.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank3.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank4.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank5.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank6.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank7.irpa --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 --function=prefill_bs4 --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/tokens.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/seq_lens.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/seq_block_ids.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_0.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_1.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_2.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_3.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_4.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_5.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_6.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_7.npy --benchmark_repetitions=3 >> None

sharktank/sharktank/utils/export_artifacts.py:335: IreeBenchmarkException
-------------------------------------------------- Captured log call ---------------------------------------------------
INFO eval:export_artifacts.py:219 Exporting mlir:
cd /home/chi/src/shark-ai && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa --output-mlir=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir --output-config=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.json --bs-prefill=4 --bs-decode=4 --block-seq-stride=32 --attention-dtype=float16 --activation-dtype=float16 --attention-kernel=torch
INFO eval:export_artifacts.py:225 Exported to mlir successfully:
Exporting prefill_bs4
Exporting decode_bs4
GENERATED!
Exporting
Saving to '/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir'
INFO eval:export_artifacts.py:137 export_to_mlir: 00 hrs : 03 mins : 30.66 secs
INFO eval:export_artifacts.py:274 Launching compile command:
cd /home/chi/src/shark-ai && iree-compile /home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir --iree-hip-target=gfx942 -o=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.vmfb --iree-hal-target-device=hip[0] --iree-hal-target-device=hip[1] --iree-hal-target-device=hip[2] --iree-hal-target-device=hip[3] --iree-hal-target-device=hip[4] --iree-hal-target-device=hip[5] --iree-hal-target-device=hip[6] --iree-hal-target-device=hip[7] --iree-hal-dump-executable-files-to=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128/files --iree-opt-level=O3 --iree-hal-indirect-command-buffers=true --iree-stream-resource-memory-model=discrete --iree-hal-memoization=true
INFO eval:export_artifacts.py:137 compile_to_vmfb: 45.13 secs
INFO eval:export_artifacts.py:331 Launching run command:
cd /home/chi/src/shark-ai && ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 iree-benchmark-module --hip_use_streams=true --module=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.vmfb --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank0.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank1.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank2.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank3.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank4.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank5.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank6.irpa --parameters=model=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.rank7.irpa --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 --function=prefill_bs4 --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/tokens.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/seq_lens.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/seq_block_ids.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_0.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_1.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_2.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_3.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_4.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_5.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_6.npy --input=@/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_7.npy --benchmark_repetitions=3 >> None
=============================================== short test summary info ================================================
XFAIL sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_405B::testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 - Benchmarking Error
==================================== 12 deselected, 1 xfailed in 258.22s (0:04:18) =====================================
(.venv) ➜ shark-ai git:(3a73dc3f) ✗
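
For reference, a minimal standalone repro sketch that re-issues the failing prefill benchmark with stdout/stderr captured in Python, since the harness above reported "Stderr diagnostics: None". Flags, paths, and devices are copied from the logged run command; the script itself (variable names, capture strategy) is illustrative, not shark-ai code. Capturing output in-process also sidesteps the ">> None" shell redirect noted above.

import os
import subprocess

TP = 8
BASE = "/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16"
PREFILL = "/shark-dev/405b/prefill_args_bs4_128_stride_32_tp8"

# Rebuild the exact command from the log: module, base + per-rank parameters,
# one HIP device per shard, and the prefill inputs.
cmd = [
    "iree-benchmark-module",
    "--hip_use_streams=true",
    "--module=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.vmfb",
    f"--parameters=model={BASE}.irpa",
]
cmd += [f"--parameters=model={BASE}.rank{i}.irpa" for i in range(TP)]
cmd += [f"--device=hip://{i}" for i in range(TP)]
cmd += ["--function=prefill_bs4"]
cmd += [f"--input=@{PREFILL}/{name}.npy" for name in ("tokens", "seq_lens", "seq_block_ids")]
cmd += [f"--input=@{PREFILL}/cs_f16_shard_{i}.npy" for i in range(TP)]
cmd += ["--benchmark_repetitions=3"]

# Capture output directly instead of redirecting through the shell, so a
# nonzero exit (error code 5 above) leaves usable diagnostics behind.
env = dict(os.environ, ROCR_VISIBLE_DEVICES=",".join(str(i) for i in range(TP)))
proc = subprocess.run(cmd, env=env, cwd="/home/chi/src/shark-ai",
                      capture_output=True, text=True)
print(proc.stdout)
print(proc.stderr)
print("exit code:", proc.returncode)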