(.venv) ➜ shark-ai git:(ad958230) ✗ git checkout b58783029e0cc3e1890b22339d470c394a66dcb4
M	requirements-iree-pinned.txt
Previous HEAD position was ad958230 Add ops.mean for SplitPrimitiveTensor (#1308)
HEAD is now at b5878302 Add perplexity calculation for Tensor and Pipeline parallized Llama models (#1279)
(.venv) ➜ shark-ai git:(b5878302) ✗ pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --iree-device=hip://4 -k testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128
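
The selected test drives ExportArtifacts.export_to_mlir, which shells out to sharktank.examples.export_paged_llm_v1. The exact export command it builds is reproduced verbatim in the failure output below and can be rerun standalone to reproduce the error without pytest:

  cd /home/chi/src/shark-ai && python3 -m sharktank.examples.export_paged_llm_v1 \
    --irpa-file=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa \
    --output-mlir=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir \
    --output-config=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.json \
    --bs-prefill=4 --bs-decode=4 --block-seq-stride=32 \
    --attention-dtype=float16 --activation-dtype=float16 \
    --tensor-parallelism-size=8 --pipeline-parallelism-size=1 \
    --attention-kernel=torch
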
================================================= test session starts ==================================================
platform linux -- Python 3.12.9, pytest-8.0.0, pluggy-1.5.0 -- /home/chi/src/shark-ai/.venv/bin/python
cachedir: .pytest_cache
metadata: {'Python': '3.12.9', 'Platform': 'Linux-6.8.0-52-generic-x86_64-with-glibc2.35', 'Packages': {'pytest': '8.0.0', 'pluggy': '1.5.0'}, 'Plugins': {'timeout': '2.3.1', 'anyio': '4.9.0', 'metadata': '3.1.1', 'html': '4.1.1', 'asyncio': '0.23.8', 'xdist': '3.5.0'}}
rootdir: /home/chi/src/shark-ai/sharktank
configfile: pyproject.toml
plugins: timeout-2.3.1, anyio-4.9.0, metadata-3.1.1, html-4.1.1, asyncio-0.23.8, xdist-3.5.0
asyncio: mode=Mode.STRICT
collected 13 items / 12 deselected / 1 selected

sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_405B::testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 FAILED
======================================================= FAILURES =======================================================
____________________ BenchmarkLlama3_1_405B.testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 _____________________

self = <tests.models.llama.benchmark_amdgpu_test.BenchmarkLlama3_1_405B testMethod=testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128>

    @pytest.mark.xfail(
        reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
    )
    def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128(self):
        output_file_name = self.dir_path_405b / "f16_torch_128"
        output_mlir = self.llama405b_f16_torch_sdpa_artifacts.create_file(
            suffix=".mlir", prefix=output_file_name
        )
        output_json = self.llama405b_f16_torch_sdpa_artifacts.create_file(
            suffix=".json", prefix=output_file_name
        )
        output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file(
            suffix=".vmfb", prefix=output_file_name
        )
        output_shard_file_name = (
            self.artifacts_dir
            / f"tp8/llama3_405b_instruct_fp16_tp{self.tensor_parallelism_size}.irpa"
        )
        if output_shard_file_name.exists():
            self.llama405b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name
>       export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir(
            output_mlir=output_mlir,
            output_config=output_json,
        )

sharktank/tests/models/llama/benchmark_amdgpu_test.py:929:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
sharktank/sharktank/utils/export_artifacts.py:126: in wrapper
    result = func(*args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <sharktank.utils.export_artifacts.ExportArtifacts object at 0x7e3c9a5c2480>

    @timeit
    def export_to_mlir(
        self,
        *,
        output_mlir: str,
        output_config: str,
        skip_decode: Optional[bool] = None,
    ):
        export_args = [
            "python3",
            "-m",
            "sharktank.examples.export_paged_llm_v1",
            f"--irpa-file={self.irpa_path}",
            f"--output-mlir={output_mlir}",
            f"--output-config={output_config}",
            f"--bs-prefill={str(self.batch_size)}",
            f"--bs-decode={str(self.batch_size)}",
            f"--block-seq-stride={self.block_seq_stride}",
            f"--attention-dtype={self.attention_dtype}",
            f"--activation-dtype={self.activation_dtype}",
            f"--tensor-parallelism-size={self.tensor_parallelism_size}",
            f"--pipeline-parallelism-size={self.pipeline_parallelism_size}",
        ]
        assert self.attention_kernel in [
            "decomposed",
            "torch",
            "sharktank",
        ], "Only torch (sdpa), decomposed or sharktank --attention-kernel types are supported"
        export_args.append(f"--attention-kernel={self.attention_kernel}")
        if self.kv_cache_dtype is not None:
            export_args.append(f"--kv-cache-dtype={self.kv_cache_dtype}")
        if skip_decode:
            export_args.append("--skip-decode")
        if self.use_attention_mask:
            export_args.append("--use-attention-mask")
        if self.use_hf:
            export_args.append("--use-hf")
        cwd = self.sharktank_dir
        cmd = subprocess.list2cmdline(export_args)
        logger.info(f" Exporting mlir:\n" f"cd {cwd} && {cmd}")
        proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd, text=True)
        if proc.returncode != 0:
>           raise ExportMlirException(proc, cwd)
E   sharktank.utils.export_artifacts.ExportMlirException: Error invoking export_paged_llama_v1.py
E   Error code: 1
E   Stderr diagnostics:
E   /home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)
E     return torch.from_numpy(wrapper)
E   Traceback (most recent call last):
E     File "<frozen runpy>", line 198, in _run_module_as_main
E     File "<frozen runpy>", line 88, in _run_code
E     File "/home/chi/src/shark-ai/sharktank/sharktank/examples/export_paged_llm_v1.py", line 447, in <module>
E       main()
E     File "/home/chi/src/shark-ai/sharktank/sharktank/examples/export_paged_llm_v1.py", line 425, in main
E       generate_batch_prefill(bs)
E     File "/home/chi/src/shark-ai/sharktank/sharktank/examples/export_paged_llm_v1.py", line 224, in generate_batch_prefill
E       @fxb.export_program(
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/iree/turbine/aot/fx_programs.py", line 239, in export_program
E       program = torch.export.export(
E       ^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/__init__.py", line 368, in export
E       return _export(
E       ^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1035, in wrapper
E       raise e
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1008, in wrapper
E       ep = fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/exported_program.py", line 128, in wrapper
E       return fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1970, in _export
E       return _export_for_training(
E       ^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1035, in wrapper
E       raise e
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1008, in wrapper
E       ep = fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/exported_program.py", line 128, in wrapper
E       return fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1834, in _export_for_training
E       export_artifact = export_func( # type: ignore[operator]
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1772, in _non_strict_export
E       aten_export_artifact = _to_aten_func( # type: ignore[operator]
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1564, in _export_to_aten_ir_make_fx
E       gm, graph_signature = transform(_make_fx_helper)(
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1702, in _aot_export_non_strict
E       gm, sig = aot_export(wrapped_mod, args, kwargs=kwargs, **flags)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1485, in _make_fx_helper
E       gm = make_fx(
E       ^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 2196, in wrapped
E       return make_fx_tracer.trace(f, *args)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 2134, in trace
E       return self._trace_inner(f, *args)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 2105, in _trace_inner
E       t = dispatch_trace(
E       ^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/_compile.py", line 32, in inner
E       return disable_fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 745, in _fn
E       return fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1138, in dispatch_trace
E       graph = tracer.trace(root, concrete_args) # type: ignore[arg-type]
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1694, in trace
E       res = super().trace(root, concrete_args)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 843, in trace
E       (self.create_arg(fn(*args)),),
E       ^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1193, in wrapped
E       out = f(*tensors) # type:ignore[call-arg]
E       ^^^^^^^^^^^
E     File "<string>", line 1, in <lambda>
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1469, in wrapped_fn
E       return tuple(flat_fn(*args))
E       ^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/_functorch/_aot_autograd/utils.py", line 184, in flat_fn
E       tree_out = fn(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/_functorch/_aot_autograd/traced_function_transforms.py", line 879, in functional_call
E       out = mod(*args[params_len:], **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 821, in module_call_wrapper
E       return self.call_module(mod, forward, args, kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1764, in call_module
E       return Tracer.call_module(self, m, forward, args, kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 539, in call_module
E       ret_val = forward(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 814, in forward
E       return _orig_module_call(mod, *args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
E       return self._call_impl(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
E       return forward_call(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/export/_trace.py", line 1689, in forward
E       tree_out = mod(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 821, in module_call_wrapper
E       return self.call_module(mod, forward, args, kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1764, in call_module
E       return Tracer.call_module(self, m, forward, args, kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 539, in call_module
E       ret_val = forward(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 814, in forward
E       return _orig_module_call(mod, *args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
E       return self._call_impl(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
E       return forward_call(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/iree/turbine/aot/fx_programs.py", line 226, in new_forward
E       return f(self.root, *forward_args, **forward_kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/sharktank/sharktank/examples/export_paged_llm_v1.py", line 277, in _
E       logits = model.prefill(
E       ^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/sharktank/sharktank/models/llm/llm.py", line 153, in prefill
E       h = self.token_embedding(tokens)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 821, in module_call_wrapper
E       return self.call_module(mod, forward, args, kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1764, in call_module
E       return Tracer.call_module(self, m, forward, args, kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 539, in call_module
E       ret_val = forward(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 814, in forward
E       return _orig_module_call(mod, *args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
E       return self._call_impl(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
E       return forward_call(*args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/sharktank/sharktank/layers/token_embedding.py", line 28, in forward
E       return ops.embedding_lookup(input, self.weight, dtype=self.dtype)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/sharktank/sharktank/ops/_registry.py", line 197, in __call__
E       selected_override, *results = trampoline(self, *args, **kwargs)
E       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E     File "/home/chi/src/shark-ai/sharktank/sharktank/ops/signatures.py", line 268, in _embedding_lookup_trampoline
E       d.fail(tensors)
E     File "/home/chi/src/shark-ai/sharktank/sharktank/ops/_registry.py", line 246, in fail
E       raise NotImplementedError(
E   NotImplementedError: Overridable operator sharktank.ops.signatures.embedding_lookup does not have an implementation for argument types: [<class 'sharktank.types.tensors.ReplicatedTensor'>, <class 'sharktank.types.tensors.DefaultPrimitiveTensor'>]
E
E
E   Invoked with:
E     cd /home/chi/src/shark-ai && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa --output-mlir=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir --output-config=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.json --bs-prefill=4 --bs-decode=4 --block-seq-stride=32 --attention-dtype=float16 --activation-dtype=float16 --tensor-parallelism-size=8 --pipeline-parallelism-size=1 --attention-kernel=torch

sharktank/sharktank/utils/export_artifacts.py:230: ExportMlirException
-------------------------------------------------- Captured log call ---------------------------------------------------
INFO     eval:export_artifacts.py:226  Exporting mlir:
cd /home/chi/src/shark-ai && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa --output-mlir=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.mlir --output-config=/home/chi/src/shark-ai/2025-05-01/llama-405b/f16_torch_128.json --bs-prefill=4 --bs-decode=4 --block-seq-stride=32 --attention-dtype=float16 --activation-dtype=float16 --tensor-parallelism-size=8 --pipeline-parallelism-size=1 --attention-kernel=torch
=============================================== short test summary info ================================================
FAILED sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_405B::testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128 - sharktank.utils.export_artifacts.ExportMlirException: Error invoking export_paged_llama_v1.py
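
Analysis: the export fails inside token_embedding because ops.embedding_lookup has no registered override for the argument-type pair (ReplicatedTensor, DefaultPrimitiveTensor). Note that the invoked command points --irpa-file at the unsharded llama3.1_405b_fp16.irpa, which suggests the tp8 shard file the test checks for (tp8/llama3_405b_instruct_fp16_tp8.irpa) was not found; the exporter was then run with --tensor-parallelism-size=8 on unsharded weights, so the token tensor becomes a ReplicatedTensor while the embedding weight stays a DefaultPrimitiveTensor, a combination the op registry cannot dispatch.

Below is a minimal, self-contained sketch of the type-keyed dispatch pattern that produces this error. It is illustrative only: the real registry lives in sharktank/sharktank/ops/_registry.py and sharktank/sharktank/ops/signatures.py, and every name here apart from the error's shape is an assumption, not sharktank's actual API.

# Hypothetical stand-ins for the two tensor types named in the error.
class DefaultPrimitiveTensor: ...
class ReplicatedTensor: ...

# Maps a tuple of argument types to a registered implementation.
_overrides: dict = {}

def override(*arg_types):
    """Register an embedding_lookup implementation for these argument types."""
    def decorator(fn):
        _overrides[arg_types] = fn
        return fn
    return decorator

def embedding_lookup(input, weight, *, dtype=None):
    """Dispatch on the runtime types of (input, weight)."""
    key = (type(input), type(weight))
    impl = _overrides.get(key)
    if impl is None:
        # This is the failure path hit in the log: nothing is registered
        # for (ReplicatedTensor, DefaultPrimitiveTensor).
        raise NotImplementedError(
            f"Overridable operator embedding_lookup does not have an "
            f"implementation for argument types: {list(key)}"
        )
    return impl(input, weight, dtype=dtype)

# A fix along these lines would register the missing pair, e.g. by first
# replicating the unsharded weight to match the replicated input
# (hypothetical helper, body elided):
@override(ReplicatedTensor, DefaultPrimitiveTensor)
def _embedding_lookup_replicated_input(input, weight, *, dtype=None):
    ...  # replicate `weight` across the same devices as `input`, then look up per shard

Alternatively, generating the tp8-sharded irpa so the test's output_shard_file_name.exists() check passes would avoid exporting unsharded weights under tensor parallelism in the first place.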