(.venv) ➜ shark-ai git:(chi/xfail_f16) ✗ pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s -m "expensive" --iree-hip-target=gfx942 --iree-device=hip://4 -k testBenchmark8B_fp8_TP1_Non_Decomposed
======================================================================================= test session starts =======================================================================================
platform linux -- Python 3.12.9, pytest-8.0.0, pluggy-1.5.0 -- /home/chi/src/shark-ai/.venv/bin/python
cachedir: .pytest_cache
metadata: {'Python': '3.12.9', 'Platform': 'Linux-6.8.0-52-generic-x86_64-with-glibc2.35', 'Packages': {'pytest': '8.0.0', 'pluggy': '1.5.0'}, 'Plugins': {'timeout': '2.3.1', 'anyio': '4.9.0', 'metadata': '3.1.1', 'html': '4.1.1', 'asyncio': '0.23.8', 'xdist': '3.5.0'}}
rootdir: /home/chi/src/shark-ai/sharktank
configfile: pyproject.toml
plugins: timeout-2.3.1, anyio-4.9.0, metadata-3.1.1, html-4.1.1, asyncio-0.23.8, xdist-3.5.0
asyncio: mode=Mode.STRICT
collected 13 items / 12 deselected / 1 selected
sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_8B::testBenchmark8B_fp8_TP1_Non_Decomposed
2025-04-07T17:35:38-07:00
Running /home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/iree/_runtime_libs/iree-benchmark-module
Run on (96 X 3810.79 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x96)
L1 Instruction 32 KiB (x96)
L2 Unified 1024 KiB (x96)
L3 Unified 32768 KiB (x16)
Load Average: 5.01, 2.31, 2.13
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
2025-04-07T17:35:43-07:00
Running /home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/iree/_runtime_libs/iree-benchmark-module
Run on (96 X 3810.79 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x96)
L1 Instruction 32 KiB (x96)
L2 Unified 1024 KiB (x96)
L3 Unified 32768 KiB (x16)
Load Average: 4.69, 2.29, 2.13
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
FAILED
============================================================================================ FAILURES =============================================================================================
___________________________________________________________________ BenchmarkLlama3_1_8B.testBenchmark8B_fp8_TP1_Non_Decomposed ___________________________________________________________________
[XPASS(strict)] Benchmarking Error
---------------------------------------------------------------------------------------- Captured log call ----------------------------------------------------------------------------------------
INFO eval:export_artifacts.py:216 Exporting mlir:
cd /home/chi/src/shark-ai && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/shark-dev/8b/fp8/native_fp8_e4m3fnuz_llama3_8b.irpa --output-mlir=/home/chi/src/shark-ai/2025-04-07/llama-8b/fp8_torch_tp1.mlir --output-config=/home/chi/src/shark-ai/2025-04-07/llama-8b/fp8_torch_tp1.json --bs-prefill=4 --bs-decode=4 --block-seq-stride=32 --attention-dtype=bfloat16 --activation-dtype=bfloat16 --kv-cache-dtype=float8_e4m3fnuz --attention-kernel=torch --use-hf
INFO eval:export_artifacts.py:222 Exported to mlir successfully:
Exporting prefill_bs4
Exporting decode_bs4
GENERATED!
Exporting
Saving to '/home/chi/src/shark-ai/2025-04-07/llama-8b/fp8_torch_tp1.mlir'
INFO eval:export_artifacts.py:140 export_to_mlir: 00 hrs : 01 mins : 30.95 secs
INFO eval:export_artifacts.py:271 Launching compile command:
cd /home/chi/src/shark-ai && iree-compile /home/chi/src/shark-ai/2025-04-07/llama-8b/fp8_torch_tp1.mlir --iree-hip-target=gfx942 -o=/home/chi/src/shark-ai/2025-04-07/llama-8b/fp8_torch_tp1.vmfb --iree-hal-target-device=hip --iree-hal-dump-executable-files-to=/home/chi/src/shark-ai/2025-04-07/llama-8b/fp8_torch_tp1/files --iree-opt-level=O3 --iree-hal-indirect-command-buffers=true --iree-stream-resource-memory-model=discrete --iree-hal-memoization=true
INFO eval:export_artifacts.py:140 compile_to_vmfb: 19.67 secs
INFO eval:export_artifacts.py:328 Launching run command:
cd /home/chi/src/shark-ai && ROCR_VISIBLE_DEVICES=0,1,2,3,4 iree-benchmark-module --hip_use_streams=true --module=/home/chi/src/shark-ai/2025-04-07/llama-8b/fp8_torch_tp1.vmfb --parameters=model=/shark-dev/8b/fp8/native_fp8_e4m3fnuz_llama3_8b.irpa --device=hip://4 --function=prefill_bs4 --input=4x128xi64=@/shark-dev/8b/prefill_args_fp8/tokens.bin --input=4xi64=@/shark-dev/8b/prefill_args_fp8/seq_lens.bin --input=4x4xi64=@/shark-dev/8b/prefill_args_fp8/seq_block_ids.bin --input=261x2097152xf8E4M3FNUZ=@/shark-dev/8b/prefill_args_fp8/cs_f8E4M3FNUZ.bin --benchmark_repetitions=3 >> /home/chi/src/shark-ai/2025-04-07/llama-8b/fp8_torch_tp1.txt
INFO eval:export_artifacts.py:328 Launching run command:
cd /home/chi/src/shark-ai && ROCR_VISIBLE_DEVICES=0,1,2,3,4 iree-benchmark-module --hip_use_streams=true --module=/home/chi/src/shark-ai/2025-04-07/llama-8b/fp8_torch_tp1.vmfb --parameters=model=/shark-dev/8b/fp8/native_fp8_e4m3fnuz_llama3_8b.irpa --device=hip://4 --function=decode_bs4 --input=4x1xi64=@/shark-dev/8b/decode_args_fp8/next_tokens.bin --input=4xi64=@/shark-dev/8b/decode_args_fp8/seq_lens.bin --input=4xi64=@/shark-dev/8b/decode_args_fp8/start_positions.bin --input=4x5xi64=@/shark-dev/8b/decode_args_fp8/seq_block_ids.bin --input=261x2097152xf8E4M3FNUZ=@/shark-dev/8b/decode_args_fp8/cs_f8E4M3FNUZ.bin --benchmark_repetitions=3 >> /home/chi/src/shark-ai/2025-04-07/llama-8b/fp8_torch_tp1.txt
===================================================================================== short test summary info =====================================================================================
FAILED sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_8B::testBenchmark8B_fp8_TP1_Non_Decomposed
========================================================================== 1 failed, 12 deselected in 122.52s (0:02:02) ===========================================================
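For reference, the first run fails with [XPASS(strict)]: the test is marked pytest.mark.xfail with strict=True and raises=IreeBenchmarkException, so when the benchmark unexpectedly runs to completion the XPASS is reported as a failure. A minimal, self-contained sketch of that pytest behavior (illustrative names only, not the actual shark-ai test):

    import pytest

    class IreeBenchmarkException(Exception):
        """Stand-in for sharktank.utils.export_artifacts.IreeBenchmarkException."""

    BENCHMARK_SUCCEEDS = True  # flip to False to simulate the expected failure path

    @pytest.mark.xfail(
        reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
    )
    def test_benchmark_sketch():
        if not BENCHMARK_SUCCEEDS:
            # Expected path: the raised exception matches `raises`, so pytest
            # reports the test as XFAIL.
            raise IreeBenchmarkException("iree-benchmark-module failed")
        # Unexpected path: the test body completes, and because strict=True the
        # XPASS is reported as a failure -- "[XPASS(strict)] Benchmarking Error".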
AmosLewis commented Apr 8, 2025

If the xfail marker is deleted (commented out below), the test fails with the underlying benchmark error instead:

   # @pytest.mark.xfail(
   #     reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
   # )
(.venv) ➜  shark-ai git:(chi/xfail_f16) ✗ pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s -m "expensive" --iree-hip-target=gfx942 --iree-device=hip://4 -k testBenchmark8B_fp8_TP1_Non_Decomposed
======================================================================================= test session starts =======================================================================================
platform linux -- Python 3.12.9, pytest-8.0.0, pluggy-1.5.0 -- /home/chi/src/shark-ai/.venv/bin/python
cachedir: .pytest_cache
metadata: {'Python': '3.12.9', 'Platform': 'Linux-6.8.0-52-generic-x86_64-with-glibc2.35', 'Packages': {'pytest': '8.0.0', 'pluggy': '1.5.0'}, 'Plugins': {'timeout': '2.3.1', 'anyio': '4.9.0', 'metadata': '3.1.1', 'html': '4.1.1', 'asyncio': '0.23.8', 'xdist': '3.5.0'}}
rootdir: /home/chi/src/shark-ai/sharktank
configfile: pyproject.toml
plugins: timeout-2.3.1, anyio-4.9.0, metadata-3.1.1, html-4.1.1, asyncio-0.23.8, xdist-3.5.0
asyncio: mode=Mode.STRICT
collected 13 items / 12 deselected / 1 selected                                                                                                                                                   

sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_8B::testBenchmark8B_fp8_TP1_Non_Decomposed 
2025-04-08T09:22:45-07:00
Running /home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/iree/_runtime_libs/iree-benchmark-module
Run on (96 X 3810.79 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x96)
  L1 Instruction 32 KiB (x96)
  L2 Unified 1024 KiB (x96)
  L3 Unified 32768 KiB (x16)
Load Average: 3.09, 21.26, 48.42
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
2025-04-08T09:22:49-07:00
Running /home/chi/src/shark-ai/.venv/lib/python3.12/site-packages/iree/_runtime_libs/iree-benchmark-module
Run on (96 X 3810.79 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x96)
  L1 Instruction 32 KiB (x96)
  L2 Unified 1024 KiB (x96)
  L3 Unified 32768 KiB (x16)
Load Average: 3.09, 21.26, 48.42
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
Memory access fault by GPU node-6 (Agent handle: 0x25386130) on address 0x76d205400000. Reason: Unknown.
FAILED

============================================================================================ FAILURES =============================================================================================
___________________________________________________________________ BenchmarkLlama3_1_8B.testBenchmark8B_fp8_TP1_Non_Decomposed ___________________________________________________________________

self = <tests.models.llama.benchmark_amdgpu_test.BenchmarkLlama3_1_8B testMethod=testBenchmark8B_fp8_TP1_Non_Decomposed>

    @skipif_run_quick_llama_test
    # @pytest.mark.xfail(
    #     reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
    # )
    def testBenchmark8B_fp8_TP1_Non_Decomposed(self):
        output_file_name = self.dir_path_8b / "fp8_torch_tp1"
        output_mlir = self.llama8b_fp8_torch_sdpa_artifacts.create_file(
            suffix=".mlir", prefix=output_file_name
        )
        output_json = self.llama8b_fp8_torch_sdpa_artifacts.create_file(
            suffix=".json", prefix=output_file_name
        )
        output_vmfb = self.llama8b_fp8_torch_sdpa_artifacts.create_file(
            suffix=".vmfb", prefix=output_file_name
        )
        output_benchmark = self.llama8b_fp8_torch_sdpa_artifacts.create_file(
            suffix=".txt", prefix=output_file_name
        )
        export_return_code = self.llama8b_fp8_torch_sdpa_artifacts.export_to_mlir(
            mlir_path=output_mlir,
            json_path=output_json,
        )
        self.llama8b_fp8_torch_sdpa_artifacts.compile_to_vmfb(
            mlir_path=str(output_mlir),
            vmfb_path=output_vmfb,
            hal_dump_path=output_file_name,
            cwd=self.repo_root,
            args=self.compile_args,
        )
        # benchmark prefill
        self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            benchmark_filename=output_benchmark,
            args=self.iree_run_prefill_args_fp8,
            cwd=self.repo_root,
        )
        # benchmark decode
>       self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            benchmark_filename=output_benchmark,
            args=self.iree_run_decode_args_fp8,
            cwd=self.repo_root,
        )

sharktank/tests/models/llama/benchmark_amdgpu_test.py:362: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <sharktank.utils.export_artifacts.ExportArtifacts object at 0x715f9582dc10>

    def iree_benchmark_vmfb(
        self,
        *,
        hip_device_id: str,
        vmfb_name: str,
        irpa_path: str,
        benchmark_filename: Optional[Path] = None,
        args: List[str],
        cwd: str | Path,
    ):
        """Runs a compiled program with the given args using `iree-benchmark-module`.
        This assumes that the `iree-benchmark-module` command is available (usually via PATH).
        Args:
            vmfb_name: Name of the .vmfb file (relative to `cwd`).
            args: List of arguments to pass to `iree-benchmark-module`.
            cwd: Working directory to run the command within. (either string or Path works)
            compile_cmd: Command used to compile the program, for inclusion in error messages.
        Raises Exception if running fails for some reason.
        """
        benchmark_args = []
        if self.tensor_parallelism_size > 1:
            base_irpa_path, _ = os.path.splitext(irpa_path)
            rocr_visible_devices = [
                f"ROCR_VISIBLE_DEVICES={','.join(str(i) for i in range(self.tensor_parallelism_size))}"
            ]
            params = [f"--parameters=model={base_irpa_path}.irpa"]
            params += [
                f"--parameters=model={base_irpa_path}.rank{i}.irpa"
                for i in range(self.tensor_parallelism_size)
            ]
            devices = [
                f"--device=hip://{i}" for i in range(self.tensor_parallelism_size)
            ]
        else:
            hip_device_arg = int(hip_device_id.split("://")[1])
            rocr_visible_devices = [
                f"ROCR_VISIBLE_DEVICES={','.join(str(i) for i in range(hip_device_arg + 1))}"
            ]
            params = [f"--parameters=model={irpa_path}"]
            devices = [f"--device={hip_device_id}"]
        benchmark_args += rocr_visible_devices
        benchmark_args += [
            "iree-benchmark-module",
            "--hip_use_streams=true",
            f"--module={vmfb_name}",
        ]
        benchmark_args += params
        benchmark_args += devices
        benchmark_args += args
        benchmark_args += [str(benchmark_filename)]
        cmd = subprocess.list2cmdline(benchmark_args)
        logger.info(f" Launching run command:\n" f"cd {cwd} && {cmd}")
        proc = subprocess.run(cmd, shell=True, stdout=sys.stdout, cwd=cwd)
        return_code = proc.returncode
        if return_code != 0:
>           raise IreeBenchmarkException(proc, cwd)
E           sharktank.utils.export_artifacts.IreeBenchmarkException: Error invoking iree-benchmark-module
E           Error code: 250
E           Stderr diagnostics:
E           None
E           Stdout diagnostics:
E           None
E           Run with:
E             cd /home/chi/src/shark-ai && ROCR_VISIBLE_DEVICES=0,1,2,3,4 iree-benchmark-module --hip_use_streams=true --module=/home/chi/src/shark-ai/2025-04-08/llama-8b/fp8_torch_tp1.vmfb --parameters=model=/shark-dev/8b/fp8/native_fp8_e4m3fnuz_llama3_8b.irpa --device=hip://4 --function=decode_bs4 --input=4x1xi64=@/shark-dev/8b/decode_args_fp8/next_tokens.bin --input=4xi64=@/shark-dev/8b/decode_args_fp8/seq_lens.bin --input=4xi64=@/shark-dev/8b/decode_args_fp8/start_positions.bin --input=4x5xi64=@/shark-dev/8b/decode_args_fp8/seq_block_ids.bin --input=261x2097152xf8E4M3FNUZ=@/shark-dev/8b/decode_args_fp8/cs_f8E4M3FNUZ.bin --benchmark_repetitions=3 >> /home/chi/src/shark-ai/2025-04-08/llama-8b/fp8_torch_tp1.txt

sharktank/sharktank/utils/export_artifacts.py:332: IreeBenchmarkException
---------------------------------------------------------------------------------------- Captured log call ----------------------------------------------------------------------------------------
INFO     eval:export_artifacts.py:216  Exporting mlir:
cd /home/chi/src/shark-ai && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/shark-dev/8b/fp8/native_fp8_e4m3fnuz_llama3_8b.irpa --output-mlir=/home/chi/src/shark-ai/2025-04-08/llama-8b/fp8_torch_tp1.mlir --output-config=/home/chi/src/shark-ai/2025-04-08/llama-8b/fp8_torch_tp1.json --bs-prefill=4 --bs-decode=4 --block-seq-stride=32 --attention-dtype=bfloat16 --activation-dtype=bfloat16 --kv-cache-dtype=float8_e4m3fnuz --attention-kernel=torch --use-hf
INFO     eval:export_artifacts.py:222  Exported to mlir successfully:
Exporting prefill_bs4
Exporting decode_bs4
GENERATED!
Exporting
Saving to '/home/chi/src/shark-ai/2025-04-08/llama-8b/fp8_torch_tp1.mlir'

INFO     eval:export_artifacts.py:140  export_to_mlir: 00 hrs : 01 mins : 36.71 secs
INFO     eval:export_artifacts.py:271  Launching compile command:
cd /home/chi/src/shark-ai && iree-compile /home/chi/src/shark-ai/2025-04-08/llama-8b/fp8_torch_tp1.mlir --iree-hip-target=gfx942 -o=/home/chi/src/shark-ai/2025-04-08/llama-8b/fp8_torch_tp1.vmfb --iree-hal-target-device=hip --iree-hal-dump-executable-files-to=/home/chi/src/shark-ai/2025-04-08/llama-8b/fp8_torch_tp1/files --iree-opt-level=O3 --iree-hal-indirect-command-buffers=true --iree-stream-resource-memory-model=discrete --iree-hal-memoization=true
INFO     eval:export_artifacts.py:140  compile_to_vmfb: 19.67 secs
INFO     eval:export_artifacts.py:328  Launching run command:
cd /home/chi/src/shark-ai && ROCR_VISIBLE_DEVICES=0,1,2,3,4 iree-benchmark-module --hip_use_streams=true --module=/home/chi/src/shark-ai/2025-04-08/llama-8b/fp8_torch_tp1.vmfb --parameters=model=/shark-dev/8b/fp8/native_fp8_e4m3fnuz_llama3_8b.irpa --device=hip://4 --function=prefill_bs4 --input=4x128xi64=@/shark-dev/8b/prefill_args_fp8/tokens.bin --input=4xi64=@/shark-dev/8b/prefill_args_fp8/seq_lens.bin --input=4x4xi64=@/shark-dev/8b/prefill_args_fp8/seq_block_ids.bin --input=261x2097152xf8E4M3FNUZ=@/shark-dev/8b/prefill_args_fp8/cs_f8E4M3FNUZ.bin --benchmark_repetitions=3 >> /home/chi/src/shark-ai/2025-04-08/llama-8b/fp8_torch_tp1.txt
INFO     eval:export_artifacts.py:328  Launching run command:
cd /home/chi/src/shark-ai && ROCR_VISIBLE_DEVICES=0,1,2,3,4 iree-benchmark-module --hip_use_streams=true --module=/home/chi/src/shark-ai/2025-04-08/llama-8b/fp8_torch_tp1.vmfb --parameters=model=/shark-dev/8b/fp8/native_fp8_e4m3fnuz_llama3_8b.irpa --device=hip://4 --function=decode_bs4 --input=4x1xi64=@/shark-dev/8b/decode_args_fp8/next_tokens.bin --input=4xi64=@/shark-dev/8b/decode_args_fp8/seq_lens.bin --input=4xi64=@/shark-dev/8b/decode_args_fp8/start_positions.bin --input=4x5xi64=@/shark-dev/8b/decode_args_fp8/seq_block_ids.bin --input=261x2097152xf8E4M3FNUZ=@/shark-dev/8b/decode_args_fp8/cs_f8E4M3FNUZ.bin --benchmark_repetitions=3 >> /home/chi/src/shark-ai/2025-04-08/llama-8b/fp8_torch_tp1.txt
===================================================================================== short test summary info =====================================================================================
FAILED sharktank/tests/models/llama/benchmark_amdgpu_test.py::BenchmarkLlama3_1_8B::testBenchmark8B_fp8_TP1_Non_Decomposed - sharktank.utils.export_artifacts.IreeBenchmarkException: Error invoking iree-benchmark-module
========================================================================== 1 failed, 12 deselected in 126.01s (0:02:06) ===========================================================================
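A side note on the run commands above: the ROCR_VISIBLE_DEVICES=0,1,2,3,4 prefix comes from the single-device (tensor_parallelism_size == 1) branch of iree_benchmark_vmfb shown in the traceback, which exposes GPUs 0 through N when --iree-device=hip://N is passed. A standalone reproduction of just that computation (not the sharktank module itself):

    # Why hip://4 yields ROCR_VISIBLE_DEVICES=0,1,2,3,4 in the logged command.
    hip_device_id = "hip://4"
    hip_device_arg = int(hip_device_id.split("://")[1])  # -> 4
    rocr_visible_devices = "ROCR_VISIBLE_DEVICES=" + ",".join(
        str(i) for i in range(hip_device_arg + 1)
    )
    print(rocr_visible_devices)  # ROCR_VISIBLE_DEVICES=0,1,2,3,4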
