iree-base-compiler==3.2.0rc20250209
iree-base-runtime==3.2.0rc20250209
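# For reproducibility, a minimal install sketch for pinning a nightly pair like the
# one above (assumes the IREE release-candidate index at
# https://iree.dev/pip-release-links.html; swap the version string to match each block below):
pip install -f https://iree.dev/pip-release-links.html \
  "iree-base-compiler==3.2.0rc20250209" \
  "iree-base-runtime==3.2.0rc20250209"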
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0209.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0209.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T13:55:25-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 78.74, 85.03, 65.62
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 46.5 ms 47.1 ms 16 items_per_second=21.5046/s
# BM_prefill_bs4/process_time/real_time 46.9 ms 47.4 ms 16 items_per_second=21.3415/s
# BM_prefill_bs4/process_time/real_time 45.7 ms 46.2 ms 16 items_per_second=21.8821/s
# BM_prefill_bs4/process_time/real_time_mean 46.4 ms 46.9 ms 3 items_per_second=21.5761/s
# BM_prefill_bs4/process_time/real_time_median 46.5 ms 47.1 ms 3 items_per_second=21.5046/s
# BM_prefill_bs4/process_time/real_time_stddev 0.593 ms 0.611 ms 3 items_per_second=0.277284/s
# BM_prefill_bs4/process_time/real_time_cv 1.28 % 1.30 % 3 items_per_second=1.29%
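# To compare the prefill means across the builds below, a small parsing sketch
# (assumes each iree-benchmark-module run is teed into a per-build log such as
# bench_ir0209.log; the log names are illustrative, not from the original run):
for f in bench_ir*.log; do
  printf '%s: ' "$f"
  awk '/real_time_mean/ {print $2, $3}' "$f"
done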
iree-base-compiler==3.3.0rc20250214
iree-base-runtime==3.3.0rc20250214
iree-turbine==3.3.0rc20250214
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0214.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0214.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T13:58:42-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 9.74, 48.19, 54.78
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 42.8 ms 43.3 ms 16 items_per_second=23.3496/s
# BM_prefill_bs4/process_time/real_time 42.8 ms 43.5 ms 16 items_per_second=23.351/s
# BM_prefill_bs4/process_time/real_time 42.9 ms 43.6 ms 16 items_per_second=23.293/s
# BM_prefill_bs4/process_time/real_time_mean 42.9 ms 43.5 ms 3 items_per_second=23.3312/s
# BM_prefill_bs4/process_time/real_time_median 42.8 ms 43.5 ms 3 items_per_second=23.3496/s
# BM_prefill_bs4/process_time/real_time_stddev 0.061 ms 0.172 ms 3 items_per_second=0.0331315/s
# BM_prefill_bs4/process_time/real_time_cv 0.14 % 0.40 % 3 items_per_second=0.14%
iree-base-compiler==3.3.0rc20250215
iree-base-runtime==3.3.0rc20250215
iree-turbine==3.3.0rc20250215
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0215.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0215.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T14:08:35-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 16.22, 14.06, 32.57
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.2023/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.18316/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.17781/s
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.18776/s
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.18316/s
# BM_prefill_bs4/process_time/real_time_stddev 0.192 ms 0.388 ms 3 items_per_second=0.0128779/s
# BM_prefill_bs4/process_time/real_time_cv 0.16 % 0.32 % 3 items_per_second=0.16%
iree-base-compiler==3.3.0rc20250216
iree-base-runtime==3.3.0rc20250216
iree-turbine==3.3.0rc20250216
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0216.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0216.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T14:06:47-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 5.00, 14.50, 35.13
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.2059/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.19767/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.19858/s
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.20072/s
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.19858/s
# BM_prefill_bs4/process_time/real_time_stddev 0.067 ms 0.192 ms 3 items_per_second=4.51233m/s
# BM_prefill_bs4/process_time/real_time_cv 0.06 % 0.16 % 3 items_per_second=0.06%
iree-base-compiler==3.3.0rc20250217
iree-base-runtime==3.3.0rc20250217
iree-turbine==3.3.0rc20250217
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0217.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0217.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T14:05:05-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 8.15, 18.76, 38.70
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.20514/s
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.19995/s
# BM_prefill_bs4/process_time/real_time 124 ms 124 ms 6 items_per_second=8.08852/s
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.16454/s
# BM_prefill_bs4/process_time/real_time_median 122 ms 122 ms 3 items_per_second=8.19995/s
# BM_prefill_bs4/process_time/real_time_stddev 0.993 ms 0.967 ms 3 items_per_second=0.0658833/s
# BM_prefill_bs4/process_time/real_time_cv 0.81 % 0.79 % 3 items_per_second=0.81%
iree-base-compiler==3.3.0rc20250218
iree-base-runtime==3.3.0rc20250218
iree-turbine==3.3.0rc20250218
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0218.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0218.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T14:03:05-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 8.53, 24.32, 43.03
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.20761/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.20234/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.19966/s
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.20321/s
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.20234/s
# BM_prefill_bs4/process_time/real_time_stddev 0.060 ms 0.183 ms 3 items_per_second=4.04547m/s
# BM_prefill_bs4/process_time/real_time_cv 0.05 % 0.15 % 3 items_per_second=0.05%
iree-base-compiler==3.3.0rc20250219
iree-base-runtime==3.3.0rc20250219
iree-turbine==3.3.0rc20250219
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0219.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0219.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T14:01:29-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 7.11, 30.24, 46.76
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.20024/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.19028/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.20129/s
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.19727/s
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.20024/s
# BM_prefill_bs4/process_time/real_time_stddev 0.090 ms 0.111 ms 3 items_per_second=6.07556m/s
# BM_prefill_bs4/process_time/real_time_cv 0.07 % 0.09 % 3 items_per_second=0.07%
iree-base-compiler==3.3.0rc20250223
iree-base-runtime==3.3.0rc20250223
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T13:43:20-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 213.65, 147.04, 64.43
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.18784/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.16995/s
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.20021/s
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.186/s
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.18784/s
# BM_prefill_bs4/process_time/real_time_stddev 0.227 ms 0.103 ms 3 items_per_second=0.0152123/s
# BM_prefill_bs4/process_time/real_time_cv 0.19 % 0.08 % 3 items_per_second=0.19%
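# Since the compile and benchmark invocations are identical for every build, the
# whole sweep can be scripted. A sketch under the same assumptions as above
# (nightly index URL, illustrative per-build log names; covers only the 3.3.0rc builds):
for rc in 20250214 20250215 20250216 20250217 20250218 20250219 20250223; do
  pip install -q -f https://iree.dev/pip-release-links.html \
    "iree-base-compiler==3.3.0rc${rc}" "iree-base-runtime==3.3.0rc${rc}"
  vmfb=/sharedfile/128/fp8_128_0212_ir${rc:4}.vmfb
  iree-compile /sharedfile/128/fp8_128_0212.mlir \
    --iree-hip-target=gfx942 \
    -o=${vmfb} \
    --iree-hal-target-device=hip \
    --iree-dispatch-creation-enable-aggressive-fusion=true \
    --iree-global-opt-propagate-transposes=true \
    --iree-opt-aggressively-propagate-transposes=true \
    --iree-opt-data-tiling=false \
    --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
    --iree-hal-indirect-command-buffers=true \
    --iree-stream-resource-memory-model=discrete \
    --iree-hal-memoization=true \
    --iree-opt-strip-assertions
  ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 iree-benchmark-module \
    --hip_use_streams=true \
    --module=${vmfb} \
    --parameters=model=/sharedfile/llama3_8b_fp8.irpa \
    --device=hip://4 \
    --function=prefill_bs4 \
    --input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
    --input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
    --input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
    --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
    --benchmark_repetitions=3 | tee bench_ir${rc:4}.log
done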