iree-base-compiler==3.2.0rc20250209
iree-base-runtime==3.2.0rc20250209
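# For reproducibility, a minimal install sketch for pinning a nightly pair like the
# one above (assumes the IREE release-candidate index at
# https://iree.dev/pip-release-links.html; swap the version string to match each block below):
pip install -f https://iree.dev/pip-release-links.html \
  "iree-base-compiler==3.2.0rc20250209" \
  "iree-base-runtime==3.2.0rc20250209"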
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0209.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0209.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T13:55:25-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 78.74, 85.03, 65.62
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 46.5 ms 47.1 ms 16 items_per_second=21.5046/s
# BM_prefill_bs4/process_time/real_time 46.9 ms 47.4 ms 16 items_per_second=21.3415/s
# BM_prefill_bs4/process_time/real_time 45.7 ms 46.2 ms 16 items_per_second=21.8821/s
# BM_prefill_bs4/process_time/real_time_mean 46.4 ms 46.9 ms 3 items_per_second=21.5761/s
# BM_prefill_bs4/process_time/real_time_median 46.5 ms 47.1 ms 3 items_per_second=21.5046/s
# BM_prefill_bs4/process_time/real_time_stddev 0.593 ms 0.611 ms 3 items_per_second=0.277284/s
# BM_prefill_bs4/process_time/real_time_cv 1.28 % 1.30 % 3 items_per_second=1.29%
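# To compare the prefill means across the builds below, a small parsing sketch
# (assumes each iree-benchmark-module run is teed into a per-build log such as
# bench_ir0209.log; the log names are illustrative, not from the original run):
for f in bench_ir*.log; do
  printf '%s: ' "$f"
  awk '/real_time_mean/ {print $2, $3}' "$f"
done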
iree-base-compiler==3.3.0rc20250214
iree-base-runtime==3.3.0rc20250214
iree-turbine==3.3.0rc20250214
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0214.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0214.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T13:58:42-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 9.74, 48.19, 54.78
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 42.8 ms 43.3 ms 16 items_per_second=23.3496/s
# BM_prefill_bs4/process_time/real_time 42.8 ms 43.5 ms 16 items_per_second=23.351/s
# BM_prefill_bs4/process_time/real_time 42.9 ms 43.6 ms 16 items_per_second=23.293/s
# BM_prefill_bs4/process_time/real_time_mean 42.9 ms 43.5 ms 3 items_per_second=23.3312/s
# BM_prefill_bs4/process_time/real_time_median 42.8 ms 43.5 ms 3 items_per_second=23.3496/s
# BM_prefill_bs4/process_time/real_time_stddev 0.061 ms 0.172 ms 3 items_per_second=0.0331315/s
# BM_prefill_bs4/process_time/real_time_cv 0.14 % 0.40 % 3 items_per_second=0.14%
iree-base-compiler==3.3.0rc20250215
iree-base-runtime==3.3.0rc20250215
iree-turbine==3.3.0rc20250215
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0215.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0215.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T14:08:35-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 16.22, 14.06, 32.57
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.2023/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.18316/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.17781/s
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.18776/s
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.18316/s
# BM_prefill_bs4/process_time/real_time_stddev 0.192 ms 0.388 ms 3 items_per_second=0.0128779/s
# BM_prefill_bs4/process_time/real_time_cv 0.16 % 0.32 % 3 items_per_second=0.16%
iree-base-compiler==3.3.0rc20250216
iree-base-runtime==3.3.0rc20250216
iree-turbine==3.3.0rc20250216
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0216.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0216.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T14:06:47-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 5.00, 14.50, 35.13
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.2059/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.19767/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.19858/s
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.20072/s
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.19858/s
# BM_prefill_bs4/process_time/real_time_stddev 0.067 ms 0.192 ms 3 items_per_second=4.51233m/s
# BM_prefill_bs4/process_time/real_time_cv 0.06 % 0.16 % 3 items_per_second=0.06%
iree-base-compiler==3.3.0rc20250217
iree-base-runtime==3.3.0rc20250217
iree-turbine==3.3.0rc20250217
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0217.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0217.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T14:05:05-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 8.15, 18.76, 38.70
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.20514/s
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.19995/s
# BM_prefill_bs4/process_time/real_time 124 ms 124 ms 6 items_per_second=8.08852/s
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.16454/s
# BM_prefill_bs4/process_time/real_time_median 122 ms 122 ms 3 items_per_second=8.19995/s
# BM_prefill_bs4/process_time/real_time_stddev 0.993 ms 0.967 ms 3 items_per_second=0.0658833/s
# BM_prefill_bs4/process_time/real_time_cv 0.81 % 0.79 % 3 items_per_second=0.81%
iree-base-compiler==3.3.0rc20250218
iree-base-runtime==3.3.0rc20250218
iree-turbine==3.3.0rc20250218
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0218.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0218.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T14:03:05-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 8.53, 24.32, 43.03
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.20761/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.20234/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.19966/s
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.20321/s
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.20234/s
# BM_prefill_bs4/process_time/real_time_stddev 0.060 ms 0.183 ms 3 items_per_second=4.04547m/s
# BM_prefill_bs4/process_time/real_time_cv 0.05 % 0.15 % 3 items_per_second=0.05%
iree-base-compiler==3.3.0rc20250219
iree-base-runtime==3.3.0rc20250219
iree-turbine==3.3.0rc20250219
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0219.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212_ir0219.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T14:01:29-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 7.11, 30.24, 46.76
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.20024/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.19028/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.20129/s
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.19727/s
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.20024/s
# BM_prefill_bs4/process_time/real_time_stddev 0.090 ms 0.111 ms 3 items_per_second=6.07556m/s
# BM_prefill_bs4/process_time/real_time_cv 0.07 % 0.09 % 3 items_per_second=0.07%
iree-base-compiler==3.3.0rc20250223
iree-base-runtime==3.3.0rc20250223
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0212.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T13:43:20-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 213.65, 147.04, 64.43
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.18784/s
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.16995/s
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.20021/s
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.186/s
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.18784/s
# BM_prefill_bs4/process_time/real_time_stddev 0.227 ms 0.103 ms 3 items_per_second=0.0152123/s
# BM_prefill_bs4/process_time/real_time_cv 0.19 % 0.08 % 3 items_per_second=0.19%
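# Since the compile and benchmark invocations are identical for every build, the
# whole sweep can be scripted. A sketch under the same assumptions as above
# (nightly index URL, illustrative per-build log names; covers only the 3.3.0rc builds):
for rc in 20250214 20250215 20250216 20250217 20250218 20250219 20250223; do
  pip install -q -f https://iree.dev/pip-release-links.html \
    "iree-base-compiler==3.3.0rc${rc}" "iree-base-runtime==3.3.0rc${rc}"
  vmfb=/sharedfile/128/fp8_128_0212_ir${rc:4}.vmfb
  iree-compile /sharedfile/128/fp8_128_0212.mlir \
    --iree-hip-target=gfx942 \
    -o=${vmfb} \
    --iree-hal-target-device=hip \
    --iree-dispatch-creation-enable-aggressive-fusion=true \
    --iree-global-opt-propagate-transposes=true \
    --iree-opt-aggressively-propagate-transposes=true \
    --iree-opt-data-tiling=false \
    --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
    --iree-hal-indirect-command-buffers=true \
    --iree-stream-resource-memory-model=discrete \
    --iree-hal-memoization=true \
    --iree-opt-strip-assertions
  ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 iree-benchmark-module \
    --hip_use_streams=true \
    --module=${vmfb} \
    --parameters=model=/sharedfile/llama3_8b_fp8.irpa \
    --device=hip://4 \
    --function=prefill_bs4 \
    --input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
    --input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
    --input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
    --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
    --benchmark_repetitions=3 | tee bench_ir${rc:4}.log
done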