Last active
February 24, 2025 22:40
-
-
Save AmosLewis/7064af2a2610cbc80547ba24af147a19 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
iree-base-compiler 3.2.0rc20250209
iree-base-runtime 3.2.0rc20250209
iree-compile /sharedfile/128/fp8_128_0212.mlir \
  --iree-hip-target=gfx942 \
  -o=/sharedfile/128/fp8_128_0212_ir0209.vmfb \
  --iree-hal-target-device=hip \
  --iree-dispatch-creation-enable-aggressive-fusion=true \
  --iree-global-opt-propagate-transposes=true \
  --iree-opt-aggressively-propagate-transposes=true \
  --iree-opt-data-tiling=false \
  --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  iree-benchmark-module \
  --hip_use_streams=true \
  --module=/sharedfile/128/fp8_128_0212_ir0209.vmfb \
  --parameters=model=/sharedfile/llama3_8b_fp8.irpa \
  --device=hip://4 \
  --function=prefill_bs4 \
  --input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
  --input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
  --input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=3
# 2025-02-24T13:55:25-08:00 | |
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module | |
# Run on (96 X 3810.79 MHz CPU s) | |
# CPU Caches: | |
# L1 Data 32 KiB (x96) | |
# L1 Instruction 32 KiB (x96) | |
# L2 Unified 1024 KiB (x96) | |
# L3 Unified 32768 KiB (x16) | |
# Load Average: 78.74, 85.03, 65.62 | |
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
# ------------------------------------------------------------------------------------------------------- | |
# Benchmark Time CPU Iterations UserCounters... | |
# ------------------------------------------------------------------------------------------------------- | |
# BM_prefill_bs4/process_time/real_time 46.5 ms 47.1 ms 16 items_per_second=21.5046/s | |
# BM_prefill_bs4/process_time/real_time 46.9 ms 47.4 ms 16 items_per_second=21.3415/s | |
# BM_prefill_bs4/process_time/real_time 45.7 ms 46.2 ms 16 items_per_second=21.8821/s | |
# BM_prefill_bs4/process_time/real_time_mean 46.4 ms 46.9 ms 3 items_per_second=21.5761/s | |
# BM_prefill_bs4/process_time/real_time_median 46.5 ms 47.1 ms 3 items_per_second=21.5046/s | |
# BM_prefill_bs4/process_time/real_time_stddev 0.593 ms 0.611 ms 3 items_per_second=0.277284/s | |
# BM_prefill_bs4/process_time/real_time_cv 1.28 % 1.30 % 3 items_per_second=1.29% | |
iree-base-compiler==3.3.0rc20250214
iree-base-runtime==3.3.0rc20250214
iree-turbine==3.3.0rc20250214
iree-compile /sharedfile/128/fp8_128_0212.mlir \
  --iree-hip-target=gfx942 \
  -o=/sharedfile/128/fp8_128_0212_ir0214.vmfb \
  --iree-hal-target-device=hip \
  --iree-dispatch-creation-enable-aggressive-fusion=true \
  --iree-global-opt-propagate-transposes=true \
  --iree-opt-aggressively-propagate-transposes=true \
  --iree-opt-data-tiling=false \
  --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  iree-benchmark-module \
  --hip_use_streams=true \
  --module=/sharedfile/128/fp8_128_0212_ir0214.vmfb \
  --parameters=model=/sharedfile/llama3_8b_fp8.irpa \
  --device=hip://4 \
  --function=prefill_bs4 \
  --input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
  --input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
  --input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=3
# 2025-02-24T13:58:42-08:00 | |
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module | |
# Run on (96 X 3810.79 MHz CPU s) | |
# CPU Caches: | |
# L1 Data 32 KiB (x96) | |
# L1 Instruction 32 KiB (x96) | |
# L2 Unified 1024 KiB (x96) | |
# L3 Unified 32768 KiB (x16) | |
# Load Average: 9.74, 48.19, 54.78 | |
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
# ------------------------------------------------------------------------------------------------------- | |
# Benchmark Time CPU Iterations UserCounters... | |
# ------------------------------------------------------------------------------------------------------- | |
# BM_prefill_bs4/process_time/real_time 42.8 ms 43.3 ms 16 items_per_second=23.3496/s | |
# BM_prefill_bs4/process_time/real_time 42.8 ms 43.5 ms 16 items_per_second=23.351/s | |
# BM_prefill_bs4/process_time/real_time 42.9 ms 43.6 ms 16 items_per_second=23.293/s | |
# BM_prefill_bs4/process_time/real_time_mean 42.9 ms 43.5 ms 3 items_per_second=23.3312/s | |
# BM_prefill_bs4/process_time/real_time_median 42.8 ms 43.5 ms 3 items_per_second=23.3496/s | |
# BM_prefill_bs4/process_time/real_time_stddev 0.061 ms 0.172 ms 3 items_per_second=0.0331315/s | |
# BM_prefill_bs4/process_time/real_time_cv 0.14 % 0.40 % 3 items_per_second=0.14% | |
iree-base-compiler==3.3.0rc20250215
iree-base-runtime==3.3.0rc20250215
iree-turbine==3.3.0rc20250215
iree-compile /sharedfile/128/fp8_128_0212.mlir \
  --iree-hip-target=gfx942 \
  -o=/sharedfile/128/fp8_128_0212_ir0215.vmfb \
  --iree-hal-target-device=hip \
  --iree-dispatch-creation-enable-aggressive-fusion=true \
  --iree-global-opt-propagate-transposes=true \
  --iree-opt-aggressively-propagate-transposes=true \
  --iree-opt-data-tiling=false \
  --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  iree-benchmark-module \
  --hip_use_streams=true \
  --module=/sharedfile/128/fp8_128_0212_ir0215.vmfb \
  --parameters=model=/sharedfile/llama3_8b_fp8.irpa \
  --device=hip://4 \
  --function=prefill_bs4 \
  --input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
  --input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
  --input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=3
# 2025-02-24T14:08:35-08:00 | |
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module | |
# Run on (96 X 3810.79 MHz CPU s) | |
# CPU Caches: | |
# L1 Data 32 KiB (x96) | |
# L1 Instruction 32 KiB (x96) | |
# L2 Unified 1024 KiB (x96) | |
# L3 Unified 32768 KiB (x16) | |
# Load Average: 16.22, 14.06, 32.57 | |
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
# ------------------------------------------------------------------------------------------------------- | |
# Benchmark Time CPU Iterations UserCounters... | |
# ------------------------------------------------------------------------------------------------------- | |
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.2023/s | |
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.18316/s | |
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.17781/s | |
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.18776/s | |
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.18316/s | |
# BM_prefill_bs4/process_time/real_time_stddev 0.192 ms 0.388 ms 3 items_per_second=0.0128779/s | |
# BM_prefill_bs4/process_time/real_time_cv 0.16 % 0.32 % 3 items_per_second=0.16% | |
iree-base-compiler==3.3.0rc20250216
iree-base-runtime==3.3.0rc20250216
iree-turbine==3.3.0rc20250216
iree-compile /sharedfile/128/fp8_128_0212.mlir \
  --iree-hip-target=gfx942 \
  -o=/sharedfile/128/fp8_128_0212_ir0216.vmfb \
  --iree-hal-target-device=hip \
  --iree-dispatch-creation-enable-aggressive-fusion=true \
  --iree-global-opt-propagate-transposes=true \
  --iree-opt-aggressively-propagate-transposes=true \
  --iree-opt-data-tiling=false \
  --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  iree-benchmark-module \
  --hip_use_streams=true \
  --module=/sharedfile/128/fp8_128_0212_ir0216.vmfb \
  --parameters=model=/sharedfile/llama3_8b_fp8.irpa \
  --device=hip://4 \
  --function=prefill_bs4 \
  --input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
  --input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
  --input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=3
# 2025-02-24T14:06:47-08:00 | |
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module | |
# Run on (96 X 3810.79 MHz CPU s) | |
# CPU Caches: | |
# L1 Data 32 KiB (x96) | |
# L1 Instruction 32 KiB (x96) | |
# L2 Unified 1024 KiB (x96) | |
# L3 Unified 32768 KiB (x16) | |
# Load Average: 5.00, 14.50, 35.13 | |
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
# ------------------------------------------------------------------------------------------------------- | |
# Benchmark Time CPU Iterations UserCounters... | |
# ------------------------------------------------------------------------------------------------------- | |
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.2059/s | |
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.19767/s | |
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.19858/s | |
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.20072/s | |
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.19858/s | |
# BM_prefill_bs4/process_time/real_time_stddev 0.067 ms 0.192 ms 3 items_per_second=4.51233m/s | |
# BM_prefill_bs4/process_time/real_time_cv 0.06 % 0.16 % 3 items_per_second=0.06% | |
iree-base-compiler==3.3.0rc20250217
iree-base-runtime==3.3.0rc20250217
iree-turbine==3.3.0rc20250217
iree-compile /sharedfile/128/fp8_128_0212.mlir \
  --iree-hip-target=gfx942 \
  -o=/sharedfile/128/fp8_128_0212_ir0217.vmfb \
  --iree-hal-target-device=hip \
  --iree-dispatch-creation-enable-aggressive-fusion=true \
  --iree-global-opt-propagate-transposes=true \
  --iree-opt-aggressively-propagate-transposes=true \
  --iree-opt-data-tiling=false \
  --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  iree-benchmark-module \
  --hip_use_streams=true \
  --module=/sharedfile/128/fp8_128_0212_ir0217.vmfb \
  --parameters=model=/sharedfile/llama3_8b_fp8.irpa \
  --device=hip://4 \
  --function=prefill_bs4 \
  --input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
  --input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
  --input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=3
# 2025-02-24T14:05:05-08:00 | |
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module | |
# Run on (96 X 3810.79 MHz CPU s) | |
# CPU Caches: | |
# L1 Data 32 KiB (x96) | |
# L1 Instruction 32 KiB (x96) | |
# L2 Unified 1024 KiB (x96) | |
# L3 Unified 32768 KiB (x16) | |
# Load Average: 8.15, 18.76, 38.70 | |
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
# ------------------------------------------------------------------------------------------------------- | |
# Benchmark Time CPU Iterations UserCounters... | |
# ------------------------------------------------------------------------------------------------------- | |
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.20514/s | |
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.19995/s | |
# BM_prefill_bs4/process_time/real_time 124 ms 124 ms 6 items_per_second=8.08852/s | |
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.16454/s | |
# BM_prefill_bs4/process_time/real_time_median 122 ms 122 ms 3 items_per_second=8.19995/s | |
# BM_prefill_bs4/process_time/real_time_stddev 0.993 ms 0.967 ms 3 items_per_second=0.0658833/s | |
# BM_prefill_bs4/process_time/real_time_cv 0.81 % 0.79 % 3 items_per_second=0.81% | |
iree-base-compiler==3.3.0rc20250218
iree-base-runtime==3.3.0rc20250218
iree-turbine==3.3.0rc20250218
iree-compile /sharedfile/128/fp8_128_0212.mlir \
  --iree-hip-target=gfx942 \
  -o=/sharedfile/128/fp8_128_0212_ir0218.vmfb \
  --iree-hal-target-device=hip \
  --iree-dispatch-creation-enable-aggressive-fusion=true \
  --iree-global-opt-propagate-transposes=true \
  --iree-opt-aggressively-propagate-transposes=true \
  --iree-opt-data-tiling=false \
  --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  iree-benchmark-module \
  --hip_use_streams=true \
  --module=/sharedfile/128/fp8_128_0212_ir0218.vmfb \
  --parameters=model=/sharedfile/llama3_8b_fp8.irpa \
  --device=hip://4 \
  --function=prefill_bs4 \
  --input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
  --input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
  --input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=3
# 2025-02-24T14:03:05-08:00 | |
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module | |
# Run on (96 X 3810.79 MHz CPU s) | |
# CPU Caches: | |
# L1 Data 32 KiB (x96) | |
# L1 Instruction 32 KiB (x96) | |
# L2 Unified 1024 KiB (x96) | |
# L3 Unified 32768 KiB (x16) | |
# Load Average: 8.53, 24.32, 43.03 | |
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
# ------------------------------------------------------------------------------------------------------- | |
# Benchmark Time CPU Iterations UserCounters... | |
# ------------------------------------------------------------------------------------------------------- | |
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.20761/s | |
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.20234/s | |
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.19966/s | |
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.20321/s | |
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.20234/s | |
# BM_prefill_bs4/process_time/real_time_stddev 0.060 ms 0.183 ms 3 items_per_second=4.04547m/s | |
# BM_prefill_bs4/process_time/real_time_cv 0.05 % 0.15 % 3 items_per_second=0.05% | |
iree-base-compiler==3.3.0rc20250219
iree-base-runtime==3.3.0rc20250219
iree-turbine==3.3.0rc20250219
iree-compile /sharedfile/128/fp8_128_0212.mlir \
  --iree-hip-target=gfx942 \
  -o=/sharedfile/128/fp8_128_0212_ir0219.vmfb \
  --iree-hal-target-device=hip \
  --iree-dispatch-creation-enable-aggressive-fusion=true \
  --iree-global-opt-propagate-transposes=true \
  --iree-opt-aggressively-propagate-transposes=true \
  --iree-opt-data-tiling=false \
  --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  iree-benchmark-module \
  --hip_use_streams=true \
  --module=/sharedfile/128/fp8_128_0212_ir0219.vmfb \
  --parameters=model=/sharedfile/llama3_8b_fp8.irpa \
  --device=hip://4 \
  --function=prefill_bs4 \
  --input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
  --input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
  --input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=3
# 2025-02-24T14:01:29-08:00 | |
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module | |
# Run on (96 X 3810.79 MHz CPU s) | |
# CPU Caches: | |
# L1 Data 32 KiB (x96) | |
# L1 Instruction 32 KiB (x96) | |
# L2 Unified 1024 KiB (x96) | |
# L3 Unified 32768 KiB (x16) | |
# Load Average: 7.11, 30.24, 46.76 | |
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
# ------------------------------------------------------------------------------------------------------- | |
# Benchmark Time CPU Iterations UserCounters... | |
# ------------------------------------------------------------------------------------------------------- | |
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.20024/s | |
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.19028/s | |
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.20129/s | |
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.19727/s | |
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.20024/s | |
# BM_prefill_bs4/process_time/real_time_stddev 0.090 ms 0.111 ms 3 items_per_second=6.07556m/s | |
# BM_prefill_bs4/process_time/real_time_cv 0.07 % 0.09 % 3 items_per_second=0.07% | |
iree-base-compiler 3.3.0rc20250223
iree-base-runtime 3.3.0rc20250223
iree-compile /sharedfile/128/fp8_128_0212.mlir \
  --iree-hip-target=gfx942 \
  -o=/sharedfile/128/fp8_128_0212.vmfb \
  --iree-hal-target-device=hip \
  --iree-dispatch-creation-enable-aggressive-fusion=true \
  --iree-global-opt-propagate-transposes=true \
  --iree-opt-aggressively-propagate-transposes=true \
  --iree-opt-data-tiling=false \
  --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-opt-strip-assertions
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  iree-benchmark-module \
  --hip_use_streams=true \
  --module=/sharedfile/128/fp8_128_0212.vmfb \
  --parameters=model=/sharedfile/llama3_8b_fp8.irpa \
  --device=hip://4 \
  --function=prefill_bs4 \
  --input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
  --input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
  --input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=3
# 2025-02-24T13:43:20-08:00 | |
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module | |
# Run on (96 X 3810.79 MHz CPU s) | |
# CPU Caches: | |
# L1 Data 32 KiB (x96) | |
# L1 Instruction 32 KiB (x96) | |
# L2 Unified 1024 KiB (x96) | |
# L3 Unified 32768 KiB (x16) | |
# Load Average: 213.65, 147.04, 64.43 | |
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
# ------------------------------------------------------------------------------------------------------- | |
# Benchmark Time CPU Iterations UserCounters... | |
# ------------------------------------------------------------------------------------------------------- | |
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.18784/s | |
# BM_prefill_bs4/process_time/real_time 122 ms 123 ms 6 items_per_second=8.16995/s | |
# BM_prefill_bs4/process_time/real_time 122 ms 122 ms 6 items_per_second=8.20021/s | |
# BM_prefill_bs4/process_time/real_time_mean 122 ms 123 ms 3 items_per_second=8.186/s | |
# BM_prefill_bs4/process_time/real_time_median 122 ms 123 ms 3 items_per_second=8.18784/s | |
# BM_prefill_bs4/process_time/real_time_stddev 0.227 ms 0.103 ms 3 items_per_second=0.0152123/s | |
# BM_prefill_bs4/process_time/real_time_cv 0.19 % 0.08 % 3 items_per_second=0.19% |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment