Patch used: https://github.com/nod-ai/shark-ai/pull/896
IREE version==3.2.0rc20250209
instruct weights: /shark-dev/8b/fp8/native_fp8_e4m3fnuz_llama3_8b.irpa
instruct tokenizer-config-json: /shark-dev/8b/instruct/tokenizer.json
# Generate the input .bin files
# Although the run fails, it still generates the input .bin files for prefill. (TBD)
# For bs=4, prefill_seq_lens_1xi64.bin should be renamed to prefill_seq_lens_4xi64.bin. (TBD)
# https://gist.github.com/AmosLewis/d2a325a815c106fcf6e964dd249940ba
python -m sharktank.examples.paged_llm_v1 --irpa-file=/sharedfile/llama3_8b_fp8.irpa \
--tokenizer-config-json=/home/chi/src/test/llama/dan/tokenizer.json --dump-bins "t"
# Dumped .bin files:
# decode_cache_state_261x2097152xf8E4M3FNUZ.bin
# decode_next_tokens_4x1xi64.bin
# decode_seq_block_ids_tensor_4x5xi64.bin
# decode_seq_block_ids_tensor_4x6xi64.bin
# decode_seq_block_ids_tensor_4x7xi64.bin
# decode_seq_lens_4xi64.bin
# decode_start_positions_4xi64.bin
# prefill_cache_state_261x2097152xf8E4M3FNUZ.bin
# prefill_seq_block_ids_4x4xi64.bin
# prefill_seq_lens_4xi64.bin
# prefill_token_ids_4x128xi64.bin
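# Optional sanity check. Assuming each dumped .bin is a raw little-endian
# buffer whose shape/dtype match its filename (an assumption, not verified
# here), od can print the four i64 sequence lengths directly:
od -An -t d8 /sharedfile/128/prefill/prefill_seq_lens_4xi64.bin
# Expect four values, one sequence length per batch entry.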
# Generate mlir file
python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/sharedfile/llama3_8b_fp8.irpa \
--output-mlir=/sharedfile/128/fp8_128.mlir \
--output-config=/sharedfile/128/config_128.json \
--bs=4 --attention-kernel torch
# --attention-dtype=float8_e4m3fnuz --activation-dtype=bfloat16: these two flags might not be the ones we should use for fp8_atten16.mlir
# /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)
# return torch.from_numpy(wrapper)
# Exporting prefill_bs4
# /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/torch/_export/non_strict_utils.py:520: UserWarning: Tensor.T is deprecated on 0-D tensors. This function is the identity in these cases. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3691.)
# return func(*args, **kwargs)
# Exporting decode_bs4
# GENERATED!
# Exporting
# Saving to '/sharedfile/2048/fp8_128.mlir'
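# The 2048-context benchmarks below use /sharedfile/2048/fp8_2048.vmfb, whose
# export is not shown above. A sketch with the same flags (the /sharedfile/2048/
# paths are my assumption):
python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/sharedfile/llama3_8b_fp8.irpa \
--output-mlir=/sharedfile/2048/fp8_2048.mlir \
--output-config=/sharedfile/2048/config_2048.json \
--bs=4 --attention-kernel torch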
# fp8 perplexity_iree
python -m sharktank.evaluate.perplexity_iree \
--irpa-file=/sharedfile/llama3_8b_fp8.irpa \
--tokenizer-config-json=/home/chi/src/test/llama/dan/tokenizer.json \
--iree-device='hip://4' \
--iree-hal-target-device=hip \
--iree-hip-target=gfx942 \
--attention-kernel decomposed \
--num-prompts=1
# iree-compile and iree-run-module (non-Tracy binaries)
iree-compile /sharedfile/128/fp8_128.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
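# The 2048 variant benchmarked below can be compiled the same way (my
# assumption: identical flags, only the paths change):
iree-compile /sharedfile/2048/fp8_2048.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/2048/fp8_2048.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
# Run prefill_bs4 on the 128 vmfb: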
iree-run-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x64xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x64xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin
# EXEC @prefill_bs4
# result[0]: hal.buffer_view
# 4x2048x128256xbf16=[[-3.90625 -3.67188 ...
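# decode_bs4 can be exercised the same way before benchmarking (untested
# sketch; the inputs mirror the decode benchmark further below):
iree-run-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=decode_bs4 \
--input=4x1xi64=@/sharedfile/128/decode/decode_next_tokens_4x1xi64.bin \
--input=4xi64=@/sharedfile/128/decode/decode_seq_lens_4xi64.bin \
--input=4xi64=@/sharedfile/128/decode/decode_start_positions_4xi64.bin \
--input=4x5xi64=@/sharedfile/128/decode/decode_seq_block_ids_tensor_4x5xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/decode/decode_cache_state_261x2097152xf8E4M3FNUZ.bin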
# benchmark run
# 128 prefill
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x64xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x64xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-11T10:26:13-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 3.68, 2.49, 1.34
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 713 ms 713 ms 1 items_per_second=1.40236/s
# BM_prefill_bs4/process_time/real_time 713 ms 714 ms 1 items_per_second=1.4018/s
# BM_prefill_bs4/process_time/real_time 714 ms 714 ms 1 items_per_second=1.39994/s
# BM_prefill_bs4/process_time/real_time_mean 714 ms 714 ms 3 items_per_second=1.40136/s
# BM_prefill_bs4/process_time/real_time_median 713 ms 714 ms 3 items_per_second=1.4018/s
# BM_prefill_bs4/process_time/real_time_stddev 0.646 ms 0.592 ms 3 items_per_second=1.26808m/s
# BM_prefill_bs4/process_time/real_time_cv 0.09 % 0.08 % 3 items_per_second=0.09%
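# To profile the same run (the compile above used the non-Tracy binaries), a
# Tracy capture sketch -- assumes a Tracy-instrumented build of
# iree-benchmark-module and the iree-tracy-capture tool on PATH:
iree-tracy-capture -o /sharedfile/128/prefill_bs4.tracy &
TRACY_NO_EXIT=1 iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x64xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x64xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin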
# 128 decode
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=decode_bs4 \
--input=4x1xi64=@/sharedfile/128/decode/decode_next_tokens_4x1xi64.bin \
--input=4xi64=@/sharedfile/128/decode/decode_seq_lens_4xi64.bin \
--input=4xi64=@/sharedfile/128/decode/decode_start_positions_4xi64.bin \
--input=4x5xi64=@/sharedfile/128/decode/decode_seq_block_ids_tensor_4x5xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/decode/decode_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-11T16:30:42-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 4.84, 11.44, 24.97
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# ------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# ------------------------------------------------------------------------------------------------------
# BM_decode_bs4/process_time/real_time 23.8 ms 24.6 ms 30 items_per_second=42.0138/s
# BM_decode_bs4/process_time/real_time 23.9 ms 24.8 ms 30 items_per_second=41.8749/s
# BM_decode_bs4/process_time/real_time 24.0 ms 24.6 ms 30 items_per_second=41.7428/s
# BM_decode_bs4/process_time/real_time_mean 23.9 ms 24.7 ms 3 items_per_second=41.8772/s
# BM_decode_bs4/process_time/real_time_median 23.9 ms 24.6 ms 3 items_per_second=41.8749/s
# BM_decode_bs4/process_time/real_time_stddev 0.077 ms 0.148 ms 3 items_per_second=0.135524/s
# BM_decode_bs4/process_time/real_time_cv 0.32 % 0.60 % 3 items_per_second=0.32%
# 2048 prefill
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/2048/fp8_2048.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x2048xi64=@/sharedfile/2048/prefill/prefill_token_ids_4x2048xi64.bin \
--input=4xi64=@/sharedfile/2048/prefill/prefill_seq_lens_4xi64.bin \
--input=4x64xi64=@/sharedfile/2048/prefill/prefill_seq_block_ids_4x64xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/2048/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-10T18:56:57-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 1.13, 1.23, 3.41
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time 725 ms 725 ms 1 items_per_second=1.37975/s
# BM_prefill_bs4/process_time/real_time 727 ms 728 ms 1 items_per_second=1.3762/s
# BM_prefill_bs4/process_time/real_time 727 ms 728 ms 1 items_per_second=1.37512/s
# BM_prefill_bs4/process_time/real_time_mean 726 ms 727 ms 3 items_per_second=1.37703/s
# BM_prefill_bs4/process_time/real_time_median 727 ms 728 ms 3 items_per_second=1.3762/s
# BM_prefill_bs4/process_time/real_time_stddev 1.28 ms 1.41 ms 3 items_per_second=2.42255m/s
# BM_prefill_bs4/process_time/real_time_cv 0.18 % 0.19 % 3 items_per_second=0.18%
# 2048 decode
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/2048/fp8_2048.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=decode_bs4 \
--input=4x1xi64=@/sharedfile/2048/decode/decode_next_tokens_4x1xi64.bin \
--input=4xi64=@/sharedfile/2048/decode/decode_seq_lens_4xi64.bin \
--input=4xi64=@/sharedfile/2048/decode/decode_start_positions_4xi64.bin \
--input=4x65xi64=@/sharedfile/2048/decode/decode_seq_block_ids_tensor_4x65xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/2048/decode/decode_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-11T16:27:33-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 13.80, 17.30, 29.52
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# ------------------------------------------------------------------------------------------------------
# Benchmark Time CPU Iterations UserCounters...
# ------------------------------------------------------------------------------------------------------
# BM_decode_bs4/process_time/real_time 206 ms 206 ms 3 items_per_second=4.86574/s
# BM_decode_bs4/process_time/real_time 206 ms 207 ms 3 items_per_second=4.84683/s
# BM_decode_bs4/process_time/real_time 207 ms 207 ms 3 items_per_second=4.84188/s
# BM_decode_bs4/process_time/real_time_mean 206 ms 207 ms 3 items_per_second=4.85148/s
# BM_decode_bs4/process_time/real_time_median 206 ms 207 ms 3 items_per_second=4.84683/s
# BM_decode_bs4/process_time/real_time_stddev 0.534 ms 0.506 ms 3 items_per_second=0.0125877/s
# BM_decode_bs4/process_time/real_time_cv 0.26 % 0.24 % 3 items_per_second=0.26%
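# Rough tokens/s from the mean times above (my arithmetic; assumes all 4
# sequences use the full context for prefill and 1 token/sequence/step for
# decode):
awk 'BEGIN {
  printf "128 prefill:  %.0f tok/s\n", 4*128/0.714;   # ~717
  printf "128 decode:   %.1f tok/s\n", 4/0.0239;      # ~167.4
  printf "2048 prefill: %.0f tok/s\n", 4*2048/0.726;  # ~11284
  printf "2048 decode:  %.1f tok/s\n", 4/0.206;       # ~19.4
}'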