#!/bin/bash
# Source gist: https://gist.github.com/AmosLewis/b484ae23230ec0eb427554df77d7b856
# Author: @AmosLewis — last active June 18, 2025.
# Nightly benchmark driver: exports, compiles, and benchmarks Llama 3.1 8B
# instruct (fp16, tensor-parallelism 8) prefill/decode with IREE on gfx942.
# Optional CLI args pick which nightly build dates to tag artifacts with:
#   $1 -> iree_day, $2 -> shark_day (both default to "0616").
iree_day="${1:-}"
if [ -z "$iree_day" ]; then
iree_day="0616"
echo "No flag provided. Using default iree_day $iree_day."
fi
shark_day="${2:-}"
if [ -z "$shark_day" ]; then
shark_day="0616"
echo "No flag provided. Using default shark_day $shark_day."
fi
# Sharded fp16 weights for Llama 3.1 8B instruct, tensor-parallel over 8 ranks.
irpa_path=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa
# Repro via pytest:
# pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 \
# --iree-device=hip://4 -k testBenchmark8b_f16_TP8_Non_Decomposed_Input_Len_128
# All 128-token artifacts share one date-tagged basename; only the suffix differs.
out_128="/sharedfile/f16/128/8b/tp8/out/f16_iree$iree_day.shark$shark_day"
mlir_path_128="$out_128.mlir"
config_128="$out_128.json"
vmfb_128="$out_128.prefill.vmfb"
vmfb_128_decode="$out_128.decode.vmfb"
benchmark_128_prefill="$out_128.prefill.txt"
benchmark_128_decode="$out_128.decode.txt"
#######################################
# Ensure a benchmark log file exists so `tee` has a valid target.
# Arguments: $1 - path of the file to create
# Outputs:   status line to stdout
#######################################
create_file_if_not_exists() {
  local FILE="$1"
  if [ ! -f "$FILE" ]; then
    # Fix: touch fails when the parent directory is missing; create it first.
    mkdir -p "$(dirname "$FILE")"
    touch "$FILE"
    echo "File created: $FILE"
  else
    echo "File already exists: $FILE"
  fi
}
# Print the 128-token artifact paths and make sure the benchmark logs exist.
# Fix: quote every expansion (SC2086) so word-splitting/globbing can't bite.
echo "$mlir_path_128"
echo "$config_128"
echo "$vmfb_128"
echo "$vmfb_128_decode"
echo "$benchmark_128_prefill"
create_file_if_not_exists "$benchmark_128_prefill"
echo "$benchmark_128_decode"
create_file_if_not_exists "$benchmark_128_decode"
echo "python3: "
# Export the paged LLM to MLIR + runtime config for the 128-token case.
# Fix: quote path expansions (SC2086).
python3 -m sharktank.examples.export_paged_llm_v1 \
  --irpa-file="$irpa_path" \
  --output-mlir="$mlir_path_128" \
  --output-config="$config_128" \
  --bs-prefill=4 \
  --bs-decode=4 \
  --block-seq-stride=32 \
  --attention-dtype=float16 \
  --activation-dtype=float16 \
  --tensor-parallelism-size=8 \
  --pipeline-parallelism-size=1 \
  --attention-kernel=torch
echo "iree-compile: "
# Compile the exported MLIR into a prefill vmfb targeting 8 HIP devices.
# Fix: quote path expansions (SC2086).
iree-compile \
  "$mlir_path_128" \
  --iree-hip-target=gfx942 \
  -o="$vmfb_128" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-codegen-enable-default-tuning-specs=true \
  --iree-stream-affinity-solver-max-iterations=1024
echo "iree-benchmark-module: "
# Benchmark prefill_bs4 (128 tokens) across 8 HIP devices; log via tee.
# Fix: quote $vmfb_128 and the tee target (SC2086).
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_128" \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank0.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank1.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank2.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank3.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank4.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank5.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank6.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank7.irpa \
  --device=hip://0 \
  --device=hip://1 \
  --device=hip://2 \
  --device=hip://3 \
  --device=hip://4 \
  --device=hip://5 \
  --device=hip://6 \
  --device=hip://7 \
  --function=prefill_bs4 \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/tokens.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/seq_lens.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/seq_block_ids.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_0.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_1.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_2.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_3.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_4.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_5.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_6.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_7.npy \
  --benchmark_repetitions=10 \
  | tee "$benchmark_128_prefill"
echo "iree-compile decode: "
# Compile a separate vmfb for decode. Fix: quote path expansions (SC2086).
# NOTE(review): unlike the prefill compiles, this one omits
# --iree-codegen-enable-default-tuning-specs=true — confirm that is intentional.
iree-compile \
  "$mlir_path_128" \
  --iree-hip-target=gfx942 \
  -o="$vmfb_128_decode" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-stream-affinity-solver-max-iterations=1024
echo "iree-benchmark-module decode: "
# Benchmark decode_bs4 (128-token KV state) across 8 HIP devices; log via tee.
# Fix: quote $vmfb_128_decode and the tee target (SC2086).
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_128_decode" \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank0.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank1.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank2.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank3.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank4.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank5.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank6.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank7.irpa \
  --device=hip://0 \
  --device=hip://1 \
  --device=hip://2 \
  --device=hip://3 \
  --device=hip://4 \
  --device=hip://5 \
  --device=hip://6 \
  --device=hip://7 \
  --function=decode_bs4 \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/next_tokens.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/seq_lens.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/start_positions.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/seq_block_ids.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_0.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_1.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_2.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_3.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_4.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_5.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_6.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_7.npy \
  --benchmark_repetitions=10 \
  | tee "$benchmark_128_decode"
###########################################2048###############################################################################################
# Repro via pytest:
# pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 \
# --iree-device=hip://4 -k testBenchmark8b_f16_TP8_Non_Decomposed_Input_Len_2048
# All 2048-token artifacts share one date-tagged basename; only the suffix differs.
out_2048="/sharedfile/f16/2048/8b/tp8/out/f16_iree$iree_day.shark$shark_day"
mlir_path_2048="$out_2048.mlir"
config_2048="$out_2048.json"
vmfb_2048="$out_2048.prefill.vmfb"
vmfb_2048_decode="$out_2048.decode.vmfb"
benchmark_2048_prefill="$out_2048.prefill.txt"
benchmark_2048_decode="$out_2048.decode.txt"
# Print the 2048-token artifact paths and make sure the benchmark logs exist.
# Fix: quote every expansion (SC2086); also echo the decode vmfb path for
# consistency with the 128-token section, which prints it.
echo "$mlir_path_2048"
echo "$config_2048"
echo "$vmfb_2048"
echo "$vmfb_2048_decode"
echo "$benchmark_2048_prefill"
create_file_if_not_exists "$benchmark_2048_prefill"
echo "$benchmark_2048_decode"
create_file_if_not_exists "$benchmark_2048_decode"
echo "python3 : "
# Export the paged LLM to MLIR + runtime config for the 2048-token case.
# Fix: quote path expansions (SC2086); one flag per line to match the
# 128-token invocation.
python3 -m sharktank.examples.export_paged_llm_v1 \
  --irpa-file="$irpa_path" \
  --output-mlir="$mlir_path_2048" \
  --output-config="$config_2048" \
  --bs-prefill=4 \
  --bs-decode=4 \
  --block-seq-stride=32 \
  --attention-dtype=float16 \
  --activation-dtype=float16 \
  --tensor-parallelism-size=8 \
  --pipeline-parallelism-size=1 \
  --attention-kernel=torch
echo "iree-compile : "
# Compile the 2048-token MLIR into a prefill vmfb.
# BUG FIX: the original wrote `-o=$vmfb_2048\` with no space before the
# line-continuation backslash, gluing the next flag onto the output filename
# (one argument `-o=...vmfb--iree-hal-target-device=hip[0]`). Also quote
# path expansions (SC2086).
iree-compile \
  "$mlir_path_2048" \
  --iree-hip-target=gfx942 \
  -o="$vmfb_2048" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-codegen-enable-default-tuning-specs=true \
  --iree-stream-affinity-solver-max-iterations=1024
echo "iree-benchmark-module : "
# Benchmark prefill_bs4 (2048 tokens) across 8 HIP devices; log via tee.
# Fixes: quote $vmfb_2048 and the tee target (SC2086); add
# --benchmark_repetitions=10 for consistency with the other three
# benchmark invocations in this script.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048" \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank0.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank1.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank2.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank3.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank4.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank5.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank6.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank7.irpa \
  --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 \
  --function=prefill_bs4 \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/tokens.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/seq_lens.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/seq_block_ids.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_0.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_1.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_2.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_3.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_4.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_5.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_6.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_7.npy \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_prefill"
echo "iree-compile decode: "
# Compile a separate vmfb for 2048-token decode.
# BUG FIX: the original wrote `-o=$vmfb_2048_decode\` with no space before the
# line-continuation backslash, gluing the next flag onto the output filename.
# Also quote path expansions (SC2086).
# NOTE(review): unlike the prefill compiles, this one omits
# --iree-codegen-enable-default-tuning-specs=true — confirm that is intentional.
iree-compile \
  "$mlir_path_2048" \
  --iree-hip-target=gfx942 \
  -o="$vmfb_2048_decode" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-stream-affinity-solver-max-iterations=1024
echo "iree-benchmark-module decode: "
# Benchmark decode_bs4 (2048-token KV state) across 8 HIP devices; log via tee.
# Fix: quote $vmfb_2048_decode and the tee target (SC2086).
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048_decode" \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank0.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank1.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank2.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank3.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank4.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank5.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank6.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank7.irpa \
  --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 \
  --function=decode_bs4 \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/next_tokens.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/seq_lens.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/start_positions.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/seq_block_ids.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_0.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_1.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_2.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_3.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_4.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_5.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_6.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_7.npy \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_decode"
# (GitHub gist page footer removed: "Sign up for free to join this conversation on GitHub.")