#!/bin/bash
# Check if command-line arguments are provided; otherwise fall back to defaults.
if [ -z "$1" ]; then
  iree_day="0616"
  echo "No argument provided. Using default iree_day $iree_day."
else
  iree_day="$1"
fi
if [ -z "$2" ]; then
  shark_day="0616"
  echo "No argument provided. Using default shark_day $shark_day."
else
  shark_day="$2"
fi
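# Example invocation (the script filename below is hypothetical; use whatever
# name this file is saved under):
#   bash benchmark_llama_8b_f16_tp8.sh 0616 0616
# The two positional arguments tag the generated artifacts and appear to
# correspond to the IREE and sharktank build dates being benchmarked.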
irpa_path=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa

# pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 \
#   --iree-device=hip://4 -k testBenchmark8b_f16_TP8_Non_Decomposed_Input_Len_128
mlir_path_128="/sharedfile/f16/128/8b/tp8/out/f16_iree$iree_day.shark$shark_day.mlir"
config_128="/sharedfile/f16/128/8b/tp8/out/f16_iree$iree_day.shark$shark_day.json"
vmfb_128="/sharedfile/f16/128/8b/tp8/out/f16_iree$iree_day.shark$shark_day.prefill.vmfb"
vmfb_128_decode="/sharedfile/f16/128/8b/tp8/out/f16_iree$iree_day.shark$shark_day.decode.vmfb"
benchmark_128_prefill="/sharedfile/f16/128/8b/tp8/out/f16_iree$iree_day.shark$shark_day.prefill.txt"
benchmark_128_decode="/sharedfile/f16/128/8b/tp8/out/f16_iree$iree_day.shark$shark_day.decode.txt"
create_file_if_not_exists() {
  local FILE="$1"
  if [ ! -f "$FILE" ]; then
    touch "$FILE"
    echo "File created: $FILE"
  else
    echo "File already exists: $FILE"
  fi
}
echo "$mlir_path_128"
echo "$config_128"
echo "$vmfb_128"
echo "$vmfb_128_decode"
echo "$benchmark_128_prefill"
create_file_if_not_exists "$benchmark_128_prefill"
echo "$benchmark_128_decode"
create_file_if_not_exists "$benchmark_128_decode"
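# Step 1 (input length 128): export the paged Llama model to MLIR plus a runtime
# config. Notes on the flags below, inferred from the command and the artifact
# names used elsewhere in this script:
#   --bs-prefill/--bs-decode=4    -> exported entry points prefill_bs4 / decode_bs4
#   --block-seq-stride=32         -> paged KV-cache block size of 32 tokens
#   --tensor-parallelism-size=8   -> weights and KV cache sharded across 8 GPUs (TP8)
#   --attention-kernel=torch      -> non-decomposed (torch SDPA) attention path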
echo "python3: " | |
python3 -m sharktank.examples.export_paged_llm_v1 \ | |
--irpa-file=$irpa_path \ | |
--output-mlir=$mlir_path_128 \ | |
--output-config=$config_128 \ | |
--bs-prefill=4 \ | |
--bs-decode=4 \ | |
--block-seq-stride=32 \ | |
--attention-dtype=float16 \ | |
--activation-dtype=float16 \ | |
--tensor-parallelism-size=8 \ | |
--pipeline-parallelism-size=1 \ | |
--attention-kernel=torch | |
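# Step 2: compile the exported MLIR for prefill. The eight --iree-hal-target-device
# entries give the module eight logical HIP devices to match the TP8 sharding, and
# --iree-hip-target=gfx942 targets MI300-series GPUs.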
echo "iree-compile: " | |
iree-compile \ | |
$mlir_path_128 \ | |
--iree-hip-target=gfx942 \ | |
-o=$vmfb_128 \ | |
--iree-hal-target-device="hip[0]" \ | |
--iree-hal-target-device="hip[1]" \ | |
--iree-hal-target-device="hip[2]" \ | |
--iree-hal-target-device="hip[3]" \ | |
--iree-hal-target-device="hip[4]" \ | |
--iree-hal-target-device="hip[5]" \ | |
--iree-hal-target-device="hip[6]" \ | |
--iree-hal-target-device="hip[7]" \ | |
--iree-opt-level=O3 \ | |
--iree-hal-indirect-command-buffers=true \ | |
--iree-stream-resource-memory-model=discrete \ | |
--iree-hal-memoization=true \ | |
--iree-codegen-enable-default-tuning-specs=true \ | |
--iree-stream-affinity-solver-max-iterations=1024 | |
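# Step 3: benchmark prefill. ROCR_VISIBLE_DEVICES exposes all eight GPUs; the
# unranked .irpa plus the rank0..rank7 .irpa files supply the sharded weights,
# and the cs_f16_shard_*.npy inputs are the per-shard KV-cache arguments for
# batch size 4 with 128-token prompts.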
echo "iree-benchmark-module: " | |
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ | |
iree-benchmark-module \ | |
--hip_use_streams=true \ | |
--module=$vmfb_128 \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank0.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank1.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank2.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank3.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank4.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank5.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank6.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank7.irpa \ | |
--device=hip://0 \ | |
--device=hip://1 \ | |
--device=hip://2 \ | |
--device=hip://3 \ | |
--device=hip://4 \ | |
--device=hip://5 \ | |
--device=hip://6 \ | |
--device=hip://7 \ | |
--function=prefill_bs4 \ | |
--input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/tokens.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/seq_lens.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/seq_block_ids.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_0.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_1.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_2.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_3.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_4.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_5.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_6.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_7.npy \ | |
--benchmark_repetitions=10 \ | |
| tee $benchmark_128_prefill | |
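# A minimal sketch (not part of the original flow) for pulling the aggregate
# rows out of the saved log, assuming the usual Google Benchmark-style output
# that iree-benchmark-module emits when --benchmark_repetitions is set:
#   grep -E "mean|median|stddev" "$benchmark_128_prefill"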
echo "iree-compile decode: " | |
iree-compile \ | |
$mlir_path_128 \ | |
--iree-hip-target=gfx942 \ | |
-o=$vmfb_128_decode \ | |
--iree-hal-target-device="hip[0]" \ | |
--iree-hal-target-device="hip[1]" \ | |
--iree-hal-target-device="hip[2]" \ | |
--iree-hal-target-device="hip[3]" \ | |
--iree-hal-target-device="hip[4]" \ | |
--iree-hal-target-device="hip[5]" \ | |
--iree-hal-target-device="hip[6]" \ | |
--iree-hal-target-device="hip[7]" \ | |
--iree-opt-level=O3 \ | |
--iree-hal-indirect-command-buffers=true \ | |
--iree-stream-resource-memory-model=discrete \ | |
--iree-hal-memoization=true \ | |
--iree-stream-affinity-solver-max-iterations=1024 | |
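# Step 4: benchmark decode. Relative to prefill, decode_bs4 takes one extra
# input (start_positions.npy) alongside next_tokens, seq_lens, seq_block_ids,
# and the eight per-shard KV-cache arrays.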
echo "iree-benchmark-module decode: " | |
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ | |
iree-benchmark-module \ | |
--hip_use_streams=true \ | |
--module=$vmfb_128_decode \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank0.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank1.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank2.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank3.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank4.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank5.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank6.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank7.irpa \ | |
--device=hip://0 \ | |
--device=hip://1 \ | |
--device=hip://2 \ | |
--device=hip://3 \ | |
--device=hip://4 \ | |
--device=hip://5 \ | |
--device=hip://6 \ | |
--device=hip://7 \ | |
--function=decode_bs4 \ | |
--input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/next_tokens.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/seq_lens.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/start_positions.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/seq_block_ids.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_0.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_1.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_2.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_3.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_4.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_5.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_6.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_7.npy \ | |
--benchmark_repetitions=10 \ | |
| tee $benchmark_128_decode | |
############################################### 2048 ###############################################
# pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 \
#   --iree-device=hip://4 -k testBenchmark8b_f16_TP8_Non_Decomposed_Input_Len_2048
mlir_path_2048="/sharedfile/f16/2048/8b/tp8/out/f16_iree$iree_day.shark$shark_day.mlir"
config_2048="/sharedfile/f16/2048/8b/tp8/out/f16_iree$iree_day.shark$shark_day.json"
vmfb_2048="/sharedfile/f16/2048/8b/tp8/out/f16_iree$iree_day.shark$shark_day.prefill.vmfb"
vmfb_2048_decode="/sharedfile/f16/2048/8b/tp8/out/f16_iree$iree_day.shark$shark_day.decode.vmfb"
benchmark_2048_prefill="/sharedfile/f16/2048/8b/tp8/out/f16_iree$iree_day.shark$shark_day.prefill.txt"
benchmark_2048_decode="/sharedfile/f16/2048/8b/tp8/out/f16_iree$iree_day.shark$shark_day.decode.txt"
echo "$mlir_path_2048"
echo "$config_2048"
echo "$vmfb_2048"
echo "$benchmark_2048_prefill"
create_file_if_not_exists "$benchmark_2048_prefill"
echo "$benchmark_2048_decode"
create_file_if_not_exists "$benchmark_2048_decode"
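# The 2048-token section repeats the same export / compile / benchmark pipeline,
# this time reading inputs from prefill_args_bs4_2048_stride_32_tp8 and
# decode_args_bs4_2048_stride_32_tp8; the export and compile flags match the
# 128-token runs above.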
echo "python3 : " | |
python3 -m sharktank.examples.export_paged_llm_v1 \ | |
--irpa-file=$irpa_path \ | |
--output-mlir=$mlir_path_2048 \ | |
--output-config=$config_2048 \ | |
--bs-prefill=4 --bs-decode=4 --block-seq-stride=32 --attention-dtype=float16 \ | |
--activation-dtype=float16 --tensor-parallelism-size=8 --pipeline-parallelism-size=1 --attention-kernel=torch | |
echo "iree-compile : " | |
iree-compile \ | |
$mlir_path_2048 \ | |
--iree-hip-target=gfx942 \ | |
-o=$vmfb_2048\ | |
--iree-hal-target-device="hip[0]" \ | |
--iree-hal-target-device="hip[1]" \ | |
--iree-hal-target-device="hip[2]" \ | |
--iree-hal-target-device="hip[3]" \ | |
--iree-hal-target-device="hip[4]" \ | |
--iree-hal-target-device="hip[5]" \ | |
--iree-hal-target-device="hip[6]" \ | |
--iree-hal-target-device="hip[7]" \ | |
--iree-opt-level=O3 \ | |
--iree-hal-indirect-command-buffers=true \ | |
--iree-stream-resource-memory-model=discrete \ | |
--iree-hal-memoization=true \ | |
--iree-codegen-enable-default-tuning-specs=true \ | |
--iree-stream-affinity-solver-max-iterations=1024 | |
echo "iree-benchmark-module : " | |
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ | |
iree-benchmark-module \ | |
--hip_use_streams=true \ | |
--module=$vmfb_2048 \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank0.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank1.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank2.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank3.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank4.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank5.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank6.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank7.irpa \ | |
--device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 \ | |
--function=prefill_bs4 \ | |
--input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/tokens.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/seq_lens.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/seq_block_ids.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_0.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_1.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_2.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_3.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_4.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_5.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_6.npy \ | |
--input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_7.npy \ | |
| tee $benchmark_2048_prefill | |
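# Note: unlike the other three runs, this prefill invocation does not pass
# --benchmark_repetitions, so the saved log holds a single timing rather than
# mean/median/stddev aggregates.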
echo "iree-compile decode: " | |
iree-compile \ | |
$mlir_path_2048 \ | |
--iree-hip-target=gfx942 \ | |
-o=$vmfb_2048_decode\ | |
--iree-hal-target-device="hip[0]" \ | |
--iree-hal-target-device="hip[1]" \ | |
--iree-hal-target-device="hip[2]" \ | |
--iree-hal-target-device="hip[3]" \ | |
--iree-hal-target-device="hip[4]" \ | |
--iree-hal-target-device="hip[5]" \ | |
--iree-hal-target-device="hip[6]" \ | |
--iree-hal-target-device="hip[7]" \ | |
--iree-opt-level=O3 \ | |
--iree-hal-indirect-command-buffers=true \ | |
--iree-stream-resource-memory-model=discrete \ | |
--iree-hal-memoization=true \ | |
--iree-stream-affinity-solver-max-iterations=1024 | |
echo "iree-benchmark-module decode: " | |
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ | |
iree-benchmark-module \ | |
--hip_use_streams=true \ | |
--module=$vmfb_2048_decode \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank0.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank1.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank2.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank3.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank4.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank5.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank6.irpa \ | |
--parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank7.irpa \ | |
--device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 \ | |
--function=decode_bs4 \ | |
--input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/next_tokens.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/seq_lens.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/start_positions.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/seq_block_ids.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_0.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_1.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_2.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_3.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_4.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_5.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_6.npy \ | |
--input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_7.npy \ | |
--benchmark_repetitions=10 \ | |
| tee $benchmark_2048_decode | |
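# Optional wrap-up (a minimal sketch, not part of the original flow): echo the
# benchmark lines from all four saved logs. The grep pattern assumes the
# benchmark names contain the function names, as in Google Benchmark-style
# output from iree-benchmark-module.
for log in "$benchmark_128_prefill" "$benchmark_128_decode" \
           "$benchmark_2048_prefill" "$benchmark_2048_decode"; do
  echo "==== $log ===="
  grep -E "prefill_bs4|decode_bs4" "$log"
done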