Skip to content

Instantly share code, notes, and snippets.

@AmosLewis
Last active July 11, 2025 18:43
Show Gist options
  • Save AmosLewis/9e5a0f64e9cbc4254fe1e93621028183 to your computer and use it in GitHub Desktop.
# Resolve the IREE / shark-ai nightly tags from the command line:
#   $1 -> iree_day, $2 -> shark_day. Empty or missing args fall back to
# the defaults below (a message is printed when a default is used).
iree_day="${1:-}"
if [ -z "$iree_day" ]; then
iree_day="0710"
echo "No flag provided. Using default iree_day $iree_day."
fi
shark_day="${2:-}"
if [ -z "$shark_day" ]; then
shark_day="0710alex"
echo "No flag provided. Using default shark_day $shark_day."
fi
# Model weights (.irpa) and tokenizer locations differ per host; uncomment
# the pair matching the machine you are on. The active pair below is for
# SharkMI300x-3.
# on SharkMI300x
# IRPA_PATH=/shark-dev/405b/instruct/weights/llama3.1_405b_fp16.irpa
# TOKENIZER_JSON=/shark-dev/405b/instruct/tokenizer.json
# on SharkMI300x-3 (active)
irpa_path=/shark-dev/llama3.1/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa
TOKENIZER_JSON=/shark-dev/llama3.1/405b/instruct/tokenizer.json
# NOTE(review): naming is inconsistent — lower-case irpa_path vs upper-case
# TOKENIZER_JSON (other hosts use IRPA_PATH). Later commands reference
# $irpa_path, so do not rename without updating those uses.
# On chiliu12@quanta-ccs-aus-f14-01 MI325
# IRPA_PATH=/shark-dev/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa
# TOKENIZER_JSON=/shark-dev/405b/instruct/tokenizer.json
# On ssh [email protected]
# IRPA_PATH=/shark-dev/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa
# TOKENIZER_JSON=/shark-dev/405b/instruct/tokenizer.json
# # To check numerics perplexity_iree
# # python -m sharktank.evaluate.perplexity_iree \
# # --irpa-file=/shark-dev/llama3.1/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa \
# # --tokenizer-config-json=/shark-dev/llama3.1/405b/instruct/tokenizer_config.json \
# # --iree-device='hip://0' \
# # --iree-device='hip://1' \
# # --iree-device='hip://2' \
# # --iree-device='hip://3' \
# # --iree-device='hip://4' \
# # --iree-device='hip://5' \
# # --iree-device='hip://6' \
# # --iree-device='hip://7' \
# # --iree-hal-target-device=hip \
# # --iree-hip-target=gfx942 \
# # --pipeline-parallelism-size=8 \
# # --num-prompts=4 \
# # --use-attention-mask \
# # --verbose
# Derived artifact paths for the 128-token pp8 run. The shared output
# directory and filename stem are hoisted so each appears only once
# (previously repeated on every line).
out_dir_128="/sharedfile/f16/128/405b/pp8/out"
stem_128="f16_iree$iree_day.shark$shark_day"
mlir_path_128="$out_dir_128/$stem_128.mlir"
config_128="$out_dir_128/$stem_128.json"
vmfb_128="$out_dir_128/$stem_128.prefill.vmfb"
vmfb_128_decode="$out_dir_128/$stem_128.decode.vmfb"
benchmark_128_prefill="$out_dir_128/$stem_128.prefill.txt"
benchmark_128_decode="$out_dir_128/$stem_128.decode.txt"
#######################################
# Ensure a file exists, creating missing parent directories first so that
# `touch` cannot fail on a freshly wiped output tree.
# Arguments: $1 - path of the file to create
# Outputs:   one status line to stdout
#######################################
create_file_if_not_exists() {
  local file="$1"
  if [ ! -f "$file" ]; then
    mkdir -p -- "$(dirname -- "$file")"
    touch -- "$file"
    echo "File created: $file"
  else
    echo "File already exists: $file"
  fi
}
# Show the derived artifact paths and pre-create the benchmark logs so
# `tee` has a target. Expansions quoted (SC2086).
echo "$mlir_path_128"
echo "$config_128"
echo "$vmfb_128"
echo "$vmfb_128_decode"
echo "$benchmark_128_prefill"
create_file_if_not_exists "$benchmark_128_prefill"
echo "$benchmark_128_decode"
create_file_if_not_exists "$benchmark_128_decode"
# Export the paged LLM to MLIR + json config: batch size 4 for prefill and
# decode, fp16 throughout, pipeline-parallel across 8 devices (tp=1).
# Progress echo restored for consistency with the other phases; abort if
# the export fails since every later step depends on its outputs.
echo "export model: "
python3 -m sharktank.examples.export_paged_llm_v1 \
  --irpa-file="$irpa_path" \
  --output-mlir="$mlir_path_128" \
  --output-config="$config_128" \
  --bs-prefill=4 \
  --bs-decode=4 \
  --block-seq-stride=32 \
  --attention-dtype=float16 \
  --activation-dtype=float16 \
  --tensor-parallelism-size=1 \
  --pipeline-parallelism-size=8 \
  --attention-kernel=torch \
  --kv-cache-dtype=float16 \
  --use-attention-mask \
  || exit 1
echo "iree-compile: "
# Compile the exported MLIR for 8 HIP devices (prefill vmfb). Expansions
# quoted (SC2086); abort on failure since the benchmarks need the vmfb.
iree-compile \
  "$mlir_path_128" \
  --iree-hip-target=gfx950 \
  -o="$vmfb_128" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-dispatch-creation-propagate-collapse-across-expands=true \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-codegen-enable-default-tuning-specs=true \
  --iree-stream-affinity-solver-max-iterations=1024 \
  || exit 1
echo "iree-benchmark-module prefill: "
# Benchmark prefill_bs4 at seq len 128: 4 token/seq-len/page-table inputs
# plus 8 cache buffers (sizes match the commented IR signatures below —
# presumably one per pipeline stage; confirm against the exported IR).
# Results are tee'd into the prefill log. Expansions quoted (SC2086).
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_128" \
  --parameters=model="$irpa_path" \
  --device=hip://0 \
  --device=hip://1 \
  --device=hip://2 \
  --device=hip://3 \
  --device=hip://4 \
  --device=hip://5 \
  --device=hip://6 \
  --device=hip://7 \
  --function=prefill_bs4 \
  --input=4x128xi64 \
  --input=4xi64 \
  --input=4x4xi64 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x983040xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x983040xf16 \
  --benchmark_repetitions=10 \
  | tee "$benchmark_128_prefill"
# # %arg3: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# # %arg4: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_1>},
# # %arg5: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_2>},
# # %arg6: !torch.tensor<[?,983040],f16> {iree.abi.affinity = #hal.device.promise<@__device_3>},
# # %arg7: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_4>},
# # %arg8: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_5>},
# # %arg9: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_6>},
# # %arg10: !torch.tensor<[?,983040],f16> {iree.abi.affinity = #hal.device.promise<@__device_7>}
echo "iree-compile decode: "
# Compile the decode vmfb from the same MLIR. NOTE(review): this invocation
# was identical to the prefill compile except that it omitted
# --iree-codegen-enable-default-tuning-specs=true; the flag is added here to
# match — remove it if the omission was intentional. Abort on failure.
iree-compile \
  "$mlir_path_128" \
  --iree-hip-target=gfx950 \
  -o="$vmfb_128_decode" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-dispatch-creation-propagate-collapse-across-expands=true \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-codegen-enable-default-tuning-specs=true \
  --iree-stream-affinity-solver-max-iterations=1024 \
  || exit 1
echo "iree-benchmark-module decode: "
# Benchmark decode_bs4 after a 128-token prefill: single-token step inputs
# plus the 8 cache buffers (shapes match the commented IR signatures below).
# Expansions quoted (SC2086); results tee'd into the decode log.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_128_decode" \
  --parameters=model="$irpa_path" \
  --device=hip://0 \
  --device=hip://1 \
  --device=hip://2 \
  --device=hip://3 \
  --device=hip://4 \
  --device=hip://5 \
  --device=hip://6 \
  --device=hip://7 \
  --function=decode_bs4 \
  --input=4x1xi64 \
  --input=4xi64 \
  --input=4xi64 \
  --input=4x5xi64 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x983040xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x983040xf16 \
  --benchmark_repetitions=10 \
  | tee "$benchmark_128_decode"
# # %arg0: !torch.vtensor<[4,1],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# # %arg1: !torch.vtensor<[4],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# # %arg2: !torch.vtensor<[4],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# # %arg3: !torch.vtensor<[4,?],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# # %arg4: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# # %arg5: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_1>},
# # %arg6: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_2>},
# # %arg7: !torch.tensor<[?,983040],f16> {iree.abi.affinity = #hal.device.promise<@__device_3>},
# # %arg8: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_4>},
# # %arg9: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_5>},
# # %arg10: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_6>},
# # %arg11: !torch.tensor<[?,983040],f16> {iree.abi.affinity = #hal.device.promise<@__device_7>})
###########################################2048###############################################################################################
# 2048-token run. The 128 MLIR and vmfbs are reused (per-length artifact
# paths left commented, matching the original intent); only the benchmark
# inputs and log paths change. The dead store to vmfb_2048_decode (it was
# assigned a fresh path and immediately overwritten) is commented out like
# its siblings.
# mlir_path_2048="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.mlir"
mlir_path_2048="$mlir_path_128"
config_2048="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.json"
# vmfb_2048="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.prefill.vmfb"
vmfb_2048="$vmfb_128"
# vmfb_2048_decode="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.decode.vmfb"
vmfb_2048_decode="$vmfb_128_decode"
benchmark_2048_prefill="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.prefill.txt"
benchmark_2048_decode="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.decode.txt"
# Show the 2048-run paths and pre-create its benchmark logs.
# Expansions quoted (SC2086).
echo "$mlir_path_2048"
echo "$config_2048"
echo "$vmfb_2048"
echo "$benchmark_2048_prefill"
create_file_if_not_exists "$benchmark_2048_prefill"
echo "$benchmark_2048_decode"
create_file_if_not_exists "$benchmark_2048_decode"
echo "iree-benchmark-module prefill: "
# Benchmark prefill_bs4 at seq len 2048 (page table 4x64, cache buffers
# sized 513 pages). Expansions quoted (SC2086); results tee'd to the log.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048" \
  --parameters=model="$irpa_path" \
  --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 \
  --function=prefill_bs4 \
  --input=4x2048xi64 \
  --input=4xi64 \
  --input=4x64xi64 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x983040xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x983040xf16 \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_prefill"
echo "iree-benchmark-module decode: "
# Benchmark decode_bs4 after a 2048-token prefill (page table 4x65, cache
# buffers sized 513 pages). Expansions quoted (SC2086).
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048_decode" \
  --parameters=model="$irpa_path" \
  --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 \
  --function=decode_bs4 \
  --input=4x1xi64 \
  --input=4xi64 \
  --input=4xi64 \
  --input=4x65xi64 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x983040xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x983040xf16 \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_decode"
########################shortfin server#######################################################################################
# Offline shortfin benchmark over the freshly built prefill vmfb.
# TOKENIZER_JSON="/shark-dev/405b/instruct/tokenizer.json"
MODEL_CONFIG="$config_128"
VMFB="$vmfb_128"
IRPA_PATH="$irpa_path"
# NOTE(review): conc is never referenced — benchmark_tasks/workers_offline
# and the 4.json output name are hard-coded; wire them to $conc or drop it.
conc=4
RESULTS_DIR=/sharedfile/f16/128/405b/pp8/out
echo "shortfin_apps.llm.cli "
export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Abort if the checkout is missing rather than running the CLI from the
# wrong working directory (SC2164).
cd /home/chi/src/shark-ai/shortfin || exit 1
python3 -m shortfin_apps.llm.cli \
  --device hip \
  --tokenizer_json="$TOKENIZER_JSON" \
  --model_config="$MODEL_CONFIG" \
  --vmfb="$VMFB" \
  --parameters "$IRPA_PATH" \
  --benchmark \
  --benchmark_tasks=4 \
  --device_ids 0 1 2 3 4 5 6 7 \
  --stream \
  --input_token_length 128 \
  --decode_steps=1 \
  --workers_offline=4 \
  --output-json "$RESULTS_DIR/4.json"
# To check online serving numerics and get tracy
# # Terminal 1
# export SHORTFIN_ENABLE_TRACING=ON
# export TRACY_PORT=8007
# python -m shortfin_apps.llm.server \
# --tokenizer_json=/shark-dev/llama3.1/405b/instruct/tokenizer.json \
# --model_config=/sharedfile/f16/128/405b/pp8/out/f16_iree0707.shark0710alex.json \
# --vmfb=/sharedfile/f16/128/405b/pp8/out/f16_iree0707.shark0710alex.prefill.vmfb \
# --parameters=/shark-dev/llama3.1/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa \
# --device=hip \
# --device_ids 0 1 2 3 4 5 6 7
# # Terminal 2
# curl http://localhost:8007/generate -H "Content-Type: application/json" -d '{ "return_text_in_logprobs": "False", "text": "life is amazing, is it not? ", "sampling_params": {"max_completion_tokens": 20}}'
# # Terminal 3
# iree-tracy-capture -o mi300x3_405b_f16_pp8_trace.tracy -p 8007
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment