Last active
July 11, 2025 18:43
-
-
Save AmosLewis/9e5a0f64e9cbc4254fe1e93621028183 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parse optional positional arguments, falling back to dated defaults:
#   $1 - IREE build date tag   (default "0710")
#   $2 - shark-ai build date tag (default "0710alex")
# These tags are interpolated into every artifact path below.
if [ -z "$1" ]; then
  iree_day="0710"
  echo "No flag provided. Using default iree_day $iree_day."
else
  iree_day="$1"
fi
if [ -z "$2" ]; then
  shark_day="0710alex"
  echo "No flag provided. Using default shark_day $shark_day."
else
  shark_day="$2"
fi
# Model weight (IRPA) and tokenizer locations. The active pair below is
# for SharkMI300x-3; the commented alternatives are for other hosts.
# on SharkMI300x
# IRPA_PATH=/shark-dev/405b/instruct/weights/llama3.1_405b_fp16.irpa
# TOKENIZER_JSON=/shark-dev/405b/instruct/tokenizer.json
# on SharkMI300x-3
irpa_path=/shark-dev/llama3.1/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa
TOKENIZER_JSON=/shark-dev/llama3.1/405b/instruct/tokenizer.json
# On chiliu12@quanta-ccs-aus-f14-01 MI325
# IRPA_PATH=/shark-dev/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa
# TOKENIZER_JSON=/shark-dev/405b/instruct/tokenizer.json
# On ssh [email protected]
# IRPA_PATH=/shark-dev/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa
# TOKENIZER_JSON=/shark-dev/405b/instruct/tokenizer.json
# To check numerics with perplexity_iree:
# python -m sharktank.evaluate.perplexity_iree \
#   --irpa-file=/shark-dev/llama3.1/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa \
#   --tokenizer-config-json=/shark-dev/llama3.1/405b/instruct/tokenizer_config.json \
#   --iree-device='hip://0' \
#   --iree-device='hip://1' \
#   --iree-device='hip://2' \
#   --iree-device='hip://3' \
#   --iree-device='hip://4' \
#   --iree-device='hip://5' \
#   --iree-device='hip://6' \
#   --iree-device='hip://7' \
#   --iree-hal-target-device=hip \
#   --iree-hip-target=gfx942 \
#   --pipeline-parallelism-size=8 \
#   --num-prompts=4 \
#   --use-attention-mask \
#   --verbose
# Artifact paths for the 128-token run (prefill seq len 128), all tagged
# with the iree/shark build dates so runs from different days coexist.
mlir_path_128="/sharedfile/f16/128/405b/pp8/out/f16_iree$iree_day.shark$shark_day.mlir"
config_128="/sharedfile/f16/128/405b/pp8/out/f16_iree$iree_day.shark$shark_day.json"
vmfb_128="/sharedfile/f16/128/405b/pp8/out/f16_iree$iree_day.shark$shark_day.prefill.vmfb"
vmfb_128_decode="/sharedfile/f16/128/405b/pp8/out/f16_iree$iree_day.shark$shark_day.decode.vmfb"
benchmark_128_prefill="/sharedfile/f16/128/405b/pp8/out/f16_iree$iree_day.shark$shark_day.prefill.txt"
benchmark_128_decode="/sharedfile/f16/128/405b/pp8/out/f16_iree$iree_day.shark$shark_day.decode.txt"
#######################################
# Create an empty file unless it already exists.
# Arguments: $1 - path of the file to create
# Outputs:   status line ("File created: ..." or
#            "File already exists: ...") to stdout
#######################################
create_file_if_not_exists() {
  local FILE="$1"
  if [ ! -f "$FILE" ]; then
    touch "$FILE"
    echo "File created: $FILE"
  else
    echo "File already exists: $FILE"
  fi
}
# Log the derived artifact paths and pre-create the two benchmark log
# files (expansions quoted per SC2086).
echo "$mlir_path_128"
echo "$config_128"
echo "$vmfb_128"
echo "$vmfb_128_decode"
echo "$benchmark_128_prefill"
create_file_if_not_exists "$benchmark_128_prefill"
echo "$benchmark_128_decode"
create_file_if_not_exists "$benchmark_128_decode"
# echo "export model: " | |
python3 -m sharktank.examples.export_paged_llm_v1 \ | |
--irpa-file=$irpa_path \ | |
--output-mlir=$mlir_path_128 \ | |
--output-config=$config_128 \ | |
--bs-prefill=4 \ | |
--bs-decode=4 \ | |
--block-seq-stride=32 \ | |
--attention-dtype=float16 \ | |
--activation-dtype=float16 \ | |
--tensor-parallelism-size=1 \ | |
--pipeline-parallelism-size=8 \ | |
--attention-kernel=torch \ | |
--kv-cache-dtype=float16 \ | |
--use-attention-mask | |
# Compile the exported MLIR into the prefill VMFB, targeting 8 HIP
# devices (gfx950) with O3 and the flags below.
echo "iree-compile: "
iree-compile \
  "$mlir_path_128" \
  --iree-hip-target=gfx950 \
  -o="$vmfb_128" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-dispatch-creation-propagate-collapse-across-expands=true \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-codegen-enable-default-tuning-specs=true \
  --iree-stream-affinity-solver-max-iterations=1024
echo "iree-benchmark-module prefill: " | |
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ | |
iree-benchmark-module \ | |
--hip_use_streams=true \ | |
--module=$vmfb_128 \ | |
--parameters=model=$irpa_path \ | |
--device=hip://0 \ | |
--device=hip://1 \ | |
--device=hip://2 \ | |
--device=hip://3 \ | |
--device=hip://4 \ | |
--device=hip://5 \ | |
--device=hip://6 \ | |
--device=hip://7 \ | |
--function=prefill_bs4 \ | |
--input=4x128xi64 \ | |
--input=4xi64 \ | |
--input=4x4xi64 \ | |
--input=128x1048576xf16 \ | |
--input=128x1048576xf16 \ | |
--input=128x1048576xf16 \ | |
--input=128x983040xf16 \ | |
--input=128x1048576xf16 \ | |
--input=128x1048576xf16 \ | |
--input=128x1048576xf16 \ | |
--input=128x983040xf16 \ | |
--benchmark_repetitions=10 \ | |
| tee $benchmark_128_prefill | |
# # %arg3: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_0>}, | |
# # %arg4: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_1>}, | |
# # %arg5: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_2>}, | |
# # %arg6: !torch.tensor<[?,983040],f16> {iree.abi.affinity = #hal.device.promise<@__device_3>}, | |
# # %arg7: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_4>}, | |
# # %arg8: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_5>}, | |
# # %arg9: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_6>}, | |
# # %arg10: !torch.tensor<[?,983040],f16> {iree.abi.affinity = #hal.device.promise<@__device_7>} | |
# Compile the same MLIR into a separate decode VMFB.
# NOTE(review): unlike the prefill compile, this invocation omits
# --iree-codegen-enable-default-tuning-specs=true — confirm intentional.
echo "iree-compile decode: "
iree-compile \
  "$mlir_path_128" \
  --iree-hip-target=gfx950 \
  -o="$vmfb_128_decode" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-dispatch-creation-propagate-collapse-across-expands=true \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-stream-affinity-solver-max-iterations=1024
# Benchmark decode_bs4: one new token per sequence (4x1), seq lens,
# start positions, page table (4x5), then the 8 fp16 KV-cache shards
# (shapes match the exported signature quoted below).
echo "iree-benchmark-module decode: "
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_128_decode" \
  --parameters=model="$irpa_path" \
  --device=hip://0 \
  --device=hip://1 \
  --device=hip://2 \
  --device=hip://3 \
  --device=hip://4 \
  --device=hip://5 \
  --device=hip://6 \
  --device=hip://7 \
  --function=decode_bs4 \
  --input=4x1xi64 \
  --input=4xi64 \
  --input=4xi64 \
  --input=4x5xi64 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x983040xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x983040xf16 \
  --benchmark_repetitions=10 \
  | tee "$benchmark_128_decode"
# Exported decode signature:
# %arg0: !torch.vtensor<[4,1],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# %arg1: !torch.vtensor<[4],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# %arg2: !torch.vtensor<[4],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# %arg3: !torch.vtensor<[4,?],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# %arg4: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# %arg5: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_1>},
# %arg6: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_2>},
# %arg7: !torch.tensor<[?,983040],f16> {iree.abi.affinity = #hal.device.promise<@__device_3>},
# %arg8: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_4>},
# %arg9: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_5>},
# %arg10: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_6>},
# %arg11: !torch.tensor<[?,983040],f16> {iree.abi.affinity = #hal.device.promise<@__device_7>})
###########################################2048###############################################################################################
# Artifact paths for the 2048-token run. The MLIR and both VMFBs are
# reused from the 128 run (only the benchmark inputs change); the
# commented lines are the dedicated-artifact alternative.
# NOTE(review): the dead assignment that was immediately overwritten by
# vmfb_2048_decode=$vmfb_128_decode is commented out to match the others.
# mlir_path_2048="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.mlir"
mlir_path_2048=$mlir_path_128
config_2048="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.json"
# vmfb_2048="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.prefill.vmfb"
vmfb_2048=$vmfb_128
# vmfb_2048_decode="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.decode.vmfb"
vmfb_2048_decode=$vmfb_128_decode
benchmark_2048_prefill="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.prefill.txt"
benchmark_2048_decode="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.decode.txt"
echo "$mlir_path_2048"
echo "$config_2048"
echo "$vmfb_2048"
echo "$benchmark_2048_prefill"
create_file_if_not_exists "$benchmark_2048_prefill"
echo "$benchmark_2048_decode"
create_file_if_not_exists "$benchmark_2048_decode"
# Benchmark prefill_bs4 at sequence length 2048: page table is 4x64
# (2048 tokens / stride 32 = 64 pages), cache shards sized 513 pages.
echo "iree-benchmark-module prefill: "
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048" \
  --parameters=model="$irpa_path" \
  --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 \
  --function=prefill_bs4 \
  --input=4x2048xi64 \
  --input=4xi64 \
  --input=4x64xi64 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x983040xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x983040xf16 \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_prefill"
# Benchmark decode_bs4 after a 2048-token prefill: page table is 4x65
# (64 pages plus one for the token being decoded — TODO confirm).
echo "iree-benchmark-module decode: "
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048_decode" \
  --parameters=model="$irpa_path" \
  --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 \
  --function=decode_bs4 \
  --input=4x1xi64 \
  --input=4xi64 \
  --input=4xi64 \
  --input=4x65xi64 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x983040xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x983040xf16 \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_decode"
########################shortfin server#######################################################################################
# Serve/benchmark the compiled 128-token prefill artifacts through the
# shortfin LLM CLI (4 offline tasks, 128-token prompts, 1 decode step).
# TOKENIZER_JSON="/shark-dev/405b/instruct/tokenizer.json"
MODEL_CONFIG="$config_128"
VMFB="$vmfb_128"
IRPA_PATH="$irpa_path"
conc=4  # NOTE(review): unused below — presumably intended concurrency; verify
RESULTS_DIR=/sharedfile/f16/128/405b/pp8/out
echo "shortfin_apps.llm.cli "
export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Abort if the shortfin checkout is missing rather than running the CLI
# from the wrong working directory.
cd /home/chi/src/shark-ai/shortfin || exit 1
python3 -m shortfin_apps.llm.cli \
  --device hip \
  --tokenizer_json="$TOKENIZER_JSON" \
  --model_config="$MODEL_CONFIG" \
  --vmfb="$VMFB" \
  --parameters "$IRPA_PATH" \
  --benchmark \
  --benchmark_tasks=4 \
  --device_ids 0 1 2 3 4 5 6 7 \
  --stream \
  --input_token_length 128 \
  --decode_steps=1 \
  --workers_offline=4 \
  --output-json "$RESULTS_DIR/4.json"
# To check online serving numerics and get a tracy capture:
# # Terminal 1
# export SHORTFIN_ENABLE_TRACING=ON
# export TRACY_PORT=8007
# python -m shortfin_apps.llm.server \
#   --tokenizer_json=/shark-dev/llama3.1/405b/instruct/tokenizer.json \
#   --model_config=/sharedfile/f16/128/405b/pp8/out/f16_iree0707.shark0710alex.json \
#   --vmfb=/sharedfile/f16/128/405b/pp8/out/f16_iree0707.shark0710alex.prefill.vmfb \
#   --parameters=/shark-dev/llama3.1/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa \
#   --device=hip \
#   --device_ids 0 1 2 3 4 5 6 7
# # Terminal 2
# curl http://localhost:8007/generate -H "Content-Type: application/json" -d '{ "return_text_in_logprobs": "False", "text": "life is amazing, is it not? ", "sampling_params": {"max_completion_tokens": 20}}'
# # Terminal 3
# iree-tracy-capture -o mi300x3_405b_f16_pp8_trace.tracy -p 8007
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment