Skip to content

Instantly share code, notes, and snippets.

@AmosLewis
Last active July 11, 2025 18:43
Show Gist options
  • Save AmosLewis/9e5a0f64e9cbc4254fe1e93621028183 to your computer and use it in GitHub Desktop.
# Resolve the IREE / shark-ai nightly tags from the command line:
#   $1 -> iree_day, $2 -> shark_day. Empty or missing args fall back to
# the defaults below (a message is printed when a default is used).
iree_day="${1:-}"
if [ -z "$iree_day" ]; then
iree_day="0710"
echo "No flag provided. Using default iree_day $iree_day."
fi
shark_day="${2:-}"
if [ -z "$shark_day" ]; then
shark_day="0710alex"
echo "No flag provided. Using default shark_day $shark_day."
fi
# Model weights (.irpa) and tokenizer locations differ per host; uncomment
# the pair matching the machine you are on. The active pair below is for
# SharkMI300x-3.
# on SharkMI300x
# IRPA_PATH=/shark-dev/405b/instruct/weights/llama3.1_405b_fp16.irpa
# TOKENIZER_JSON=/shark-dev/405b/instruct/tokenizer.json
# on SharkMI300x-3 (active)
irpa_path=/shark-dev/llama3.1/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa
TOKENIZER_JSON=/shark-dev/llama3.1/405b/instruct/tokenizer.json
# NOTE(review): naming is inconsistent — lower-case irpa_path vs upper-case
# TOKENIZER_JSON (other hosts use IRPA_PATH). Later commands reference
# $irpa_path, so do not rename without updating those uses.
# On chiliu12@quanta-ccs-aus-f14-01 MI325
# IRPA_PATH=/shark-dev/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa
# TOKENIZER_JSON=/shark-dev/405b/instruct/tokenizer.json
# On ssh [email protected]
# IRPA_PATH=/shark-dev/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa
# TOKENIZER_JSON=/shark-dev/405b/instruct/tokenizer.json
# # To check numerics perplexity_iree
# # python -m sharktank.evaluate.perplexity_iree \
# # --irpa-file=/shark-dev/llama3.1/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa \
# # --tokenizer-config-json=/shark-dev/llama3.1/405b/instruct/tokenizer_config.json \
# # --iree-device='hip://0' \
# # --iree-device='hip://1' \
# # --iree-device='hip://2' \
# # --iree-device='hip://3' \
# # --iree-device='hip://4' \
# # --iree-device='hip://5' \
# # --iree-device='hip://6' \
# # --iree-device='hip://7' \
# # --iree-hal-target-device=hip \
# # --iree-hip-target=gfx942 \
# # --pipeline-parallelism-size=8 \
# # --num-prompts=4 \
# # --use-attention-mask \
# # --verbose
# Derived artifact paths for the 128-token pp8 run. The shared output
# directory and filename stem are hoisted so each appears only once
# (previously repeated on every line).
out_dir_128="/sharedfile/f16/128/405b/pp8/out"
stem_128="f16_iree$iree_day.shark$shark_day"
mlir_path_128="$out_dir_128/$stem_128.mlir"
config_128="$out_dir_128/$stem_128.json"
vmfb_128="$out_dir_128/$stem_128.prefill.vmfb"
vmfb_128_decode="$out_dir_128/$stem_128.decode.vmfb"
benchmark_128_prefill="$out_dir_128/$stem_128.prefill.txt"
benchmark_128_decode="$out_dir_128/$stem_128.decode.txt"
#######################################
# Ensure a file exists, creating missing parent directories first so that
# `touch` cannot fail on a freshly wiped output tree.
# Arguments: $1 - path of the file to create
# Outputs:   one status line to stdout
#######################################
create_file_if_not_exists() {
  local file="$1"
  if [ ! -f "$file" ]; then
    mkdir -p -- "$(dirname -- "$file")"
    touch -- "$file"
    echo "File created: $file"
  else
    echo "File already exists: $file"
  fi
}
# Show the derived artifact paths and pre-create the benchmark logs so
# `tee` has a target. Expansions quoted (SC2086).
echo "$mlir_path_128"
echo "$config_128"
echo "$vmfb_128"
echo "$vmfb_128_decode"
echo "$benchmark_128_prefill"
create_file_if_not_exists "$benchmark_128_prefill"
echo "$benchmark_128_decode"
create_file_if_not_exists "$benchmark_128_decode"
# Export the paged LLM to MLIR + json config: batch size 4 for prefill and
# decode, fp16 throughout, pipeline-parallel across 8 devices (tp=1).
# Progress echo restored for consistency with the other phases; abort if
# the export fails since every later step depends on its outputs.
echo "export model: "
python3 -m sharktank.examples.export_paged_llm_v1 \
  --irpa-file="$irpa_path" \
  --output-mlir="$mlir_path_128" \
  --output-config="$config_128" \
  --bs-prefill=4 \
  --bs-decode=4 \
  --block-seq-stride=32 \
  --attention-dtype=float16 \
  --activation-dtype=float16 \
  --tensor-parallelism-size=1 \
  --pipeline-parallelism-size=8 \
  --attention-kernel=torch \
  --kv-cache-dtype=float16 \
  --use-attention-mask \
  || exit 1
echo "iree-compile: "
# Compile the exported MLIR for 8 HIP devices (prefill vmfb). Expansions
# quoted (SC2086); abort on failure since the benchmarks need the vmfb.
iree-compile \
  "$mlir_path_128" \
  --iree-hip-target=gfx950 \
  -o="$vmfb_128" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-dispatch-creation-propagate-collapse-across-expands=true \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-codegen-enable-default-tuning-specs=true \
  --iree-stream-affinity-solver-max-iterations=1024 \
  || exit 1
echo "iree-benchmark-module prefill: "
# Benchmark prefill_bs4 at seq len 128: 4 token/seq-len/page-table inputs
# plus 8 cache buffers (sizes match the commented IR signatures below —
# presumably one per pipeline stage; confirm against the exported IR).
# Results are tee'd into the prefill log. Expansions quoted (SC2086).
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_128" \
  --parameters=model="$irpa_path" \
  --device=hip://0 \
  --device=hip://1 \
  --device=hip://2 \
  --device=hip://3 \
  --device=hip://4 \
  --device=hip://5 \
  --device=hip://6 \
  --device=hip://7 \
  --function=prefill_bs4 \
  --input=4x128xi64 \
  --input=4xi64 \
  --input=4x4xi64 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x983040xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x983040xf16 \
  --benchmark_repetitions=10 \
  | tee "$benchmark_128_prefill"
# # %arg3: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# # %arg4: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_1>},
# # %arg5: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_2>},
# # %arg6: !torch.tensor<[?,983040],f16> {iree.abi.affinity = #hal.device.promise<@__device_3>},
# # %arg7: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_4>},
# # %arg8: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_5>},
# # %arg9: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_6>},
# # %arg10: !torch.tensor<[?,983040],f16> {iree.abi.affinity = #hal.device.promise<@__device_7>}
echo "iree-compile decode: "
# Compile the decode vmfb from the same MLIR. NOTE(review): this invocation
# was identical to the prefill compile except that it omitted
# --iree-codegen-enable-default-tuning-specs=true; the flag is added here to
# match — remove it if the omission was intentional. Abort on failure.
iree-compile \
  "$mlir_path_128" \
  --iree-hip-target=gfx950 \
  -o="$vmfb_128_decode" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-dispatch-creation-propagate-collapse-across-expands=true \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-codegen-enable-default-tuning-specs=true \
  --iree-stream-affinity-solver-max-iterations=1024 \
  || exit 1
echo "iree-benchmark-module decode: "
# Benchmark decode_bs4 after a 128-token prefill: single-token step inputs
# plus the 8 cache buffers (shapes match the commented IR signatures below).
# Expansions quoted (SC2086); results tee'd into the decode log.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_128_decode" \
  --parameters=model="$irpa_path" \
  --device=hip://0 \
  --device=hip://1 \
  --device=hip://2 \
  --device=hip://3 \
  --device=hip://4 \
  --device=hip://5 \
  --device=hip://6 \
  --device=hip://7 \
  --function=decode_bs4 \
  --input=4x1xi64 \
  --input=4xi64 \
  --input=4xi64 \
  --input=4x5xi64 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x983040xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x1048576xf16 \
  --input=128x983040xf16 \
  --benchmark_repetitions=10 \
  | tee "$benchmark_128_decode"
# # %arg0: !torch.vtensor<[4,1],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# # %arg1: !torch.vtensor<[4],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# # %arg2: !torch.vtensor<[4],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# # %arg3: !torch.vtensor<[4,?],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# # %arg4: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_0>},
# # %arg5: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_1>},
# # %arg6: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_2>},
# # %arg7: !torch.tensor<[?,983040],f16> {iree.abi.affinity = #hal.device.promise<@__device_3>},
# # %arg8: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_4>},
# # %arg9: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_5>},
# # %arg10: !torch.tensor<[?,1048576],f16> {iree.abi.affinity = #hal.device.promise<@__device_6>},
# # %arg11: !torch.tensor<[?,983040],f16> {iree.abi.affinity = #hal.device.promise<@__device_7>})
###########################################2048###############################################################################################
# 2048-token run. The 128 MLIR and vmfbs are reused (per-length artifact
# paths left commented, matching the original intent); only the benchmark
# inputs and log paths change. The dead store to vmfb_2048_decode (it was
# assigned a fresh path and immediately overwritten) is commented out like
# its siblings.
# mlir_path_2048="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.mlir"
mlir_path_2048="$mlir_path_128"
config_2048="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.json"
# vmfb_2048="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.prefill.vmfb"
vmfb_2048="$vmfb_128"
# vmfb_2048_decode="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.decode.vmfb"
vmfb_2048_decode="$vmfb_128_decode"
benchmark_2048_prefill="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.prefill.txt"
benchmark_2048_decode="/sharedfile/f16/2048/405b/pp8/out/f16_iree$iree_day.shark$shark_day.decode.txt"
# Show the 2048-run paths and pre-create its benchmark logs.
# Expansions quoted (SC2086).
echo "$mlir_path_2048"
echo "$config_2048"
echo "$vmfb_2048"
echo "$benchmark_2048_prefill"
create_file_if_not_exists "$benchmark_2048_prefill"
echo "$benchmark_2048_decode"
create_file_if_not_exists "$benchmark_2048_decode"
echo "iree-benchmark-module prefill: "
# Benchmark prefill_bs4 at seq len 2048 (page table 4x64, cache buffers
# sized 513 pages). Expansions quoted (SC2086); results tee'd to the log.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048" \
  --parameters=model="$irpa_path" \
  --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 \
  --function=prefill_bs4 \
  --input=4x2048xi64 \
  --input=4xi64 \
  --input=4x64xi64 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x983040xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x983040xf16 \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_prefill"
echo "iree-benchmark-module decode: "
# Benchmark decode_bs4 after a 2048-token prefill (page table 4x65, cache
# buffers sized 513 pages). Expansions quoted (SC2086).
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048_decode" \
  --parameters=model="$irpa_path" \
  --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 \
  --function=decode_bs4 \
  --input=4x1xi64 \
  --input=4xi64 \
  --input=4xi64 \
  --input=4x65xi64 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x983040xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x1048576xf16 \
  --input=513x983040xf16 \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_decode"
########################shortfin server#######################################################################################
# Offline shortfin benchmark over the freshly built prefill vmfb.
# TOKENIZER_JSON="/shark-dev/405b/instruct/tokenizer.json"
MODEL_CONFIG="$config_128"
VMFB="$vmfb_128"
IRPA_PATH="$irpa_path"
# NOTE(review): conc is never referenced — benchmark_tasks/workers_offline
# and the 4.json output name are hard-coded; wire them to $conc or drop it.
conc=4
RESULTS_DIR=/sharedfile/f16/128/405b/pp8/out
echo "shortfin_apps.llm.cli "
export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Abort if the checkout is missing rather than running the CLI from the
# wrong working directory (SC2164).
cd /home/chi/src/shark-ai/shortfin || exit 1
python3 -m shortfin_apps.llm.cli \
  --device hip \
  --tokenizer_json="$TOKENIZER_JSON" \
  --model_config="$MODEL_CONFIG" \
  --vmfb="$VMFB" \
  --parameters "$IRPA_PATH" \
  --benchmark \
  --benchmark_tasks=4 \
  --device_ids 0 1 2 3 4 5 6 7 \
  --stream \
  --input_token_length 128 \
  --decode_steps=1 \
  --workers_offline=4 \
  --output-json "$RESULTS_DIR/4.json"
# To check online serving numerics and get tracy
# # Terminal 1
# export SHORTFIN_ENABLE_TRACING=ON
# export TRACY_PORT=8007
# python -m shortfin_apps.llm.server \
# --tokenizer_json=/shark-dev/llama3.1/405b/instruct/tokenizer.json \
# --model_config=/sharedfile/f16/128/405b/pp8/out/f16_iree0707.shark0710alex.json \
# --vmfb=/sharedfile/f16/128/405b/pp8/out/f16_iree0707.shark0710alex.prefill.vmfb \
# --parameters=/shark-dev/llama3.1/405b/instruct/weights/fp16/llama3_405b_instruct_fp16.irpa \
# --device=hip \
# --device_ids 0 1 2 3 4 5 6 7
# # Terminal 2
# curl http://localhost:8007/generate -H "Content-Type: application/json" -d '{ "return_text_in_logprobs": "False", "text": "life is amazing, is it not? ", "sampling_params": {"max_completion_tokens": 20}}'
# # Terminal 3
# iree-tracy-capture -o mi300x3_405b_f16_pp8_trace.tracy -p 8007
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment