#!/bin/bash
# Source gist: https://gist.github.com/AmosLewis/b484ae23230ec0eb427554df77d7b856
# Author: @AmosLewis — last active June 18, 2025.
# Nightly benchmark driver: exports, compiles, and benchmarks Llama 3.1 8B
# instruct (fp16, tensor-parallelism 8) prefill/decode with IREE on gfx942.
# Optional CLI args pick which nightly build dates to tag artifacts with:
#   $1 -> iree_day, $2 -> shark_day (both default to "0616").
iree_day="${1:-}"
if [ -z "$iree_day" ]; then
iree_day="0616"
echo "No flag provided. Using default iree_day $iree_day."
fi
shark_day="${2:-}"
if [ -z "$shark_day" ]; then
shark_day="0616"
echo "No flag provided. Using default shark_day $shark_day."
fi
# Sharded fp16 weights for Llama 3.1 8B instruct, tensor-parallel over 8 ranks.
irpa_path=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa
# Repro via pytest:
# pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 \
# --iree-device=hip://4 -k testBenchmark8b_f16_TP8_Non_Decomposed_Input_Len_128
# All 128-token artifacts share one date-tagged basename; only the suffix differs.
out_128="/sharedfile/f16/128/8b/tp8/out/f16_iree$iree_day.shark$shark_day"
mlir_path_128="$out_128.mlir"
config_128="$out_128.json"
vmfb_128="$out_128.prefill.vmfb"
vmfb_128_decode="$out_128.decode.vmfb"
benchmark_128_prefill="$out_128.prefill.txt"
benchmark_128_decode="$out_128.decode.txt"
#######################################
# Ensure a benchmark log file exists so `tee` has a valid target.
# Arguments: $1 - path of the file to create
# Outputs:   status line to stdout
#######################################
create_file_if_not_exists() {
  local FILE="$1"
  if [ ! -f "$FILE" ]; then
    # Fix: touch fails when the parent directory is missing; create it first.
    mkdir -p "$(dirname "$FILE")"
    touch "$FILE"
    echo "File created: $FILE"
  else
    echo "File already exists: $FILE"
  fi
}
# Print the 128-token artifact paths and make sure the benchmark logs exist.
# Fix: quote every expansion (SC2086) so word-splitting/globbing can't bite.
echo "$mlir_path_128"
echo "$config_128"
echo "$vmfb_128"
echo "$vmfb_128_decode"
echo "$benchmark_128_prefill"
create_file_if_not_exists "$benchmark_128_prefill"
echo "$benchmark_128_decode"
create_file_if_not_exists "$benchmark_128_decode"
echo "python3: "
# Export the paged LLM to MLIR + runtime config for the 128-token case.
# Fix: quote path expansions (SC2086).
python3 -m sharktank.examples.export_paged_llm_v1 \
  --irpa-file="$irpa_path" \
  --output-mlir="$mlir_path_128" \
  --output-config="$config_128" \
  --bs-prefill=4 \
  --bs-decode=4 \
  --block-seq-stride=32 \
  --attention-dtype=float16 \
  --activation-dtype=float16 \
  --tensor-parallelism-size=8 \
  --pipeline-parallelism-size=1 \
  --attention-kernel=torch
echo "iree-compile: "
# Compile the exported MLIR into a prefill vmfb targeting 8 HIP devices.
# Fix: quote path expansions (SC2086).
iree-compile \
  "$mlir_path_128" \
  --iree-hip-target=gfx942 \
  -o="$vmfb_128" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-codegen-enable-default-tuning-specs=true \
  --iree-stream-affinity-solver-max-iterations=1024
echo "iree-benchmark-module: "
# Benchmark prefill_bs4 (128 tokens) across 8 HIP devices; log via tee.
# Fix: quote $vmfb_128 and the tee target (SC2086).
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_128" \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank0.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank1.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank2.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank3.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank4.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank5.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank6.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank7.irpa \
  --device=hip://0 \
  --device=hip://1 \
  --device=hip://2 \
  --device=hip://3 \
  --device=hip://4 \
  --device=hip://5 \
  --device=hip://6 \
  --device=hip://7 \
  --function=prefill_bs4 \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/tokens.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/seq_lens.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/seq_block_ids.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_0.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_1.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_2.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_3.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_4.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_5.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_6.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_128_stride_32_tp8/cs_f16_shard_7.npy \
  --benchmark_repetitions=10 \
  | tee "$benchmark_128_prefill"
echo "iree-compile decode: "
# Compile a separate vmfb for decode. Fix: quote path expansions (SC2086).
# NOTE(review): unlike the prefill compiles, this one omits
# --iree-codegen-enable-default-tuning-specs=true — confirm that is intentional.
iree-compile \
  "$mlir_path_128" \
  --iree-hip-target=gfx942 \
  -o="$vmfb_128_decode" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-stream-affinity-solver-max-iterations=1024
echo "iree-benchmark-module decode: "
# Benchmark decode_bs4 (128-token KV state) across 8 HIP devices; log via tee.
# Fix: quote $vmfb_128_decode and the tee target (SC2086).
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_128_decode" \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank0.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank1.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank2.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank3.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank4.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank5.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank6.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank7.irpa \
  --device=hip://0 \
  --device=hip://1 \
  --device=hip://2 \
  --device=hip://3 \
  --device=hip://4 \
  --device=hip://5 \
  --device=hip://6 \
  --device=hip://7 \
  --function=decode_bs4 \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/next_tokens.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/seq_lens.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/start_positions.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/seq_block_ids.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_0.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_1.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_2.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_3.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_4.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_5.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_6.npy \
  --input=@/shark-dev/8b/decode_args_bs4_128_stride_32_tp8/cs_f16_shard_7.npy \
  --benchmark_repetitions=10 \
  | tee "$benchmark_128_decode"
###########################################2048###############################################################################################
# Repro via pytest:
# pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 \
# --iree-device=hip://4 -k testBenchmark8b_f16_TP8_Non_Decomposed_Input_Len_2048
# All 2048-token artifacts share one date-tagged basename; only the suffix differs.
out_2048="/sharedfile/f16/2048/8b/tp8/out/f16_iree$iree_day.shark$shark_day"
mlir_path_2048="$out_2048.mlir"
config_2048="$out_2048.json"
vmfb_2048="$out_2048.prefill.vmfb"
vmfb_2048_decode="$out_2048.decode.vmfb"
benchmark_2048_prefill="$out_2048.prefill.txt"
benchmark_2048_decode="$out_2048.decode.txt"
# Print the 2048-token artifact paths and make sure the benchmark logs exist.
# Fix: quote every expansion (SC2086); also echo the decode vmfb path for
# consistency with the 128-token section, which prints it.
echo "$mlir_path_2048"
echo "$config_2048"
echo "$vmfb_2048"
echo "$vmfb_2048_decode"
echo "$benchmark_2048_prefill"
create_file_if_not_exists "$benchmark_2048_prefill"
echo "$benchmark_2048_decode"
create_file_if_not_exists "$benchmark_2048_decode"
echo "python3 : "
# Export the paged LLM to MLIR + runtime config for the 2048-token case.
# Fix: quote path expansions (SC2086); one flag per line to match the
# 128-token invocation.
python3 -m sharktank.examples.export_paged_llm_v1 \
  --irpa-file="$irpa_path" \
  --output-mlir="$mlir_path_2048" \
  --output-config="$config_2048" \
  --bs-prefill=4 \
  --bs-decode=4 \
  --block-seq-stride=32 \
  --attention-dtype=float16 \
  --activation-dtype=float16 \
  --tensor-parallelism-size=8 \
  --pipeline-parallelism-size=1 \
  --attention-kernel=torch
echo "iree-compile : "
# Compile the 2048-token MLIR into a prefill vmfb.
# BUG FIX: the original wrote `-o=$vmfb_2048\` with no space before the
# line-continuation backslash, gluing the next flag onto the output filename
# (one argument `-o=...vmfb--iree-hal-target-device=hip[0]`). Also quote
# path expansions (SC2086).
iree-compile \
  "$mlir_path_2048" \
  --iree-hip-target=gfx942 \
  -o="$vmfb_2048" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-codegen-enable-default-tuning-specs=true \
  --iree-stream-affinity-solver-max-iterations=1024
echo "iree-benchmark-module : "
# Benchmark prefill_bs4 (2048 tokens) across 8 HIP devices; log via tee.
# Fixes: quote $vmfb_2048 and the tee target (SC2086); add
# --benchmark_repetitions=10 for consistency with the other three
# benchmark invocations in this script.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048" \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank0.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank1.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank2.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank3.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank4.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank5.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank6.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank7.irpa \
  --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 \
  --function=prefill_bs4 \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/tokens.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/seq_lens.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/seq_block_ids.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_0.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_1.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_2.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_3.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_4.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_5.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_6.npy \
  --input=@/shark-dev/8b/prefill_args_bs4_2048_stride_32_tp8/cs_f16_shard_7.npy \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_prefill"
echo "iree-compile decode: "
# Compile a separate vmfb for 2048-token decode.
# BUG FIX: the original wrote `-o=$vmfb_2048_decode\` with no space before the
# line-continuation backslash, gluing the next flag onto the output filename.
# Also quote path expansions (SC2086).
# NOTE(review): unlike the prefill compiles, this one omits
# --iree-codegen-enable-default-tuning-specs=true — confirm that is intentional.
iree-compile \
  "$mlir_path_2048" \
  --iree-hip-target=gfx942 \
  -o="$vmfb_2048_decode" \
  --iree-hal-target-device="hip[0]" \
  --iree-hal-target-device="hip[1]" \
  --iree-hal-target-device="hip[2]" \
  --iree-hal-target-device="hip[3]" \
  --iree-hal-target-device="hip[4]" \
  --iree-hal-target-device="hip[5]" \
  --iree-hal-target-device="hip[6]" \
  --iree-hal-target-device="hip[7]" \
  --iree-opt-level=O3 \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-stream-affinity-solver-max-iterations=1024
echo "iree-benchmark-module decode: "
# Benchmark decode_bs4 (2048-token KV state) across 8 HIP devices; log via tee.
# Fix: quote $vmfb_2048_decode and the tee target (SC2086).
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048_decode" \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank0.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank1.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank2.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank3.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank4.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank5.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank6.irpa \
  --parameters=model=/shark-dev/8b/instruct/weights/tp8/llama3.1_8b_instruct_fp16_tp8.rank7.irpa \
  --device=hip://0 --device=hip://1 --device=hip://2 --device=hip://3 --device=hip://4 --device=hip://5 --device=hip://6 --device=hip://7 \
  --function=decode_bs4 \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/next_tokens.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/seq_lens.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/start_positions.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/seq_block_ids.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_0.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_1.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_2.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_3.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_4.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_5.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_6.npy \
  --input=@/shark-dev/8b/decode_args_bs4_2048_stride_32_tp8/cs_f16_shard_7.npy \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_decode"
# (GitHub gist page footer removed: "Sign up for free to join this conversation on GitHub.")