Created
June 9, 2025 17:07
-
-
Save vanbasten23/5e926c6ea393f9ae0963da77995ed591 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Starts a vLLM server in the background, waits until it accepts connections,
# runs the sonnet serving benchmark against it, and then stops the server.
#
# Usage:
#   Run the file under the parent directory of the vllm directory as
#     bash run_tpu_benchmark.sh --model <model_name> --tp 1
#     bash run_tpu_benchmark.sh --model <model_name> --tp 1 --profile
#     bash run_tpu_benchmark.sh --model <model_name> --tp 4
#
# Options:
#   --model <name>  model passed to `vllm serve` and to the benchmark (required)
#   --tp <n>        value passed as --tensor-parallel-size (required)
#   --profile       export VLLM_TORCH_PROFILER_DIR=/tmp/tpu-profile and run the
#                   benchmark with --profile
#
# Exit codes:
#   1  a required option is missing, sonnet.txt is missing, or the server
#      exited before it started listening
#   2  getopt rejected the command line
#   3  an unexpected option reached the option loop
#
# Output files (written to the current directory):
#   vllm_server_out.txt     server stdout/stderr
#   vllm_benchmark_out.txt  benchmark stdout/stderr (non-profile runs only)
#
# Commonly used models:
#   meta-llama/Meta-Llama-3.1-8B-Instruct
#   neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
#   meta-llama/Llama-3.1-70B-Instruct
#   RedHatAI/Meta-Llama-3.1-70B-Instruct-quantized.w8a8
#
# Examples:
#   bash run_tpu_benchmark.sh --model RedHatAI/Meta-Llama-3.1-70B-Instruct-quantized.w8a8 --tp 8
#   bash run_tpu_benchmark.sh --model RedHatAI/Meta-Llama-3.1-70B-Instruct-quantized.w8a8 --tp 8 --profile
#   bash run_tpu_benchmark.sh --model neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 --tp 8

set -euo pipefail

readonly LONGOPTS=model:,tp:,profile
readonly SONNET_SRC=vllm/benchmarks/sonnet.txt
readonly SONNET_4X=vllm/benchmarks/sonnet_4x.txt
readonly PROFILE_DIR=/tmp/tpu-profile
readonly DEFAULT_HOST=127.0.0.1
readonly DEFAULT_PORT=8000

# Parse arguments. Only long options are accepted, so the short-option string
# is passed as an explicit empty value.
PARSED=$(getopt --options '' --longoptions "$LONGOPTS" --name "$0" -- "$@") \
  || exit 2
# getopt emits a shell-quoted argument list; `eval set --` is the documented
# way to load it back into the positional parameters.
eval set -- "$PARSED"

model=""
tp=""
profile=false

# Option parsing
while true; do
  case "$1" in
    --model)
      model="$2"
      shift 2
      ;;
    --tp)
      tp="$2"
      shift 2
      ;;
    --profile)
      profile=true
      shift
      ;;
    --)
      shift
      break
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 3
      ;;
  esac
done

if [[ -z "$model" ]]; then
  echo "Error: --model is required" >&2
  exit 1
fi
if [[ -z "$tp" ]]; then
  echo "Error: --tp is required" >&2
  exit 1
fi

if [[ "$profile" == true ]]; then
  # Start every profiling run from an empty trace directory.
  rm -rfv "${PROFILE_DIR}/"
  mkdir "$PROFILE_DIR"
  export VLLM_TORCH_PROFILER_DIR="$PROFILE_DIR"
fi

# The benchmark dataset is sonnet.txt repeated four times.
if [[ ! -e "$SONNET_4X" ]]; then
  # Check the source first so a failed run cannot leave behind an empty
  # sonnet_4x.txt that later runs would treat as valid.
  if [[ ! -f "$SONNET_SRC" ]]; then
    echo "Error: $SONNET_SRC not found; run this script from the parent directory of the vllm directory" >&2
    exit 1
  fi
  echo "Creating the data file sonnet_4x.txt"
  for _ in {1..4}; do
    cat "$SONNET_SRC"
  done > "$SONNET_4X"
else
  echo "sonnet_4x.txt exists. Skip creating the file."
fi

if [[ "$profile" == true ]]; then
  echo "Profiling is enabled"
else
  echo "Profiling is disabled"
fi

# Stops the background server pipeline. Safe to call more than once: once the
# job is gone the failing kill is deliberately ignored.
stop_server() {
  # %1 is the `vllm serve | tee` job, the only background job this script
  # starts; signalling the jobspec reaches every process in that pipeline.
  kill %1 2>/dev/null || true
}

XLA_HLO_DEBUG=1 VLLM_USE_V1=1 vllm serve "$model" \
  --disable-log-requests \
  --gpu-memory-utilization 0.98 \
  --max-num-batched-tokens 2048 \
  --max-num-seqs 128 \
  --max-model-len 2048 \
  --tensor-parallel-size "$tp" \
  --no-enable-prefix-caching 2>&1 \
  | tee vllm_server_out.txt &
# $! is the PID of the last pipeline stage (tee). tee normally exits once the
# server closes its output, so it serves as a liveness proxy for the server.
server_pid=$!

# Make sure the server does not outlive the script, whatever the exit path.
trap stop_server EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Poll the port until the server accepts connections.
until nc -zv "$DEFAULT_HOST" "$DEFAULT_PORT"; do
  if ! kill -0 "$server_pid" 2>/dev/null; then
    echo "Error: the server exited before it started listening; see vllm_server_out.txt" >&2
    exit 1
  fi
  echo "Waiting for the server to start..."
  sleep 15
done
echo "Server is up and running"

# Arguments shared by the plain and the profiling benchmark runs.
benchmark_cmd=(
  python3 vllm/benchmarks/benchmark_serving.py
  --model "$model"
  --dataset-name sonnet
  --dataset-path "$SONNET_4X"
  --sonnet-input-len 1800
  --sonnet-output-len 128
  --ignore_eos
)

if [[ "$profile" == true ]]; then
  echo "Running the benchmark with profiling..."
  XLA_HLO_DEBUG=1 "${benchmark_cmd[@]}" --profile
else
  echo "Running the benchmark..."
  XLA_HLO_DEBUG=1 "${benchmark_cmd[@]}" 2>&1 | tee vllm_benchmark_out.txt
fi

echo "All done. Killing the server..."
stop_server
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment