Hugging Face Text Generation Inference (TGI) SLURM template
#!/bin/bash
#SBATCH --job-name=llm-swarm
#SBATCH --partition=hopper-prod
#SBATCH --gpus={{gpus}}
#SBATCH --cpus-per-task=12
#SBATCH --mem-per-cpu=11G
#SBATCH -o slurm/logs/%x_%j.out

# See original source here:
# https://github.com/huggingface/llm-swarm/blob/main/templates/tgi_h100.template.slurm

# For HF cluster internal users: check whether the shared /fsx cache directory exists
if [ -d "/fsx/.cache" ]; then
  export volume="/fsx/.cache"
else
  export volume=".cache"
fi

export model={{model}}
export revision={{revision}}
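
# Pick $1 (default: 1) unused TCP port(s): take the full 1025-65535 range,
# remove the ports currently in use according to `ss`, shuffle what is left,
# and return the first N entries.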
function unused_port() {
  N=${1:-1}
  comm -23 \
    <(seq "1025" "65535" | sort) \
    <(ss -Htan |
        awk '{print $4}' |
        cut -d':' -f2 |
        sort -u) |
    shuf |
    head -n "$N"
}

export PORT=$(unused_port)

if [ -z "$HUGGING_FACE_HUB_TOKEN" ]; then
  # Fall back to the cached token file
  export HUGGING_FACE_HUB_TOKEN=$(cat ~/.cache/huggingface/token)
fi
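# (~/.cache/huggingface/token is the file typically written by huggingface-cli login)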
echo "Starting TGI container port $PORT" | |
echo "http://$(hostname -I | awk '{print $1}'):$PORT" >> {{slurm_hosts_path}} | |

# Unset cache dirs so host env vars don't leak into the pyxis container
unset HF_HUB_CACHE HF_ASSETS_CACHE HF_DATASETS_CACHE HF_MODULES_CACHE
srun --container-image='ghcr.io#huggingface/text-generation-inference' \
  --container-env=HUGGING_FACE_HUB_TOKEN,PORT \
  --container-mounts="$volume:/data" \
  --no-container-mount-home \
  --qos normal \
  /usr/local/bin/text-generation-launcher \
    --model-id "$model" \
    --revision "$revision" \
    --max-concurrent-requests 2000 \
    --max-total-tokens {{model_max_length}} \
    --max-input-length {{model_input_length}} \
    --max-batch-prefill-tokens {{model_max_length}}

echo "End of job"