# A docker image with the vLLM server installed.
# Image used for every container below.
DOCKER_IMAGE=docker.io/fxnlabs/vllm-openai
export DOCKER_IMAGE
# Uncomment to use the upstream image instead:
# export DOCKER_IMAGE=docker.io/vllm/vllm-openai
# Start the Ray head node container, pinned to GPU 0.
# FIX: '--gpus 0' asks docker for a COUNT of zero GPUs, not device 0 —
# the 'device=' syntax is required to select a specific GPU.
# FIX: use the $DOCKER_IMAGE exported above instead of a hardcoded image,
# so the head and worker run the same image.
docker run \
  --entrypoint /bin/bash \
  --network host \
  --ipc=host \
  --name vllm-ray-head \
  --gpus '"device=0"' \
  -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
  "$DOCKER_IMAGE" \
  -c 'ray start --head --port 6379 --node-ip-address 192.168.196.97 --block --num-gpus 1'
# Start the Ray worker container and join it to the head at 192.168.196.97.
# FIX: '--gpus 1' means "any ONE GPU" (a count), which would typically grab
# GPU 0 and collide with the head container when both run on the same host;
# pin device 1 explicitly — NOTE(review): confirm the worker really runs on
# the same host; on a separate host 'device=0' may be intended.
# FIX: quote "$DOCKER_IMAGE" (unquoted expansion).
docker run \
  --entrypoint /bin/bash \
  --network host \
  --ipc=host \
  --name vllm-ray-worker \
  --gpus '"device=1"' \
  -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
  "$DOCKER_IMAGE" \
  -c "ray start --address='192.168.196.97:6379' --block"
# Launch the OpenAI-compatible API server inside the running head container,
# logging to server.log.
# FIX: dropped '-it' — an allocated TTY inserts carriage returns into the
# redirected log and '-i' holds stdin open; neither is wanted when output
# goes to a file rather than an interactive terminal.
docker exec vllm-ray-head bash -c 'python3 -m vllm.entrypoints.openai.api_server \
  --host 0.0.0.0 \
  --pipeline-parallel-size 2 \
  --port 8080 \
  --model neuralmagic/Meta-Llama-3-8B-Instruct-FP8
' > server.log 2>&1
# Single-command variant: run the API server directly in a fresh container,
# logging to server.log.
# BUG FIX: the original passed TWO image arguments (a hardcoded image on one
# line and "$DOCKER_IMAGE" on the next); docker used the first as the image
# and handed the second to bash, which then tried to execute the image name
# as a script. Keep exactly one (quoted) image reference.
# FIX: '--gpus 0' is a count of zero GPUs — use 'device=0' to select GPU 0.
# FIX: dropped '-it' since output is redirected to a file, not a terminal.
docker run --rm \
  --runtime=nvidia \
  --network host \
  --ipc=host \
  --gpus '"device=0"' \
  --entrypoint /bin/bash \
  -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
  "$DOCKER_IMAGE" \
  -c 'python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --pipeline-parallel-size 2 --port 8080 --model neuralmagic/Meta-Llama-3-8B-Instruct-FP8' > server.log 2>&1