Note: while this setup also works with a plain RayCluster, using a RayJob is much easier because the RayJob controller creates the cluster and submits the serve command for you.
FROM vllm/vllm-openai:v0.5.2

# wget is required by the KubeRay-generated health/readiness probes.
# Use apt-get (not apt) in scripts, skip recommended packages, and clean the
# apt lists in the same layer so the cache is not baked into the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
      wget \
    && rm -rf /var/lib/apt/lists/*

# ray[default] ships the dashboard/agent components the healthcheck talks to.
# Pin to 2.32.0 to match the rayVersion declared in the RayJob manifest, and
# avoid caching wheels in the image layer.
RUN pip3 install --no-cache-dir "ray[default]==2.32.0"

# Reset the inherited ENTRYPOINT: the base vllm-openai image's entrypoint is
# the vLLM startup script, which would prevent KubeRay from launching
# `ray start` in this container.
# NOTE: use [] (empty array), not [""] — [""] makes the runtime exec an empty
# string as the command, which fails.
ENTRYPOINT []
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: vllm-server
spec:
  # The job entrypoint runs on the head pod; --tensor-parallel-size 4 matches
  # the 4 GPUs available across the cluster (1 head + 3 workers, 1 GPU each).
  entrypoint: vllm serve meta-llama/Llama-2-70b-chat-hf --tensor-parallel-size 4 --distributed-executor-backend ray
  runtimeEnvYAML: |
    pip:
      - requests
    env_vars:
      # NOTE(review): placeholder only — do not commit a real token; prefer a
      # Kubernetes Secret projected into the pod instead of plaintext YAML.
      HF_TOKEN: "xxxxxxxxx"
  # rayClusterSpec specifies the RayCluster instance to be created by the RayJob controller.
  rayClusterSpec:
    rayVersion: '2.32.0'
    headGroupSpec:
      rayStartParams:
        dashboard-host: '0.0.0.0'
      template:
        spec:
          containers:
            - name: ray-head
              image: seedjeffwan/vllm-openai:v0.5.2-distributed
              # Expose the vLLM OpenAI-compatible API so the head service can
              # route requests to port 8000.
              ports:
                - containerPort: 8000
                  name: serve
              resources:
                limits:
                  cpu: "8"
                  nvidia.com/gpu: "1"
                requests:
                  cpu: "8"
                  nvidia.com/gpu: "1"
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
                # Shared HF cache on the node avoids re-downloading model weights.
                - mountPath: /root/.cache/huggingface
                  name: models
          volumes:
            - name: ray-logs
              emptyDir: {}
            - name: models
              hostPath:
                path: /root/.cache/huggingface
                type: DirectoryOrCreate
    workerGroupSpecs:
      # Fixed-size group (min == max == replicas): no autoscaling.
      - replicas: 3
        minReplicas: 3
        maxReplicas: 3
        groupName: small-group
        rayStartParams: {}
        template:
          spec:
            containers:
              - name: ray-worker
                image: seedjeffwan/vllm-openai:v0.5.2-distributed
                volumeMounts:
                  - mountPath: /tmp/ray
                    name: ray-logs
                  - mountPath: /root/.cache/huggingface
                    name: models
                resources:
                  limits:
                    cpu: "8"
                    nvidia.com/gpu: "1"
                  requests:
                    cpu: "8"
                    nvidia.com/gpu: "1"
            volumes:
              - name: ray-logs
                emptyDir: {}
              - name: models
                hostPath:
                  path: /root/.cache/huggingface
                  type: DirectoryOrCreate
Before sending requests, ensure the RayCluster head service exposes port 8000 (the vLLM OpenAI-compatible API port) from the head container; then test with:
# Query the OpenAI-compatible /v1/completions endpoint on the head service.
# NOTE: the service name contains a generated suffix (here "kf6cq") — look up
# the actual head service name with `kubectl get svc` in your cluster.
# The "model" field must match the model the server was started with
# (meta-llama/Llama-2-70b-chat-hf in the manifest above), otherwise the
# server returns a model-not-found error.
curl http://vllm-server-raycluster-kf6cq-head-svc.default.svc.cluster.local:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "meta-llama/Llama-2-70b-chat-hf",
        "prompt": "San Francisco is a",
        "max_tokens": 128,
        "temperature": 0
      }'
This walkthrough should let you set up and test distributed vLLM inference with a KubeRay RayJob end to end.