Deploy with kubectl apply -f deepseek.yaml and watch the pod logs.
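To follow the logs, something like this works (a sketch, using the Deployment name from the manifest below):

kubectl logs -f deploy/deepseek-server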
Once vLLM reports that it is listening on port 8000, port-forward the Service so you can run python query.py:

kubectl port-forward svc/deepseek-r1-server 8000:8000
# deepseek.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: deepseek-server
  annotations:
    description: "Deployment for DeepSeek server"
spec:
  replicas: 1
  selector:
    matchLabels:
      app: deepseek-server
  template:
    metadata:
      labels:
        app: deepseek-server
    spec:
      containers:
      - name: vllm-server
        image: vllm/vllm-openai:latest
        env:
        - name: HF_HUB_ENABLE_HF_TRANSFER
          value: "1"
        - name: HF_HOME
          value: "/local/huggingface"
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token-secret
              key: token
        - name: MODEL_REPO
          value: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
        command: ["/bin/bash"]
        args:
        - "-c"
        - >
          vllm serve ${MODEL_REPO}
          --host 0.0.0.0
          --port 8000
          --trust-remote-code
        resources:
          limits:
            cpu: "32"
            memory: 100G
            nvidia.com/gpu: "1"
          requests:
            cpu: "16"
            memory: 30G
            nvidia.com/gpu: "1"
        securityContext:
          privileged: true
        ports:
        - containerPort: 8000
        startupProbe:
          periodSeconds: 10
          failureThreshold: 720
          httpGet:
            path: /health
            port: 8000
        volumeMounts:
        - name: local-storage
          mountPath: /local
        - name: shm
          mountPath: /dev/shm
      volumes:
      - name: local-storage
        hostPath:
          path: /root/local
          type: DirectoryOrCreate
      - name: shm
        emptyDir:
          medium: Memory
          sizeLimit: "2Gi"
---
apiVersion: v1
kind: Service
metadata:
  name: deepseek-r1-server
spec:
  selector:
    app: deepseek-server
  type: ClusterIP
  ports:
  - name: port-8000
    port: 8000
    targetPort: 8000
---
apiVersion: v1
kind: Secret
metadata:
  name: hf-token-secret
type: Opaque
data:
token: "<YOUR-HF-TOKEN-GOES-HERE>" |
#!/usr/bin/env python
# query.py
import openai

# Talk to the port-forwarded vLLM server through its OpenAI-compatible API.
client = openai.Client(base_url="http://127.0.0.1:8000/v1", api_key="EMPTY")

# List the models the server is serving
models = client.models.list()
print(models)

# Chat completion against the first (and only) served model
response = client.chat.completions.create(
    model=models.data[0].id,
    messages=[
        {"role": "user", "content": "What is Kubernetes?"},
    ],
)
print(response)
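As a quick sanity check before running the script, you can also hit the server directly through the port-forward (vLLM's OpenAI-compatible server exposes the standard /v1/models route):

curl http://127.0.0.1:8000/v1/models

The script prints the raw response object; the generated answer itself lives in response.choices[0].message.content.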