deepseek-r1.yaml
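An AIBrix RayClusterFleet that serves DeepSeek-R1 671B with vLLM (tensor-parallel 8 × pipeline-parallel 2) on a two-node Ray cluster, 8 GPUs and 8 RDMA NICs per pod, on Volcengine VKE. The ray.io/overwrite-container-cmd: "true" annotation tells KubeRay to leave the container command alone and instead expose its generated ray start invocation as $KUBERAY_GEN_RAY_START_CMD, which the head and worker args run explicitly before (on the head) launching vllm serve. RDMA-vs-TCP benchmark results are in the comment below.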
apiVersion: orchestration.aibrix.ai/v1alpha1
kind: RayClusterFleet
metadata:
  labels:
    app.kubernetes.io/name: aibrix
    model.aibrix.ai/name: deepseek-r1-671b
    model.aibrix.ai/port: "8000"
  name: deepseek-r1-671b
spec:
  replicas: 1
  selector:
    matchLabels:
      model.aibrix.ai/name: deepseek-r1-671b
      model.aibrix.ai/port: "8000"
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      labels:
        model.aibrix.ai/name: deepseek-r1-671b
        model.aibrix.ai/port: "8000"
      annotations:
        ray.io/overwrite-container-cmd: "true"
    spec:
      rayVersion: '2.40.0'
      headGroupSpec:
        rayStartParams:
          dashboard-host: '0.0.0.0'
          block: 'false'
        template:
          metadata:
            labels:
              model.aibrix.ai/name: deepseek-r1-671b
              model.aibrix.ai/port: "8000"
            annotations:
              k8s.volcengine.com/pod-networks: |
                [
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}}
                ]
          spec:
            initContainers:
            - name: init-model
              image: aibrix-cn-beijing.cr.volces.com/aibrix/runtime:v0.3.0-rc.1
              command:
              - aibrix_download
              - --model-uri
              - tos://aibrix-artifact-testing/models/DeepSeek-R1/
              - --local-dir
              - /models/
              env:
              - name: DOWNLOADER_MODEL_NAME
                value: deepseek-r1
              - name: DOWNLOADER_NUM_THREADS
                value: "16"
              - name: DOWNLOADER_ALLOW_FILE_SUFFIX
                value: json, safetensors, py
              - name: TOS_ACCESS_KEY
                valueFrom:
                  secretKeyRef:
                    name: tos-credential
                    key: TOS_ACCESS_KEY
              - name: TOS_SECRET_KEY
                valueFrom:
                  secretKeyRef:
                    name: tos-credential
                    key: TOS_SECRET_KEY
              - name: TOS_ENDPOINT
                value: https://tos-s3-cn-beijing.ivolces.com
              - name: TOS_REGION
                value: cn-beijing
              volumeMounts:
              - mountPath: /models
                name: models
            containers:
            - name: ray-head
              image: aibrix-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.7.3.self.post1
              ports:
              - containerPort: 6379
                name: gcs-server
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
              - containerPort: 8000
                name: service
              command: ["/bin/bash", "-lc", "--"]
              args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve /models/DeepSeek-R1 --trust-remote-code --served-model-name deepseek-r1-671b --tensor-parallel-size 8 --pipeline-parallel-size 2 --distributed-executor-backend ray --uvicorn-log-level warning"]
              env:
              # Pin Gloo/NCCL bootstrap and socket traffic to the pod's primary interface.
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              # "0" keeps InfiniBand/RoCE transport enabled for NCCL.
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_DEBUG
                value: INFO
              # The eight mlx5 RDMA NICs (port 1 on each) that NCCL may use.
              - name: NCCL_IB_HCA
                value: mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1
              # GID table index used for RoCE on this fabric.
              - name: NCCL_IB_GID_INDEX
                value: "7"
              resources:
                limits:
                  nvidia.com/gpu: 8
                  vke.volcengine.com/rdma: "8"
                requests:
                  nvidia.com/gpu: 8
                  vke.volcengine.com/rdma: "8"
              securityContext:
                capabilities:
                  add:
                  - IPC_LOCK
              startupProbe:
                httpGet:
                  path: /metrics
                  port: service
                initialDelaySeconds: 180
                failureThreshold: 150
                periodSeconds: 10
              terminationMessagePath: /dev/termination-log
              terminationMessagePolicy: File
              volumeMounts:
              - mountPath: /dev/shm
                name: shared-mem
              - mountPath: /models
                name: models
            volumes:
            - name: shared-mem
              emptyDir:
                medium: Memory
            - name: models
              hostPath:
                path: /mnt/nvme0/models
                type: DirectoryOrCreate
      workerGroupSpecs:
      - replicas: 1
        minReplicas: 1
        maxReplicas: 1
        groupName: worker-group
        rayStartParams: {}
        template:
          metadata:
            labels:
              model.aibrix.ai/name: deepseek-r1-671b
              model.aibrix.ai/port: "8000"
            annotations:
              k8s.volcengine.com/pod-networks: |
                [
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}},
                  {"cniConf":{"name":"rdma"}}
                ]
          spec:
            initContainers:
            - name: init-model
              image: aibrix-cn-beijing.cr.volces.com/aibrix/runtime:v0.3.0-rc.1
              command:
              - aibrix_download
              - --model-uri
              - tos://aibrix-artifact-testing/models/DeepSeek-R1/
              - --local-dir
              - /models/
              env:
              - name: DOWNLOADER_MODEL_NAME
                value: deepseek-r1
              - name: DOWNLOADER_NUM_THREADS
                value: "16"
              - name: DOWNLOADER_ALLOW_FILE_SUFFIX
                value: json, safetensors, py
              - name: TOS_ACCESS_KEY
                valueFrom:
                  secretKeyRef:
                    name: tos-credential
                    key: TOS_ACCESS_KEY
              - name: TOS_SECRET_KEY
                valueFrom:
                  secretKeyRef:
                    name: tos-credential
                    key: TOS_SECRET_KEY
              - name: TOS_ENDPOINT
                value: https://tos-s3-cn-beijing.ivolces.com
              - name: TOS_REGION
                value: cn-beijing
              volumeMounts:
              - mountPath: /models
                name: models
            containers:
            - name: ray-worker
              image: aibrix-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.7.3.self.post1
              command: ["/bin/bash", "-lc", "--"]
args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD;"]
              env:
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_DEBUG
                value: INFO
              - name: NCCL_IB_HCA
                value: mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1
              - name: NCCL_IB_GID_INDEX
                value: "7"
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              resources:
                limits:
                  nvidia.com/gpu: 8
                  vke.volcengine.com/rdma: "8"
                requests:
                  nvidia.com/gpu: 8
                  vke.volcengine.com/rdma: "8"
              securityContext:
                capabilities:
                  add:
                  - IPC_LOCK
              terminationMessagePath: /dev/termination-log
              terminationMessagePolicy: File
              volumeMounts:
              - mountPath: /dev/shm
                name: shared-mem
              - mountPath: /models
                name: models
            volumes:
            - name: shared-mem
              emptyDir:
                medium: Memory
            - name: models
              hostPath:
                path: /mnt/nvme0/models
                type: DirectoryOrCreate
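
For reference, a minimal way to deploy and smoke-test this manifest; this is a sketch, not part of the gist. It assumes the AIBrix operator and CRDs are installed and reaches the head pod via port-forward (a real setup would more likely route through the AIBrix gateway); the ray.io/node-type=head label is set by KubeRay.

kubectl apply -f deepseek-r1.yaml

# Pick the head pod (head and worker both carry the model labels,
# so also filter on KubeRay's node-type label), then forward vLLM's port
# (8000, per the model.aibrix.ai/port label).
HEAD_POD=$(kubectl get pods \
  -l model.aibrix.ai/name=deepseek-r1-671b,ray.io/node-type=head \
  -o jsonpath='{.items[0].metadata.name}')
kubectl port-forward "pod/${HEAD_POD}" 8000:8000 &

# Smoke-test the OpenAI-compatible endpoint with the served model name.
curl -s http://localhost:8000/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "deepseek-r1-671b", "prompt": "Hello", "max_tokens": 16}'

Model download and engine startup take a while; the startupProbe tolerates roughly 28 minutes (180 s initial delay + 150 probes × 10 s), so wait for the pod to report Ready before testing.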
Jeffwan commented May 18, 2025

Experiment 1: RDMA - TP16 - 14:38
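
All four runs use the same load: 100 random prompts (2000 input / 200 output tokens), Poisson arrivals at 0.3 req/s, and a goodput SLO of TTFT ≤ 5000 ms and TPOT ≤ 50 ms, as in the invocation below.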

python benchmark_serving.py \
    --backend vllm \
    --model deepseek-ai/DeepSeek-R1 \
    --served-model-name deepseek-r1-671b \
    --base-url http://115.190.25.67:80 \
    --endpoint /v1/completions \
    --num-prompts 100 \
    --request-rate 0.3 \
    --metric_percentiles '50,90,95,99' \
    --goodput ttft:5000 tpot:50 \
    --max-concurrency 200 \
    --random-input-len 2000 \
    --random-output-len 200 \
    --dataset-name random \
    --ignore-eos \
    --seed 61
WARNING 05-18 14:34:04 _custom_ops.py:19] Failed to import from vllm._C with ModuleNotFoundError("No module named 'vllm._C'")
INFO 05-18 14:34:04 importing.py:10] Triton not installed; certain GPU-related functions will not be available.
Namespace(backend='vllm', base_url='http://115.190.25.67:80', host='127.0.0.1', port=8000, endpoint='/v1/completions', dataset_name='random', dataset_path=None, max_concurrency=200, model='deepseek-ai/DeepSeek-R1', tokenizer=None, use_beam_search=False, num_prompts=100, logprobs=None, request_rate=0.3, burstiness=1.0, seed=61, trust_remote_code=False, disable_tqdm=False, profile=False, save_result=False, save_detailed=False, append_result=False, metadata=None, result_dir=None, result_filename=None, ignore_eos=True, percentile_metrics='ttft,tpot,itl', metric_percentiles='50,90,95,99', goodput=['ttft:5000', 'tpot:50'], sonnet_input_len=550, sonnet_output_len=150, sonnet_prefix_len=200, sharegpt_output_len=None, random_input_len=2000, random_output_len=200, random_range_ratio=0.0, random_prefix_len=0, hf_subset=None, hf_split=None, hf_output_len=None, top_p=None, top_k=None, min_p=None, temperature=None, tokenizer_mode='auto', served_model_name='deepseek-r1-671b', lora_modules=None)
Starting initial single prompt test run...
Initial test run completed. Starting main benchmark run...
Traffic request rate: 0.3
Burstiness factor: 1.0 (Poisson process)
Maximum request concurrency: 200
  0%|                                                                                                               | 0/100 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
        - Avoid using `tokenizers` before the fork if possible
        - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [06:11<00:00,  3.71s/it]
============ Serving Benchmark Result ============
Successful requests:                     100
Benchmark duration (s):                  371.03
Total input tokens:                      199900
Total generated tokens:                  20000
Request throughput (req/s):              0.27
Request goodput (req/s):                 0.11
Output token throughput (tok/s):         53.90
Total Token throughput (tok/s):          592.68
---------------Time to First Token----------------
Mean TTFT (ms):                          926.04
Median TTFT (ms):                        913.89
P50 TTFT (ms):                           913.89
P90 TTFT (ms):                           1035.27
P95 TTFT (ms):                           1127.15
P99 TTFT (ms):                           1327.41
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          54.51
Median TPOT (ms):                        54.05
P50 TPOT (ms):                           54.05
P90 TPOT (ms):                           62.45
P95 TPOT (ms):                           70.66
P99 TPOT (ms):                           72.27
---------------Inter-token Latency----------------
Mean ITL (ms):                           54.51
Median ITL (ms):                         48.02
P50 ITL (ms):                            48.02
P90 ITL (ms):                            61.22
P95 ITL (ms):                            64.72
P99 ITL (ms):                            232.41
==================================================

Experiment 2: TCP - TP16 - 14:47

============ Serving Benchmark Result ============
Successful requests:                     61
Benchmark duration (s):                  377.88
Total input tokens:                      121939
Total generated tokens:                  12200
Request throughput (req/s):              0.16
Request goodput (req/s):                 0.00
Output token throughput (tok/s):         32.29
Total Token throughput (tok/s):          354.98
---------------Time to First Token----------------
Mean TTFT (ms):                          6188.16
Median TTFT (ms):                        5335.39
P50 TTFT (ms):                           5335.39
P90 TTFT (ms):                           10000.83
P95 TTFT (ms):                           12187.98
P99 TTFT (ms):                           12686.11
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          277.54
Median TPOT (ms):                        243.22
P50 TPOT (ms):                           243.22
P90 TPOT (ms):                           504.25
P95 TPOT (ms):                           539.18
P99 TPOT (ms):                           566.43
---------------Inter-token Latency----------------
Mean ITL (ms):                           277.54
Median ITL (ms):                         114.27
P50 ITL (ms):                            114.27
P90 ITL (ms):                            152.42
P95 ITL (ms):                            175.06
P99 ITL (ms):                            3192.99
==================================================

Experiment 3: RDMA - TP8 PP2 - 15:00

============ Serving Benchmark Result ============
Successful requests:                     100
Benchmark duration (s):                  371.94
Total input tokens:                      199900
Total generated tokens:                  20000
Request throughput (req/s):              0.27
Request goodput (req/s):                 0.00
Output token throughput (tok/s):         53.77
Total Token throughput (tok/s):          591.22
---------------Time to First Token----------------
Mean TTFT (ms):                          1022.03
Median TTFT (ms):                        991.88
P50 TTFT (ms):                           991.88
P90 TTFT (ms):                           1175.18
P95 TTFT (ms):                           1398.70
P99 TTFT (ms):                           1536.46
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          64.18
Median TPOT (ms):                        62.81
P50 TPOT (ms):                           62.81
P90 TPOT (ms):                           76.85
P95 TPOT (ms):                           85.85
P99 TPOT (ms):                           88.47
---------------Inter-token Latency----------------
Mean ITL (ms):                           64.18
Median ITL (ms):                         57.22
P50 ITL (ms):                            57.22
P90 ITL (ms):                            75.06
P95 ITL (ms):                            79.11
P99 ITL (ms):                            327.63
==================================================

Experiment 4: TCP - TP8 PP2 - 15:15

============ Serving Benchmark Result ============
Successful requests:                     100
Benchmark duration (s):                  372.04
Total input tokens:                      199900
Total generated tokens:                  20000
Request throughput (req/s):              0.27
Request goodput (req/s):                 0.00
Output token throughput (tok/s):         53.76
Total Token throughput (tok/s):          591.06
---------------Time to First Token----------------
Mean TTFT (ms):                          1053.42
Median TTFT (ms):                        1016.55
P50 TTFT (ms):                           1016.55
P90 TTFT (ms):                           1263.98
P95 TTFT (ms):                           1446.10
P99 TTFT (ms):                           1550.35
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          64.51
Median TPOT (ms):                        63.04
P50 TPOT (ms):                           63.04
P90 TPOT (ms):                           77.17
P95 TPOT (ms):                           85.35
P99 TPOT (ms):                           87.65
---------------Inter-token Latency----------------
Mean ITL (ms):                           64.51
Median ITL (ms):                         56.79
P50 ITL (ms):                            56.79
P90 ITL (ms):                            74.82
P95 ITL (ms):                            78.80
P99 ITL (ms):                            352.15
==================================================
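
Summary (numbers copied from the runs above):

| Exp | Transport | Parallelism | Successful req | Req/s | Goodput (req/s) | Mean TTFT (ms) | Mean TPOT (ms) | P99 ITL (ms) |
|-----|-----------|-------------|----------------|-------|-----------------|----------------|----------------|--------------|
| 1 | RDMA | TP16 | 100 | 0.27 | 0.11 | 926.04 | 54.51 | 232.41 |
| 2 | TCP | TP16 | 61 | 0.16 | 0.00 | 6188.16 | 277.54 | 3192.99 |
| 3 | RDMA | TP8 PP2 | 100 | 0.27 | 0.00 | 1022.03 | 64.18 | 327.63 |
| 4 | TCP | TP8 PP2 | 100 | 0.27 | 0.00 | 1053.42 | 64.51 | 352.15 |

The pattern matches the communication volume of each layout: with TP16 every layer's all-reduce crosses the node boundary, so falling back to TCP collapses the run (only 61/100 requests completed, TPOT ~5× worse); with TP8 + PP2 the only cross-node traffic is the activations handed between pipeline stages, so TCP lands within a few percent of RDMA. RDMA with TP16 still gives the lowest TTFT and TPOT overall, and is the only configuration with nonzero goodput under the 5 s TTFT / 50 ms TPOT SLO.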
