# DeepEP Low-Latency Test on OpenShift with SR-IOV RoCE - Working Configuration
# Source gist: https://gist.github.com/wseaton/afabee7bf8ac599d0e6fb8a22361fb13
# Author: @wseaton — last active November 12, 2025
# Auto-generated by Hermes (RDMA Cluster Analyzer)
# https://github.com/llm-d-incubation/hermes
#
# Configuration:
# workload: deepep-lowlatency-test
# namespace: llm-test
# image: ghcr.io/llm-d/llm-d-cuda-dev:sha-d58731d@sha256:ba067a81b28546650a5496c3093a21b249c3f0c60d0d305ddcd1907e632e6edd
# sriov_network: roce-p2
# gpus_per_node: 1
# rdma_resource_type: rdma/roce_gdr
# ucx_tls: rc,tcp,cuda_copy,cuda_ipc
#
# Selected nodes:
# server_node: pokprod-b93r38s2
# client_node: pokprod-b93r39s1
# selection_reason: Optimal OpenShift same-topology: RoCE GPU Direct RDMA within 'ip-range-192.168.98.30-39' (cache score: 3)
#
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: deepep-lowlatency-script-7662562c
  labels:
    app: deepep-lowlatency-test
    test-id: "7662562c"
data:
  # Shared diagnostics library sourced by both entrypoints.
  diagnostics.sh: |-
#!/bin/bash
# shared RDMA diagnostics script
# returns the RDMA device name that corresponds to SR-IOV interface
# uses SRIOV_INTERFACE env var (defaults to net1)
get_sriov_rdma_device() {
local iface="${SRIOV_INTERFACE:-net1}"
if [ -d "/sys/class/net/$iface" ]; then
local iface_pci=$(readlink -f "/sys/class/net/$iface/device" 2>/dev/null | xargs basename)
for dev in /sys/class/infiniband/*; do
if [ -d "$dev" ]; then
dev_name=$(basename "$dev")
dev_pci=$(readlink -f "$dev/device" 2>/dev/null | xargs basename)
if [ "$dev_pci" = "$iface_pci" ]; then
echo "$dev_name"
return 0
fi
fi
done
fi
return 1
}
print_rdma_diagnostics() {
echo ""
echo "=== RDMA Device Diagnostics ==="
echo "Network interfaces:"
ip addr show | grep -E "^[0-9]+:|inet "
echo ""
echo "InfiniBand/RoCE devices:"
ls -la /sys/class/infiniband/ || echo "No IB devices found"
echo ""
echo "RDMA device details:"
for dev in /sys/class/infiniband/*; do
if [ -d "$dev" ]; then
dev_name=$(basename "$dev")
echo "Device: $dev_name"
# show GID table for port 1
if [ -d "$dev/ports/1/gids" ]; then
echo " GID table (port 1):"
for i in {0..5}; do
if [ -f "$dev/ports/1/gids/$i" ]; then
gid=$(cat "$dev/ports/1/gids/$i" 2>/dev/null || echo "error")
echo " [$i] $gid"
fi
done
fi
# try to find associated network interface
if [ -d "$dev/device/net" ]; then
netdev=$(ls "$dev/device/net" 2>/dev/null | head -1)
if [ -n "$netdev" ]; then
echo " Network interface: $netdev"
ip addr show "$netdev" | grep -E "inet |link/ether"
fi
fi
echo ""
fi
done
echo "NCCL environment variables:"
env | grep NCCL || echo "No NCCL env vars set"
echo ""
echo "Network interface to RDMA device mapping:"
if command -v ibdev2netdev &> /dev/null; then
ibdev2netdev
else
echo "ibdev2netdev not available, using sysfs mapping:"
for netif in /sys/class/net/*; do
ifname=$(basename "$netif")
if [ -L "$netif/device/infiniband" ]; then
ibdev=$(ls "$netif/device/infiniband" 2>/dev/null | head -1)
pci=$(readlink -f "$netif/device" | xargs basename)
echo " $ifname -> $ibdev (PCI: $pci)"
fi
done
fi
echo ""
echo "GPU topology:"
nvidia-smi topo -m 2>/dev/null || echo "GPU topology not available"
local iface="${SRIOV_INTERFACE:-net1}"
echo ""
echo "Checking SR-IOV interface ($iface):"
if [ -d "/sys/class/net/$iface" ]; then
echo " $iface exists"
local iface_pci=$(readlink -f "/sys/class/net/$iface/device" 2>/dev/null | xargs basename)
echo " PCI address: $iface_pci"
# find matching RDMA device
for dev in /sys/class/infiniband/*; do
if [ -d "$dev" ]; then
dev_name=$(basename "$dev")
dev_pci=$(readlink -f "$dev/device" 2>/dev/null | xargs basename)
if [ "$dev_pci" = "$iface_pci" ]; then
echo " Matching RDMA device: $dev_name"
fi
fi
done
else
echo " $iface not found!"
fi
echo "=== End Diagnostics ==="
echo ""
}
master-entrypoint.sh: |-
#!/bin/bash
set -e
echo "Starting DeepEP low latency test (MASTER) on node ${NODE_NAME}"
echo "GPU information:"
nvidia-smi -L
GPU_COUNT=$(nvidia-smi -L | wc -l)
echo "Detected $GPU_COUNT GPUs"
# source and run diagnostics
source /opt/deepep-test/diagnostics.sh
print_rdma_diagnostics
# detect and configure RDMA device for nvshmem and NCCL
RDMA_DEVICE=$(get_sriov_rdma_device)
if [ -n "$RDMA_DEVICE" ]; then
export NVSHMEM_HCA_LIST="$RDMA_DEVICE"
export NCCL_IB_HCA="$RDMA_DEVICE"
echo "Configured NVSHMEM_HCA_LIST and NCCL_IB_HCA to use RDMA device: $RDMA_DEVICE (interface: ${SRIOV_INTERFACE:-net1})"
else
echo "WARNING: Could not detect RDMA device for SR-IOV interface ${SRIOV_INTERFACE:-net1}, RDMA operations may fail"
fi
echo "Cloning DeepEP repository..."
cd /tmp
git clone https://github.com/deepseek-ai/DeepEP || echo "Repository already exists"
cd DeepEP
git checkout v1.2.1
TOTAL_GPUS=$((GPU_COUNT * 2))
echo "Running DeepEP low latency test with $TOTAL_GPUS total GPUs (rank 0-$((GPU_COUNT-1)))"
export MASTER_ADDR=deepep-lowlatency-master-${TEST_ID}
export MASTER_PORT=29500
export WORLD_SIZE=$TOTAL_GPUS
export RANK=0
export PYTHONUNBUFFERED=1
echo "Starting Python test script..."
echo "Command: python tests/test_low_latency.py --num-processes $GPU_COUNT --num-tokens 128 --hidden 2048 --num-topk 4 --num-experts 32"
python -u tests/test_low_latency.py --num-processes "$GPU_COUNT" --num-tokens 128 --hidden 2048 --num-topk 4 --num-experts 32 2>&1 | tee /tmp/test_output.log
TEST_EXIT_CODE=${PIPESTATUS[0]}
echo "Python test exited with code: $TEST_EXIT_CODE"
if [ $TEST_EXIT_CODE -eq 0 ]; then
echo "DeepEP low latency test completed successfully"
else
echo "DeepEP low latency test FAILED with exit code $TEST_EXIT_CODE"
echo "Last 50 lines of output:"
tail -50 /tmp/test_output.log
exit $TEST_EXIT_CODE
fi
worker-entrypoint.sh: |-
#!/bin/bash
set -e
echo "Starting DeepEP low latency test (WORKER) on node ${NODE_NAME}"
echo "GPU information:"
nvidia-smi -L
GPU_COUNT=$(nvidia-smi -L | wc -l)
echo "Detected $GPU_COUNT GPUs"
# source and run diagnostics
source /opt/deepep-test/diagnostics.sh
print_rdma_diagnostics
# detect and configure RDMA device for nvshmem and NCCL
RDMA_DEVICE=$(get_sriov_rdma_device)
if [ -n "$RDMA_DEVICE" ]; then
export NVSHMEM_HCA_LIST="$RDMA_DEVICE"
export NCCL_IB_HCA="$RDMA_DEVICE"
echo "Configured NVSHMEM_HCA_LIST and NCCL_IB_HCA to use RDMA device: $RDMA_DEVICE (interface: ${SRIOV_INTERFACE:-net1})"
else
echo "WARNING: Could not detect RDMA device for SR-IOV interface ${SRIOV_INTERFACE:-net1}, RDMA operations may fail"
fi
echo "Cloning DeepEP repository..."
cd /tmp
git clone https://github.com/deepseek-ai/DeepEP || echo "Repository already exists"
cd DeepEP
git checkout v1.2.1
echo "Waiting for master to be ready..."
until getent hosts "deepep-lowlatency-master-${TEST_ID}"; do
echo "Waiting for master service..."
sleep 2
done
sleep 5
TOTAL_GPUS=$((GPU_COUNT * 2))
echo "Running DeepEP low latency test with $TOTAL_GPUS total GPUs (rank $GPU_COUNT-$((TOTAL_GPUS-1)))"
export MASTER_ADDR=deepep-lowlatency-master-${TEST_ID}
export MASTER_PORT=29500
export WORLD_SIZE=$TOTAL_GPUS
export RANK=$GPU_COUNT
export PYTHONUNBUFFERED=1
echo "Starting Python test script..."
echo "Command: python tests/test_low_latency.py --num-processes $GPU_COUNT --num-tokens 128 --hidden 2048 --num-topk 4 --num-experts 32"
python -u tests/test_low_latency.py --num-processes "$GPU_COUNT" --num-tokens 128 --hidden 2048 --num-topk 4 --num-experts 32 2>&1 | tee /tmp/test_output.log
TEST_EXIT_CODE=${PIPESTATUS[0]}
echo "Python test exited with code: $TEST_EXIT_CODE"
if [ $TEST_EXIT_CODE -eq 0 ]; then
echo "DeepEP low latency test completed successfully"
else
echo "DeepEP low latency test FAILED with exit code $TEST_EXIT_CODE"
echo "Last 50 lines of output:"
tail -50 /tmp/test_output.log
exit $TEST_EXIT_CODE
fi
---
apiVersion: v1
kind: Service
metadata:
  name: deepep-lowlatency-master-7662562c
  labels:
    app: deepep-lowlatency-test
    role: master
    test-id: "7662562c"
spec:
  # headless: the worker resolves the master pod directly by this DNS name
  clusterIP: None
  selector:
    app: deepep-lowlatency-test
    role: master
    test-id: "7662562c"
  ports:
    - port: 29500
      name: dist
---
apiVersion: batch/v1
kind: Job
metadata:
  name: deepep-lowlatency-master-7662562c
  labels:
    app: deepep-lowlatency-test
    role: master
    test-id: "7662562c"
spec:
  template:
    metadata:
      labels:
        app: deepep-lowlatency-test
        role: master
        test-id: "7662562c"
      annotations:
        # attach the SR-IOV RoCE network as secondary interface "net1"
        k8s.v1.cni.cncf.io/networks: '[{"name": "roce-p2", "interface": "net1"}]'
    spec:
      restartPolicy: Never
      nodeSelector:
        kubernetes.io/hostname: "pokprod-b93r38s2"
      containers:
        - name: deepep-lowlatency-master
          image: ghcr.io/llm-d/llm-d-cuda-dev:sha-d58731d@sha256:ba067a81b28546650a5496c3093a21b249c3f0c60d0d305ddcd1907e632e6edd
          command: ["/bin/bash", "/opt/deepep-test/master-entrypoint.sh"]
          resources:
            requests:
              rdma/roce_gdr: "1"
              nvidia.com/gpu: "1"
              memory: 8Gi
              cpu: "4"
            limits:
              rdma/roce_gdr: "1"
              nvidia.com/gpu: "1"
              memory: 16Gi
              cpu: "8"
          env:
            - name: TEST_ID
              value: "7662562c"
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            - name: UCX_LOG_LEVEL
              value: "info"
            # empty value disables UCX signal interception
            - name: UCX_ERROR_SIGNALS
              value: ""
            - name: UCX_TLS
              value: "rc,tcp,cuda_copy,cuda_ipc"
            - name: SRIOV_INTERFACE
              value: "net1"
            - name: NCCL_SOCKET_IFNAME
              value: "net1"
            - name: NCCL_DEBUG
              value: "INFO"
          volumeMounts:
            - name: deepep-test-script
              mountPath: /opt/deepep-test
              readOnly: true
            - name: dshm
              mountPath: /dev/shm
      volumes:
        - name: deepep-test-script
          configMap:
            name: deepep-lowlatency-script-7662562c
            # 493 decimal == 0755 — scripts must be executable
            defaultMode: 493
        # oversized /dev/shm for NCCL/NVSHMEM shared-memory transports
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: 16Gi
---
apiVersion: batch/v1
kind: Job
metadata:
  name: deepep-lowlatency-worker-7662562c
  labels:
    app: deepep-lowlatency-test
    role: worker
    test-id: "7662562c"
spec:
  template:
    metadata:
      labels:
        app: deepep-lowlatency-test
        role: worker
        test-id: "7662562c"
      annotations:
        # attach the SR-IOV RoCE network as secondary interface "net1"
        k8s.v1.cni.cncf.io/networks: '[{"name": "roce-p2", "interface": "net1"}]'
    spec:
      restartPolicy: Never
      nodeSelector:
        kubernetes.io/hostname: "pokprod-b93r39s1"
      containers:
        - name: deepep-lowlatency-worker
          image: ghcr.io/llm-d/llm-d-cuda-dev:sha-d58731d@sha256:ba067a81b28546650a5496c3093a21b249c3f0c60d0d305ddcd1907e632e6edd
          command: ["/bin/bash", "/opt/deepep-test/worker-entrypoint.sh"]
          resources:
            requests:
              rdma/roce_gdr: "1"
              nvidia.com/gpu: "1"
              memory: 8Gi
              cpu: "4"
            limits:
              rdma/roce_gdr: "1"
              nvidia.com/gpu: "1"
              memory: 16Gi
              cpu: "8"
          env:
            - name: TEST_ID
              value: "7662562c"
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            - name: UCX_LOG_LEVEL
              value: "info"
            # empty value disables UCX signal interception
            - name: UCX_ERROR_SIGNALS
              value: ""
            - name: UCX_TLS
              value: "rc,tcp,cuda_copy,cuda_ipc"
            - name: SRIOV_INTERFACE
              value: "net1"
            - name: NCCL_SOCKET_IFNAME
              value: "net1"
            - name: NCCL_DEBUG
              value: "INFO"
          volumeMounts:
            - name: deepep-test-script
              mountPath: /opt/deepep-test
              readOnly: true
            - name: dshm
              mountPath: /dev/shm
      volumes:
        - name: deepep-test-script
          configMap:
            name: deepep-lowlatency-script-7662562c
            # 493 decimal == 0755 — scripts must be executable
            defaultMode: 493
        # oversized /dev/shm for NCCL/NVSHMEM shared-memory transports
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: 16Gi
# (GitHub gist comment footer removed)