Last active
November 12, 2025 03:45
-
-
Save wseaton/afabee7bf8ac599d0e6fb8a22361fb13 to your computer and use it in GitHub Desktop.
DeepEP Low-Latency Test on OpenShift with SR-IOV RoCE - Working Configuration
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Auto-generated by Hermes (RDMA Cluster Analyzer) | |
| # https://github.com/llm-d-incubation/hermes | |
| # | |
| # Configuration: | |
| # workload: deepep-lowlatency-test | |
| # namespace: llm-test | |
| # image: ghcr.io/llm-d/llm-d-cuda-dev:sha-d58731d@sha256:ba067a81b28546650a5496c3093a21b249c3f0c60d0d305ddcd1907e632e6edd | |
| # sriov_network: roce-p2 | |
| # gpus_per_node: 1 | |
| # rdma_resource_type: rdma/roce_gdr | |
| # ucx_tls: rc,tcp,cuda_copy,cuda_ipc | |
| # | |
| # Selected nodes: | |
| # server_node: pokprod-b93r38s2 | |
| # client_node: pokprod-b93r39s1 | |
| # selection_reason: Optimal OpenShift same-topology: RoCE GPU Direct RDMA within 'ip-range-192.168.98.30-39' (cache score: 3) | |
| # | |
| --- | |
# ConfigMap holding the shell scripts used by the test Jobs. Each key under
# `data` is one script file: diagnostics.sh, master-entrypoint.sh and
# worker-entrypoint.sh. Both Jobs mount this ConfigMap at /opt/deepep-test.
| apiVersion: v1 | |
| kind: ConfigMap | |
| metadata: | |
| name: deepep-lowlatency-script-7662562c | |
| labels: | |
| app: deepep-lowlatency-test | |
# test-id is the same suffix used in every resource name in this manifest
| test-id: "7662562c" | |
| data: | |
| diagnostics.sh: |- | |
| #!/bin/bash | |
| # shared RDMA diagnostics script | |
| # returns the RDMA device name that corresponds to SR-IOV interface | |
| # uses SRIOV_INTERFACE env var (defaults to net1) | |
#######################################
# Print the name of the RDMA device that shares its backing device with the
# SR-IOV network interface.
# Globals:
#   SRIOV_INTERFACE (read) - interface name, used when $1 is empty/unset
# Arguments:
#   $1 - optional interface name (default: $SRIOV_INTERFACE, then net1)
#   $2 - optional sysfs root (default: /sys)
# Outputs:
#   the matching RDMA device name on stdout
# Returns:
#   0 when a device was found, 1 otherwise
#######################################
get_sriov_rdma_device() {
  local iface="${1:-${SRIOV_INTERFACE:-net1}}"
  local sysfs_root="${2:-/sys}"
  local net_link="$sysfs_root/class/net/$iface/device"
  local iface_pci ib_dir ib_pci

  # An interface without a "device" link has nothing to compare against.
  # `readlink -f` would still succeed on the missing last component and yield
  # the literal name "device", which could falsely match an RDMA entry that
  # also lacks the link, so bail out explicitly.
  [ -e "$net_link" ] || return 1

  # Declaration and assignment are kept separate so a readlink failure is seen.
  iface_pci=$(readlink -f "$net_link" 2>/dev/null) || return 1
  iface_pci=${iface_pci##*/}
  [ -n "$iface_pci" ] || return 1

  for ib_dir in "$sysfs_root"/class/infiniband/*; do
    # Also skips the unexpanded glob when the directory is empty or missing.
    [ -e "$ib_dir/device" ] || continue
    ib_pci=$(readlink -f "$ib_dir/device" 2>/dev/null) || continue
    if [ "${ib_pci##*/}" = "$iface_pci" ]; then
      echo "${ib_dir##*/}"
      return 0
    fi
  done

  return 1
}
#######################################
# Print a best-effort snapshot of network, RDMA and GPU state to stdout.
# Purely informational: every probe is guarded so that a missing tool or an
# empty match cannot terminate a caller that runs under `set -e`.
# Globals:
#   SRIOV_INTERFACE (read) - SR-IOV interface name, defaults to net1
# Arguments:
#   none
# Outputs:
#   human-readable diagnostics on stdout
#######################################
print_rdma_diagnostics() {
  # This file is sourced, so everything is local to keep the caller's
  # namespace clean.
  local iface="${SRIOV_INTERFACE:-net1}"
  local dev dev_name dev_pci gid i netdev netif ifname ibdev pci iface_pci

  echo ""
  echo "=== RDMA Device Diagnostics ==="
  echo "Network interfaces:"
  # grep returns 1 when nothing matches; that must not abort the caller
  ip addr show | grep -E "^[0-9]+:|inet " || true
  echo ""
  echo "InfiniBand/RoCE devices:"
  ls -la /sys/class/infiniband/ || echo "No IB devices found"
  echo ""
  echo "RDMA device details:"
  for dev in /sys/class/infiniband/*; do
    [ -d "$dev" ] || continue
    dev_name=${dev##*/}
    echo "Device: $dev_name"
    # show GID table for port 1
    if [ -d "$dev/ports/1/gids" ]; then
      echo " GID table (port 1):"
      for i in {0..5}; do
        if [ -f "$dev/ports/1/gids/$i" ]; then
          gid=$(cat "$dev/ports/1/gids/$i" 2>/dev/null || echo "error")
          echo " [$i] $gid"
        fi
      done
    fi
    # try to find associated network interface (first entry only)
    netdev=""
    for netif in "$dev"/device/net/*; do
      [ -e "$netif" ] || continue
      netdev=${netif##*/}
      break
    done
    if [ -n "$netdev" ]; then
      echo " Network interface: $netdev"
      ip addr show "$netdev" | grep -E "inet |link/ether" || true
    fi
    echo ""
  done

  echo "NCCL environment variables:"
  env | grep NCCL || echo "No NCCL env vars set"
  echo ""
  echo "Network interface to RDMA device mapping:"
  if command -v ibdev2netdev &> /dev/null; then
    ibdev2netdev || true
  else
    echo "ibdev2netdev not available, using sysfs mapping:"
    for netif in /sys/class/net/*; do
      ifname=${netif##*/}
      # -d matches a real directory as well as a symlink to one; the earlier
      # -L test only matched a symlink, so this fallback stayed silent when
      # the entry is a plain directory.
      if [ -d "$netif/device/infiniband" ]; then
        ibdev=$(ls "$netif/device/infiniband" 2>/dev/null | head -1)
        pci=$(readlink -f "$netif/device" 2>/dev/null) || pci=""
        echo " $ifname -> $ibdev (PCI: ${pci##*/})"
      fi
    done
  fi

  echo ""
  echo "GPU topology:"
  nvidia-smi topo -m 2>/dev/null || echo "GPU topology not available"

  echo ""
  echo "Checking SR-IOV interface ($iface):"
  if [ -d "/sys/class/net/$iface" ]; then
    echo " $iface exists"
    # Only resolve the link when it exists: `readlink -f` on a missing last
    # component would return the literal name "device" instead of an address.
    iface_pci=""
    if [ -e "/sys/class/net/$iface/device" ]; then
      iface_pci=$(readlink -f "/sys/class/net/$iface/device" 2>/dev/null) || iface_pci=""
      iface_pci=${iface_pci##*/}
    fi
    echo " PCI address: $iface_pci"
    # find matching RDMA device(s)
    for dev in /sys/class/infiniband/*; do
      [ -e "$dev/device" ] || continue
      dev_name=${dev##*/}
      dev_pci=$(readlink -f "$dev/device" 2>/dev/null) || continue
      dev_pci=${dev_pci##*/}
      if [ -n "$iface_pci" ] && [ "$dev_pci" = "$iface_pci" ]; then
        echo " Matching RDMA device: $dev_name"
      fi
    done
  else
    echo " $iface not found!"
  fi
  echo "=== End Diagnostics ==="
  echo ""
}
| master-entrypoint.sh: |- | |
#!/bin/bash
# Entrypoint of the MASTER pod for the DeepEP low-latency test.
#
# Environment read: NODE_NAME, TEST_ID, SRIOV_INTERFACE (optional, default net1).
# Steps: report GPUs, print RDMA diagnostics, pin NVSHMEM/NCCL to the RDMA
# device behind the SR-IOV interface, fetch DeepEP v1.2.1 and run
# tests/test_low_latency.py. The script exits with the Python test's status.
set -e

echo "Starting DeepEP low latency test (MASTER) on node ${NODE_NAME}"
echo "GPU information:"
nvidia-smi -L
GPU_COUNT=$(nvidia-smi -L | wc -l)
echo "Detected $GPU_COUNT GPUs"

# source and run diagnostics
source /opt/deepep-test/diagnostics.sh
print_rdma_diagnostics

# detect and configure RDMA device for nvshmem and NCCL
# get_sriov_rdma_device returns 1 when nothing matches. Without `|| true`,
# `set -e` would end the script on this assignment and the warning branch
# below could never run.
RDMA_DEVICE=$(get_sriov_rdma_device) || true
if [ -n "$RDMA_DEVICE" ]; then
  export NVSHMEM_HCA_LIST="$RDMA_DEVICE"
  export NCCL_IB_HCA="$RDMA_DEVICE"
  echo "Configured NVSHMEM_HCA_LIST and NCCL_IB_HCA to use RDMA device: $RDMA_DEVICE (interface: ${SRIOV_INTERFACE:-net1})"
else
  echo "WARNING: Could not detect RDMA device for SR-IOV interface ${SRIOV_INTERFACE:-net1}, RDMA operations may fail"
fi

echo "Cloning DeepEP repository..."
cd /tmp
# Only skip the clone when a checkout is really present; any other clone
# failure (network, auth, disk) now stops the script with git's own error
# instead of being reported as "already exists".
if [ -d DeepEP/.git ]; then
  echo "Repository already exists"
else
  git clone https://github.com/deepseek-ai/DeepEP
fi
cd DeepEP
git checkout v1.2.1

TOTAL_GPUS=$((GPU_COUNT * 2))
echo "Running DeepEP low latency test with $TOTAL_GPUS total GPUs (rank 0-$((GPU_COUNT-1)))"

# NOTE(review): how WORLD_SIZE and RANK are interpreted is decided by the
# DeepEP test script, not here - confirm before using more than one GPU per node.
export MASTER_ADDR="deepep-lowlatency-master-${TEST_ID}"
export MASTER_PORT=29500
export WORLD_SIZE=$TOTAL_GPUS
export RANK=0
export PYTHONUNBUFFERED=1

echo "Starting Python test script..."
echo "Command: python tests/test_low_latency.py --num-processes $GPU_COUNT --num-tokens 128 --hidden 2048 --num-topk 4 --num-experts 32"
# pipefail is deliberately not enabled: the pipeline's status is tee's, so
# `set -e` does not fire here and the Python status can be read from PIPESTATUS.
python -u tests/test_low_latency.py --num-processes "$GPU_COUNT" --num-tokens 128 --hidden 2048 --num-topk 4 --num-experts 32 2>&1 | tee /tmp/test_output.log
TEST_EXIT_CODE=${PIPESTATUS[0]}

echo "Python test exited with code: $TEST_EXIT_CODE"
if [ "$TEST_EXIT_CODE" -eq 0 ]; then
  echo "DeepEP low latency test completed successfully"
else
  echo "DeepEP low latency test FAILED with exit code $TEST_EXIT_CODE"
  echo "Last 50 lines of output:"
  tail -50 /tmp/test_output.log
  exit "$TEST_EXIT_CODE"
fi
| worker-entrypoint.sh: |- | |
#!/bin/bash
# Entrypoint of the WORKER pod for the DeepEP low-latency test.
#
# Environment read: NODE_NAME, TEST_ID, SRIOV_INTERFACE (optional, default net1).
# Steps: report GPUs, print RDMA diagnostics, pin NVSHMEM/NCCL to the RDMA
# device behind the SR-IOV interface, fetch DeepEP v1.2.1, wait until the
# master service name resolves, then run tests/test_low_latency.py.
# The script exits with the Python test's status.
set -e

echo "Starting DeepEP low latency test (WORKER) on node ${NODE_NAME}"
echo "GPU information:"
nvidia-smi -L
GPU_COUNT=$(nvidia-smi -L | wc -l)
echo "Detected $GPU_COUNT GPUs"

# source and run diagnostics
source /opt/deepep-test/diagnostics.sh
print_rdma_diagnostics

# detect and configure RDMA device for nvshmem and NCCL
# get_sriov_rdma_device returns 1 when nothing matches. Without `|| true`,
# `set -e` would end the script on this assignment and the warning branch
# below could never run.
RDMA_DEVICE=$(get_sriov_rdma_device) || true
if [ -n "$RDMA_DEVICE" ]; then
  export NVSHMEM_HCA_LIST="$RDMA_DEVICE"
  export NCCL_IB_HCA="$RDMA_DEVICE"
  echo "Configured NVSHMEM_HCA_LIST and NCCL_IB_HCA to use RDMA device: $RDMA_DEVICE (interface: ${SRIOV_INTERFACE:-net1})"
else
  echo "WARNING: Could not detect RDMA device for SR-IOV interface ${SRIOV_INTERFACE:-net1}, RDMA operations may fail"
fi

echo "Cloning DeepEP repository..."
cd /tmp
# Only skip the clone when a checkout is really present; any other clone
# failure (network, auth, disk) now stops the script with git's own error
# instead of being reported as "already exists".
if [ -d DeepEP/.git ]; then
  echo "Repository already exists"
else
  git clone https://github.com/deepseek-ai/DeepEP
fi
cd DeepEP
git checkout v1.2.1

echo "Waiting for master to be ready..."
# Poll until the master service name resolves.
until getent hosts "deepep-lowlatency-master-${TEST_ID}"; do
  echo "Waiting for master service..."
  sleep 2
done
# Fixed pause after the name resolves, before starting the test.
sleep 5

TOTAL_GPUS=$((GPU_COUNT * 2))
echo "Running DeepEP low latency test with $TOTAL_GPUS total GPUs (rank $GPU_COUNT-$((TOTAL_GPUS-1)))"

# NOTE(review): how WORLD_SIZE and RANK are interpreted is decided by the
# DeepEP test script, not here - confirm before using more than one GPU per node.
export MASTER_ADDR="deepep-lowlatency-master-${TEST_ID}"
export MASTER_PORT=29500
export WORLD_SIZE=$TOTAL_GPUS
export RANK=$GPU_COUNT
export PYTHONUNBUFFERED=1

echo "Starting Python test script..."
echo "Command: python tests/test_low_latency.py --num-processes $GPU_COUNT --num-tokens 128 --hidden 2048 --num-topk 4 --num-experts 32"
# pipefail is deliberately not enabled: the pipeline's status is tee's, so
# `set -e` does not fire here and the Python status can be read from PIPESTATUS.
python -u tests/test_low_latency.py --num-processes "$GPU_COUNT" --num-tokens 128 --hidden 2048 --num-topk 4 --num-experts 32 2>&1 | tee /tmp/test_output.log
TEST_EXIT_CODE=${PIPESTATUS[0]}

echo "Python test exited with code: $TEST_EXIT_CODE"
if [ "$TEST_EXIT_CODE" -eq 0 ]; then
  echo "DeepEP low latency test completed successfully"
else
  echo "DeepEP low latency test FAILED with exit code $TEST_EXIT_CODE"
  echo "Last 50 lines of output:"
  tail -50 /tmp/test_output.log
  exit "$TEST_EXIT_CODE"
fi
| --- | |
# Service in front of the master pod. Its name is the value the entrypoint
# scripts export as MASTER_ADDR (deepep-lowlatency-master-${TEST_ID}) and the
# name the worker polls with `getent hosts` before starting.
| apiVersion: v1 | |
| kind: Service | |
| metadata: | |
| name: deepep-lowlatency-master-7662562c | |
| labels: | |
| app: deepep-lowlatency-test | |
| role: master | |
| test-id: "7662562c" | |
| spec: | |
# clusterIP None makes this a headless Service (no virtual IP is allocated)
| clusterIP: None | |
# same three labels the master Job sets on its pod template
| selector: | |
| app: deepep-lowlatency-test | |
| role: master | |
| test-id: "7662562c" | |
| ports: | |
# same port number the entrypoint scripts export as MASTER_PORT
| - port: 29500 | |
| name: dist | |
| --- | |
# Job that runs master-entrypoint.sh in a single pod pinned to the node listed
# as server_node in the header of this manifest.
| apiVersion: batch/v1 | |
| kind: Job | |
| metadata: | |
| name: deepep-lowlatency-master-7662562c | |
| labels: | |
| app: deepep-lowlatency-test | |
| role: master | |
| test-id: "7662562c" | |
| spec: | |
| template: | |
| metadata: | |
# these pod labels are what the Service selector matches
| labels: | |
| app: deepep-lowlatency-test | |
| role: master | |
| test-id: "7662562c" | |
| annotations: | |
# requests the additional network "roce-p2", exposed in the pod as interface net1
| k8s.v1.cni.cncf.io/networks: '[{"name": "roce-p2", "interface": "net1"}]' | |
| spec: | |
| restartPolicy: Never | |
| nodeSelector: | |
| kubernetes.io/hostname: "pokprod-b93r38s2" | |
| containers: | |
| - name: deepep-lowlatency-master | |
| image: ghcr.io/llm-d/llm-d-cuda-dev:sha-d58731d@sha256:ba067a81b28546650a5496c3093a21b249c3f0c60d0d305ddcd1907e632e6edd | |
# the script comes from the ConfigMap volume mounted at /opt/deepep-test
| command: ["/bin/bash", "/opt/deepep-test/master-entrypoint.sh"] | |
| resources: | |
| requests: | |
| rdma/roce_gdr: "1" | |
| nvidia.com/gpu: "1" | |
| memory: 8Gi | |
| cpu: "4" | |
| limits: | |
| rdma/roce_gdr: "1" | |
| nvidia.com/gpu: "1" | |
| memory: 16Gi | |
| cpu: "8" | |
| env: | |
# TEST_ID is used by the entrypoint script to build MASTER_ADDR
| - name: TEST_ID | |
| value: "7662562c" | |
# POD_NAME, NODE_NAME and POD_IP are filled in from the pod's own fields
| - name: POD_NAME | |
| valueFrom: | |
| fieldRef: | |
| fieldPath: metadata.name | |
| - name: NODE_NAME | |
| valueFrom: | |
| fieldRef: | |
| fieldPath: spec.nodeName | |
| - name: POD_IP | |
| valueFrom: | |
| fieldRef: | |
| fieldPath: status.podIP | |
| - name: UCX_LOG_LEVEL | |
| value: "info" | |
# NOTE(review): empty value presumably disables UCX's own signal handling - confirm
| - name: UCX_ERROR_SIGNALS | |
| value: "" | |
| - name: UCX_TLS | |
| value: "rc,tcp,cuda_copy,cuda_ipc" | |
# read by diagnostics.sh to locate the RDMA device; matches the interface
# name in the network annotation above
| - name: SRIOV_INTERFACE | |
| value: "net1" | |
| - name: NCCL_SOCKET_IFNAME | |
| value: "net1" | |
| - name: NCCL_DEBUG | |
| value: "INFO" | |
| volumeMounts: | |
| - name: deepep-test-script | |
| mountPath: /opt/deepep-test | |
| readOnly: true | |
| - name: dshm | |
| mountPath: /dev/shm | |
| volumes: | |
| - name: deepep-test-script | |
| configMap: | |
| name: deepep-lowlatency-script-7662562c | |
# 493 is decimal for octal 0755
| defaultMode: 493 | |
# memory-backed emptyDir, mounted at /dev/shm by the container above
| - name: dshm | |
| emptyDir: | |
| medium: Memory | |
| sizeLimit: 16Gi | |
| --- | |
# Job that runs worker-entrypoint.sh in a single pod pinned to the node listed
# as client_node in the header of this manifest.
| apiVersion: batch/v1 | |
| kind: Job | |
| metadata: | |
| name: deepep-lowlatency-worker-7662562c | |
| labels: | |
| app: deepep-lowlatency-test | |
| role: worker | |
| test-id: "7662562c" | |
| spec: | |
| template: | |
| metadata: | |
# role: worker keeps this pod out of the master Service's selector
| labels: | |
| app: deepep-lowlatency-test | |
| role: worker | |
| test-id: "7662562c" | |
| annotations: | |
# requests the additional network "roce-p2", exposed in the pod as interface net1
| k8s.v1.cni.cncf.io/networks: '[{"name": "roce-p2", "interface": "net1"}]' | |
| spec: | |
| restartPolicy: Never | |
| nodeSelector: | |
| kubernetes.io/hostname: "pokprod-b93r39s1" | |
| containers: | |
| - name: deepep-lowlatency-worker | |
| image: ghcr.io/llm-d/llm-d-cuda-dev:sha-d58731d@sha256:ba067a81b28546650a5496c3093a21b249c3f0c60d0d305ddcd1907e632e6edd | |
# the script comes from the ConfigMap volume mounted at /opt/deepep-test
| command: ["/bin/bash", "/opt/deepep-test/worker-entrypoint.sh"] | |
| resources: | |
| requests: | |
| rdma/roce_gdr: "1" | |
| nvidia.com/gpu: "1" | |
| memory: 8Gi | |
| cpu: "4" | |
| limits: | |
| rdma/roce_gdr: "1" | |
| nvidia.com/gpu: "1" | |
| memory: 16Gi | |
| cpu: "8" | |
| env: | |
# TEST_ID is used by the entrypoint script to build the master's host name
| - name: TEST_ID | |
| value: "7662562c" | |
# POD_NAME, NODE_NAME and POD_IP are filled in from the pod's own fields
| - name: POD_NAME | |
| valueFrom: | |
| fieldRef: | |
| fieldPath: metadata.name | |
| - name: NODE_NAME | |
| valueFrom: | |
| fieldRef: | |
| fieldPath: spec.nodeName | |
| - name: POD_IP | |
| valueFrom: | |
| fieldRef: | |
| fieldPath: status.podIP | |
| - name: UCX_LOG_LEVEL | |
| value: "info" | |
# NOTE(review): empty value presumably disables UCX's own signal handling - confirm
| - name: UCX_ERROR_SIGNALS | |
| value: "" | |
| - name: UCX_TLS | |
| value: "rc,tcp,cuda_copy,cuda_ipc" | |
# read by diagnostics.sh to locate the RDMA device; matches the interface
# name in the network annotation above
| - name: SRIOV_INTERFACE | |
| value: "net1" | |
| - name: NCCL_SOCKET_IFNAME | |
| value: "net1" | |
| - name: NCCL_DEBUG | |
| value: "INFO" | |
| volumeMounts: | |
| - name: deepep-test-script | |
| mountPath: /opt/deepep-test | |
| readOnly: true | |
| - name: dshm | |
| mountPath: /dev/shm | |
| volumes: | |
| - name: deepep-test-script | |
| configMap: | |
| name: deepep-lowlatency-script-7662562c | |
# 493 is decimal for octal 0755
| defaultMode: 493 | |
# memory-backed emptyDir, mounted at /dev/shm by the container above
| - name: dshm | |
| emptyDir: | |
| medium: Memory | |
| sizeLimit: 16Gi |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment