Created
November 12, 2025 18:54
-
-
Save wseaton/c2a7fb0848a0845a9a5858606f5518b7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Auto-generated by Hermes (RDMA Cluster Analyzer) | |
| # https://github.com/llm-d-incubation/hermes | |
| # | |
| # Configuration: | |
| # workload: deepep-lowlatency-test | |
| # namespace: llm-test | |
| # image: ghcr.io/llm-d/llm-d-cuda-dev:sha-d58731d@sha256:ba067a81b28546650a5496c3093a21b249c3f0c60d0d305ddcd1907e632e6edd | |
| # sriov_network: roce-p2 | |
| # gpus_per_node: 1 | |
| # rdma_resource_type: rdma/roce_gdr | |
| # ucx_tls: rc,tcp,cuda_copy,cuda_ipc | |
| # | |
| # Selected nodes: | |
| # server_node: pokprod-b93r43s1 | |
| # client_node: pokprod-b93r44s0 | |
| # selection_reason: Optimal OpenShift same-topology: RoCE GPU Direct RDMA within 'ip-range-192.168.98.40-49' (cache score: 3) | |
| # | |
| --- | |
| apiVersion: v1 | |
| kind: ConfigMap | |
| metadata: | |
| name: deepep-lowlatency-script-877d4780 | |
| labels: | |
| app: deepep-lowlatency-test | |
| test-id: "877d4780" | |
| data: | |
| diagnostics.sh: |- | |
| #!/bin/bash | |
| # shared RDMA diagnostics script | |
#######################################
# Print the name of the RDMA device that backs an SR-IOV network interface.
# The match is made by comparing the last path component of the directory
# that the interface's and each InfiniBand/RoCE device's sysfs `device`
# link resolves to.
# Globals:
#   SRIOV_INTERFACE  (read) interface to look up; defaults to net1
#   RDMA_SYSFS_ROOT  (read) sysfs mount point; defaults to /sys
# Outputs:
#   the matching RDMA device name on stdout
# Returns:
#   0 if a matching device was printed, 1 otherwise
#######################################
get_sriov_rdma_device() {
  local sysfs="${RDMA_SYSFS_ROOT:-/sys}"
  local iface="${SRIOV_INTERFACE:-net1}"
  local iface_link="$sysfs/class/net/$iface/device"
  local iface_pci dev dev_pci

  # `readlink -f` happily returns a path whose last component is missing, so
  # an interface without a `device` link would otherwise resolve to the
  # literal name "device" and match any RDMA device that lacks one too.
  [ -e "$iface_link" ] || return 1
  iface_pci=$(readlink -f "$iface_link" 2>/dev/null) || return 1
  iface_pci=${iface_pci##*/}
  [ -n "$iface_pci" ] || return 1

  for dev in "$sysfs"/class/infiniband/*; do
    # Also skips the unexpanded glob when no RDMA devices exist.
    [ -e "$dev/device" ] || continue
    dev_pci=$(readlink -f "$dev/device" 2>/dev/null) || continue
    if [ "${dev_pci##*/}" = "$iface_pci" ]; then
      printf '%s\n' "${dev##*/}"
      return 0
    fi
  done
  return 1
}
#######################################
# Detect the RDMA devices behind the net1 and net2 interfaces.
# For each interface, the sysfs `device` link is resolved and compared with
# the `device` link of every InfiniBand/RoCE device; the first device that
# resolves to the same directory name wins.
# Globals:
#   NET1_RDMA_DEVICE  (written) RDMA device for net1, empty if not found
#   NET2_RDMA_DEVICE  (written) RDMA device for net2, empty if not found
#   RDMA_SYSFS_ROOT   (read) sysfs mount point; defaults to /sys
# Outputs:
#   progress and warning messages on stdout
# Returns:
#   0 if both devices were found, 1 otherwise. Callers running under
#   `set -e` must test the status in a condition, or the shell exits here.
#######################################
detect_dual_rdma_devices() {
  local sysfs="${RDMA_SYSFS_ROOT:-/sys}"
  local iface iface_link iface_pci dev dev_pci found

  echo "Detecting RDMA devices for dual SR-IOV interfaces..."

  # Start from a clean slate: a value inherited from the environment or left
  # over from an earlier call must not be reported as a fresh detection.
  NET1_RDMA_DEVICE=""
  NET2_RDMA_DEVICE=""

  for iface in net1 net2; do
    iface_link="$sysfs/class/net/$iface/device"
    # Without this check `readlink -f` would return the non-existent path and
    # both sides of the comparison below could degrade to the word "device".
    [ -e "$iface_link" ] || continue
    iface_pci=$(readlink -f "$iface_link" 2>/dev/null) || continue
    iface_pci=${iface_pci##*/}
    [ -n "$iface_pci" ] || continue

    found=""
    for dev in "$sysfs"/class/infiniband/*; do
      [ -e "$dev/device" ] || continue
      dev_pci=$(readlink -f "$dev/device" 2>/dev/null) || continue
      if [ "${dev_pci##*/}" = "$iface_pci" ]; then
        found=${dev##*/}
        break
      fi
    done
    [ -n "$found" ] || continue

    case "$iface" in
      net1) NET1_RDMA_DEVICE=$found ;;
      net2) NET2_RDMA_DEVICE=$found ;;
    esac
    echo " $iface -> $found (PCI: $iface_pci)"
  done

  if [ -z "$NET1_RDMA_DEVICE" ] || [ -z "$NET2_RDMA_DEVICE" ]; then
    echo "WARNING: Could not detect both RDMA devices"
    echo " NET1_RDMA_DEVICE=${NET1_RDMA_DEVICE:-not found}"
    echo " NET2_RDMA_DEVICE=${NET2_RDMA_DEVICE:-not found}"
    return 1
  fi
  return 0
}
# Print the name of the first entry in directory $1 (nothing if it is empty
# or missing). Replaces `ls | head -1` so no command output is parsed.
_rdma_first_entry() {
  local entry
  for entry in "$1"/*; do
    [ -e "$entry" ] || continue
    printf '%s\n' "${entry##*/}"
    return 0
  done
  return 0
}

# Print the last path component of what "$1/device" resolves to; returns 1
# when the link is absent so callers never compare against a bogus value.
_rdma_pci_of() {
  local target
  [ -e "$1/device" ] || return 1
  target=$(readlink -f "$1/device" 2>/dev/null) || return 1
  [ -n "$target" ] || return 1
  printf '%s\n' "${target##*/}"
}

#######################################
# Dump a human-readable snapshot of the RDMA/network/GPU setup to stdout:
# network interfaces, RDMA devices with the first GID entries of port 1,
# NCCL environment variables, interface-to-RDMA mapping, GPU topology and
# the RDMA device matching the SR-IOV interface.
# Every probe is best-effort, so the function does not abort a caller that
# runs under `set -e` when a probe produces no output.
# Globals:
#   SRIOV_INTERFACE  (read) interface checked at the end; defaults to net1
#   RDMA_SYSFS_ROOT  (read) sysfs mount point; defaults to /sys
# Returns:
#   0 (status of the final echo)
#######################################
print_rdma_diagnostics() {
  local sysfs="${RDMA_SYSFS_ROOT:-/sys}"
  local dev dev_name dev_pci gid i netdev netif ifname ibdev pci

  echo ""
  echo "=== RDMA Device Diagnostics ==="
  echo "Network interfaces:"
  # grep exits 1 when nothing matches; that must not kill a `set -e` caller.
  ip addr show | grep -E "^[0-9]+:|inet " || true
  echo ""
  echo "InfiniBand/RoCE devices:"
  ls -la "$sysfs/class/infiniband/" || echo "No IB devices found"
  echo ""
  echo "RDMA device details:"
  for dev in "$sysfs"/class/infiniband/*; do
    [ -d "$dev" ] || continue
    dev_name=${dev##*/}
    echo "Device: $dev_name"
    # show GID table for port 1 (first six entries only)
    if [ -d "$dev/ports/1/gids" ]; then
      echo " GID table (port 1):"
      for i in 0 1 2 3 4 5; do
        if [ -f "$dev/ports/1/gids/$i" ]; then
          gid=$(cat "$dev/ports/1/gids/$i" 2>/dev/null || echo "error")
          echo " [$i] $gid"
        fi
      done
    fi
    # try to find associated network interface
    if [ -d "$dev/device/net" ]; then
      netdev=$(_rdma_first_entry "$dev/device/net")
      if [ -n "$netdev" ]; then
        echo " Network interface: $netdev"
        ip addr show "$netdev" | grep -E "inet |link/ether" || true
      fi
    fi
    echo ""
  done

  echo "NCCL environment variables:"
  env | grep NCCL || echo "No NCCL env vars set"
  echo ""
  echo "Network interface to RDMA device mapping:"
  if command -v ibdev2netdev &> /dev/null; then
    ibdev2netdev || true
  else
    echo "ibdev2netdev not available, using sysfs mapping:"
    for netif in "$sysfs"/class/net/*; do
      # -d accepts a real directory as well as a symlink to one. The former
      # `-L` test only accepted symlinks, so interfaces whose
      # device/infiniband is a plain directory were silently skipped.
      [ -d "$netif/device/infiniband" ] || continue
      ifname=${netif##*/}
      ibdev=$(_rdma_first_entry "$netif/device/infiniband")
      pci=$(_rdma_pci_of "$netif") || pci=""
      echo " $ifname -> $ibdev (PCI: $pci)"
    done
  fi
  echo ""
  echo "GPU topology:"
  nvidia-smi topo -m 2>/dev/null || echo "GPU topology not available"

  local iface="${SRIOV_INTERFACE:-net1}"
  local iface_pci
  echo ""
  echo "Checking SR-IOV interface ($iface):"
  if [ -d "$sysfs/class/net/$iface" ]; then
    echo " $iface exists"
    iface_pci=$(_rdma_pci_of "$sysfs/class/net/$iface") || iface_pci=""
    echo " PCI address: $iface_pci"
    # find matching RDMA device; skipped when the interface has no backing
    # device, because an empty address must never count as a match
    if [ -n "$iface_pci" ]; then
      for dev in "$sysfs"/class/infiniband/*; do
        dev_pci=$(_rdma_pci_of "$dev") || continue
        if [ "$dev_pci" = "$iface_pci" ]; then
          echo " Matching RDMA device: ${dev##*/}"
        fi
      done
    fi
  else
    echo " $iface not found!"
  fi
  echo "=== End Diagnostics ==="
  echo ""
}
| worker-entrypoint.sh: |- | |
#!/bin/bash
# Worker-side entrypoint for the DeepEP low latency test.
# Steps: report GPUs, print RDMA diagnostics, pin NCCL/NVSHMEM/UCX to the
# detected HCAs, fetch DeepEP v1.2.1, wait for the master service to resolve,
# then run tests/test_low_latency.py and propagate its exit code.
# Required env: TEST_ID (suffix of the master service name).
# Optional env: NODE_NAME (only used in the banner).
#
# `pipefail` is deliberately NOT enabled: the test pipeline ends in `tee` and
# its failure is reported through PIPESTATUS below instead of aborting early.
set -e

# Fail fast instead of polling DNS for a nameless service until the deadline.
: "${TEST_ID:?TEST_ID must be set to locate the master service}"
master_host="deepep-lowlatency-master-${TEST_ID}"

echo "Starting DeepEP low latency test (WORKER) on node ${NODE_NAME}"
echo "GPU information:"
nvidia-smi -L
GPU_COUNT=$(nvidia-smi -L | wc -l)
echo "Detected $GPU_COUNT GPUs"

# source and run diagnostics
source /opt/deepep-test/diagnostics.sh
print_rdma_diagnostics

# detect and configure RDMA devices - split between NCCL and NVSHMEM
# The call sits inside the condition on purpose: as a bare statement its
# non-zero status would trip `set -e` and exit before the error is printed.
if detect_dual_rdma_devices \
  && [ -n "$NET1_RDMA_DEVICE" ] && [ -n "$NET2_RDMA_DEVICE" ]; then
  # congestion reduction: NCCL on net2, NVSHMEM on both (defaults to net1)
  export NCCL_IB_HCA="$NET2_RDMA_DEVICE:1"
  export NCCL_SOCKET_IFNAME="net2"
  export NVSHMEM_HCA_LIST="$NET1_RDMA_DEVICE:1,$NET2_RDMA_DEVICE:1"
  export NVSHMEM_ENABLE_NIC_PE_MAPPING="1"
  export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME="net1"
  export UCX_NET_DEVICES="$NET1_RDMA_DEVICE:1,$NET2_RDMA_DEVICE:1"
  echo "=== Dual HCA Congestion Reduction Config ==="
  echo " NCCL pinned to: $NET2_RDMA_DEVICE:1 (net2)"
  echo " NVSHMEM has access: $NET1_RDMA_DEVICE:1,$NET2_RDMA_DEVICE:1"
  echo " NVSHMEM bootstrap: net1 (should default here)"
  echo " UCX uses: $NET1_RDMA_DEVICE:1,$NET2_RDMA_DEVICE:1"
  echo " Theory: Separate NCCL to net2, NVSHMEM defaults to net1"
  echo "=============================================="
else
  echo "ERROR: Failed to detect dual RDMA devices, cannot proceed"
  exit 1
fi

echo "Cloning DeepEP repository..."
cd /tmp
# Only skip the clone when a checkout is really there; a network or auth
# failure must surface as an error rather than be reported as "exists".
if [ -d DeepEP/.git ]; then
  echo "Repository already exists"
else
  git clone https://github.com/deepseek-ai/DeepEP
fi
cd DeepEP
git checkout v1.2.1

echo "Waiting for master to be ready..."
until getent hosts "$master_host"; do
  echo "Waiting for master service..."
  sleep 2
done
# short grace period after the name resolves, before connecting
sleep 5

TOTAL_GPUS=$((GPU_COUNT * 2))
echo "Running DeepEP low latency test with $TOTAL_GPUS total GPUs (rank $GPU_COUNT-$((TOTAL_GPUS-1)))"
export MASTER_ADDR="$master_host"
export MASTER_PORT=29500
# NOTE(review): WORLD_SIZE/RANK are set to the global GPU count and this
# node's first global rank. If the DeepEP test harness reads them as node
# count / node rank instead, these values are only right for GPU_COUNT=1 -
# confirm before running with more than one GPU per node.
export WORLD_SIZE=$TOTAL_GPUS
export RANK=$GPU_COUNT
export PYTHONUNBUFFERED=1

echo "Starting Python test script..."
echo "Command: python tests/test_low_latency.py --num-processes $GPU_COUNT --num-tokens 128 --hidden 2048 --num-topk 4 --num-experts 32"
python -u tests/test_low_latency.py --num-processes "$GPU_COUNT" --num-tokens 128 --hidden 2048 --num-topk 4 --num-experts 32 2>&1 | tee /tmp/test_output.log
# exit status of python, not of tee
TEST_EXIT_CODE=${PIPESTATUS[0]}
echo "Python test exited with code: $TEST_EXIT_CODE"
if [ "$TEST_EXIT_CODE" -eq 0 ]; then
  echo "DeepEP low latency test completed successfully"
else
  echo "DeepEP low latency test FAILED with exit code $TEST_EXIT_CODE"
  echo "Last 50 lines of output:"
  tail -50 /tmp/test_output.log
  exit "$TEST_EXIT_CODE"
fi
| master-entrypoint.sh: |- | |
#!/bin/bash
# Master-side entrypoint for the DeepEP low latency test.
# Steps: report GPUs, print RDMA diagnostics, pin NCCL/NVSHMEM/UCX to the
# detected HCAs, fetch DeepEP v1.2.1, then run tests/test_low_latency.py as
# rank 0 and propagate its exit code.
# Required env: TEST_ID (suffix of the master service name).
# Optional env: NODE_NAME (only used in the banner).
#
# `pipefail` is deliberately NOT enabled: the test pipeline ends in `tee` and
# its failure is reported through PIPESTATUS below instead of aborting early.
set -e

# Without TEST_ID the advertised MASTER_ADDR would be a dangling name.
: "${TEST_ID:?TEST_ID must be set to build the master service name}"
master_host="deepep-lowlatency-master-${TEST_ID}"

echo "Starting DeepEP low latency test (MASTER) on node ${NODE_NAME}"
echo "GPU information:"
nvidia-smi -L
GPU_COUNT=$(nvidia-smi -L | wc -l)
echo "Detected $GPU_COUNT GPUs"

# source and run diagnostics
source /opt/deepep-test/diagnostics.sh
print_rdma_diagnostics

# detect and configure RDMA devices - split between NCCL and NVSHMEM
# The call sits inside the condition on purpose: as a bare statement its
# non-zero status would trip `set -e` and exit before the error is printed.
if detect_dual_rdma_devices \
  && [ -n "$NET1_RDMA_DEVICE" ] && [ -n "$NET2_RDMA_DEVICE" ]; then
  # congestion reduction: NCCL on net2, NVSHMEM on both (defaults to net1)
  export NCCL_IB_HCA="$NET2_RDMA_DEVICE:1"
  export NCCL_SOCKET_IFNAME="net2"
  export NVSHMEM_HCA_LIST="$NET1_RDMA_DEVICE:1,$NET2_RDMA_DEVICE:1"
  export NVSHMEM_ENABLE_NIC_PE_MAPPING="1"
  export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME="net1"
  export UCX_NET_DEVICES="$NET1_RDMA_DEVICE:1,$NET2_RDMA_DEVICE:1"
  echo "=== Dual HCA Congestion Reduction Config ==="
  echo " NCCL pinned to: $NET2_RDMA_DEVICE:1 (net2)"
  echo " NVSHMEM has access: $NET1_RDMA_DEVICE:1,$NET2_RDMA_DEVICE:1"
  echo " NVSHMEM bootstrap: net1 (should default here)"
  echo " UCX uses: $NET1_RDMA_DEVICE:1,$NET2_RDMA_DEVICE:1"
  echo " Theory: Separate NCCL to net2, NVSHMEM defaults to net1"
  echo "=============================================="
else
  echo "ERROR: Failed to detect dual RDMA devices, cannot proceed"
  exit 1
fi

echo "Cloning DeepEP repository..."
cd /tmp
# Only skip the clone when a checkout is really there; a network or auth
# failure must surface as an error rather than be reported as "exists".
if [ -d DeepEP/.git ]; then
  echo "Repository already exists"
else
  git clone https://github.com/deepseek-ai/DeepEP
fi
cd DeepEP
git checkout v1.2.1

TOTAL_GPUS=$((GPU_COUNT * 2))
echo "Running DeepEP low latency test with $TOTAL_GPUS total GPUs (rank 0-$((GPU_COUNT-1)))"
export MASTER_ADDR="$master_host"
export MASTER_PORT=29500
# NOTE(review): WORLD_SIZE is set to the global GPU count. If the DeepEP test
# harness reads WORLD_SIZE/RANK as node count / node rank instead, this is
# only right for GPU_COUNT=1 - confirm before running with more than one GPU
# per node.
export WORLD_SIZE=$TOTAL_GPUS
export RANK=0
export PYTHONUNBUFFERED=1

echo "Starting Python test script..."
echo "Command: python tests/test_low_latency.py --num-processes $GPU_COUNT --num-tokens 128 --hidden 2048 --num-topk 4 --num-experts 32"
python -u tests/test_low_latency.py --num-processes "$GPU_COUNT" --num-tokens 128 --hidden 2048 --num-topk 4 --num-experts 32 2>&1 | tee /tmp/test_output.log
# exit status of python, not of tee
TEST_EXIT_CODE=${PIPESTATUS[0]}
echo "Python test exited with code: $TEST_EXIT_CODE"
if [ "$TEST_EXIT_CODE" -eq 0 ]; then
  echo "DeepEP low latency test completed successfully"
else
  echo "DeepEP low latency test FAILED with exit code $TEST_EXIT_CODE"
  echo "Last 50 lines of output:"
  tail -50 /tmp/test_output.log
  exit "$TEST_EXIT_CODE"
fi
| --- | |
# Headless Service (clusterIP: None) selecting the master pod by its labels.
# Its name is the host the entrypoint scripts export as MASTER_ADDR and that
# the worker polls with `getent hosts`; the port equals their MASTER_PORT.
apiVersion: v1
kind: Service
metadata:
  name: deepep-lowlatency-master-877d4780
  labels:
    app: deepep-lowlatency-test
    role: master
    test-id: "877d4780"
spec:
  clusterIP: None
  selector:
    app: deepep-lowlatency-test
    role: master
    test-id: "877d4780"
  ports:
    - port: 29500
      name: dist
| --- | |
# Master half of the two-node DeepEP low latency test: runs
# master-entrypoint.sh from the script ConfigMap on the selected server node.
apiVersion: batch/v1
kind: Job
metadata:
  name: deepep-lowlatency-master-877d4780
  labels:
    app: deepep-lowlatency-test
    role: master
    test-id: "877d4780"
spec:
  # whole-Job time budget, in seconds
  activeDeadlineSeconds: 600
  template:
    metadata:
      labels:
        # must match the selector of the master Service
        app: deepep-lowlatency-test
        role: master
        test-id: "877d4780"
      annotations:
        # The same network is requested twice. The entrypoint expects the two
        # attachments to show up as net1 and net2 - presumably the default
        # naming of the network plugin; confirm on the target cluster.
        k8s.v1.cni.cncf.io/networks: 'roce-p2, roce-p2'
    spec:
      restartPolicy: Never
      nodeSelector:
        kubernetes.io/hostname: "pokprod-b93r43s1"
      containers:
        - name: deepep-lowlatency-master
          image: ghcr.io/llm-d/llm-d-cuda-dev:sha-d58731d@sha256:ba067a81b28546650a5496c3093a21b249c3f0c60d0d305ddcd1907e632e6edd
          command: ["/bin/bash", "/opt/deepep-test/master-entrypoint.sh"]
          resources:
            requests:
              # one RDMA resource per requested network attachment
              rdma/roce_gdr: "2"
              nvidia.com/gpu: "1"
              memory: 8Gi
              cpu: "4"
            limits:
              rdma/roce_gdr: "2"
              nvidia.com/gpu: "1"
              memory: 16Gi
              cpu: "8"
          env:
            # suffix used by the entrypoint to build the master service name
            - name: TEST_ID
              value: "877d4780"
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            - name: UCX_LOG_LEVEL
              value: "info"
            - name: UCX_ERROR_SIGNALS
              value: ""
            - name: UCX_TLS
              value: "rc,tcp,cuda_copy,cuda_ipc"
            # interface inspected by the diagnostics script
            - name: SRIOV_INTERFACE
              value: "net1"
            # initial value only: the entrypoint re-exports this as net2 once
            # both RDMA devices have been detected
            - name: NCCL_SOCKET_IFNAME
              value: "net1"
            - name: NCCL_DEBUG
              value: "INFO"
          volumeMounts:
            - name: deepep-test-script
              mountPath: /opt/deepep-test
              readOnly: true
            - name: dshm
              mountPath: /dev/shm
      volumes:
        - name: deepep-test-script
          configMap:
            name: deepep-lowlatency-script-877d4780
            # decimal form of octal 0755 (scripts are executable)
            defaultMode: 493
        # memory-backed /dev/shm
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: 16Gi
| --- | |
# Worker half of the two-node DeepEP low latency test: runs
# worker-entrypoint.sh from the script ConfigMap on the selected client node.
# The worker script waits for the master Service name to resolve before it
# starts the test.
apiVersion: batch/v1
kind: Job
metadata:
  name: deepep-lowlatency-worker-877d4780
  labels:
    app: deepep-lowlatency-test
    role: worker
    test-id: "877d4780"
spec:
  # whole-Job time budget, in seconds
  activeDeadlineSeconds: 600
  template:
    metadata:
      labels:
        app: deepep-lowlatency-test
        role: worker
        test-id: "877d4780"
      annotations:
        # The same network is requested twice. The entrypoint expects the two
        # attachments to show up as net1 and net2 - presumably the default
        # naming of the network plugin; confirm on the target cluster.
        k8s.v1.cni.cncf.io/networks: 'roce-p2, roce-p2'
    spec:
      restartPolicy: Never
      nodeSelector:
        kubernetes.io/hostname: "pokprod-b93r44s0"
      containers:
        - name: deepep-lowlatency-worker
          image: ghcr.io/llm-d/llm-d-cuda-dev:sha-d58731d@sha256:ba067a81b28546650a5496c3093a21b249c3f0c60d0d305ddcd1907e632e6edd
          command: ["/bin/bash", "/opt/deepep-test/worker-entrypoint.sh"]
          resources:
            requests:
              # one RDMA resource per requested network attachment
              rdma/roce_gdr: "2"
              nvidia.com/gpu: "1"
              memory: 8Gi
              cpu: "4"
            limits:
              rdma/roce_gdr: "2"
              nvidia.com/gpu: "1"
              memory: 16Gi
              cpu: "8"
          env:
            # suffix used by the entrypoint to build the master service name
            - name: TEST_ID
              value: "877d4780"
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            - name: UCX_LOG_LEVEL
              value: "info"
            - name: UCX_ERROR_SIGNALS
              value: ""
            - name: UCX_TLS
              value: "rc,tcp,cuda_copy,cuda_ipc"
            # interface inspected by the diagnostics script
            - name: SRIOV_INTERFACE
              value: "net1"
            # initial value only: the entrypoint re-exports this as net2 once
            # both RDMA devices have been detected
            - name: NCCL_SOCKET_IFNAME
              value: "net1"
            - name: NCCL_DEBUG
              value: "INFO"
          volumeMounts:
            - name: deepep-test-script
              mountPath: /opt/deepep-test
              readOnly: true
            - name: dshm
              mountPath: /dev/shm
      volumes:
        - name: deepep-test-script
          configMap:
            name: deepep-lowlatency-script-877d4780
            # decimal form of octal 0755 (scripts are executable)
            defaultMode: 493
        # memory-backed /dev/shm
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: 16Gi
| ---------------------------------------- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment