OguzPastirmaci · June 19, 2024 00:17
diff --git a/BM.GPU.H100.8-nccl-test.yaml b/BM.GPU.H100.8-nccl-test.yaml
 # Volcano batch Job wrapping an MPI-launched NCCL all_reduce_perf run.
 apiVersion: batch.volcano.sh/v1alpha1
 kind: Job
 metadata:
  name: nccl-allreduce-job0
  # Spelled out as null: the key is present but carries no annotations.
  annotations: null
 spec:
  schedulerName: volcano
  queue: default
  # NOTE(review): 0 presumably leaves the effective gang size to Volcano's
  # defaulting -- confirm against the Volcano version in use.
  minAvailable: 0
  plugins:
    # Both plugins are listed with an empty argument list.
    # NOTE(review): presumably these provide the passwordless SSH setup and
    # the /etc/volcano/*.host files the launcher script reads -- confirm.
    svc: []
    ssh: []
  # Task entries follow as a flush (non-indented) sequence.
  tasks:
  # Launcher task: one pod that runs mpirun against the hosts listed in
  # /etc/volcano/mpiworker.host. It requests no GPUs itself.
  - name: mpimaster
    replicas: 1
    policies:
    # Complete the whole Job when this task reports TaskCompleted.
    - event: TaskCompleted
      action: CompleteJob
    template:
      metadata: null
      spec:
        hostNetwork: true
        dnsPolicy: ClusterFirstWithHostNet
        restartPolicy: OnFailure
        terminationGracePeriodSeconds: 2
        volumes:
        # Host InfiniBand device nodes, exposed to the container as-is.
        - name: devinf
          hostPath:
            path: /dev/infiniband
        # Memory-backed /dev/shm, capped at 32Gi.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 32Gi
        containers:
        - name: mpimaster
          image: ord.ocir.io/hpc_limited_availability/nccl-tests:pytorch-24.02-nccl-2.20.5-1
          workingDir: /workspace
          ports:
          - name: mpijob-port
            containerPort: 2222
            protocol: TCP
          resources:
            requests:
              cpu: 4
              memory: 1Gi
              ephemeral-storage: 32Gi
            limits:
              ephemeral-storage: 32Gi
          securityContext:
            privileged: true
            capabilities:
              add:
              - IPC_LOCK
          volumeMounts:
          - name: devinf
            mountPath: /dev/infiniband
          - name: shm
            mountPath: /dev/shm
          # Launcher script, in order:
          #   1. `set -e -o pipefail` aborts on the first failing command; the
          #      SIGINT trap only sets the shell variable `exit`.
          #   2. NUM_HOSTS is the line count of the worker hostfile
          #      (`sed -n '$='` prints the number of the last line).
          #   3. NP = NUM_HOSTS * NUM_GPUS, with NUM_GPUS ranks per node.
          #   4. mpirun reaches workers over SSH port 2222 and forwards the
          #      listed variables to every rank with `-x`.
          #   5. After mpirun returns, the loop sleeps until `exit` is set,
          #      so the container stays up until it receives SIGINT.
          # The script is one backslash-continued command: do not insert
          # comment lines inside it.
          # NOTE(review): NCCL_IB_HCA is passed the value "=mlx5_0,..." (note
          # the doubled '='). NCCL documents a leading '=' as exact-name
          # matching -- confirm this is intended rather than a typo.
          command:
          - /bin/bash
          - -c
          - |
            set -e -o pipefail; trap 'exit=1' SIGINT
            NUM_GPUS=8
            NUM_HOSTS=$(sed -n '$=' /etc/volcano/mpiworker.host)
            NP=$(($NUM_HOSTS*$NUM_GPUS))
            mpirun --allow-run-as-root \
              -mca coll ^hcoll  -mca plm_rsh_args "-p 2222" \
              -mca coll_hcoll_enable 0 \
              -np $NP -npernode $NUM_GPUS --bind-to numa \
              -hostfile /etc/volcano/mpiworker.host \
              -x NCCL_CROSS_NIC=2 \
              -x NCCL_SOCKET_NTHREADS=16 \
              -x NCCL_DEBUG=WARN \
              -x NCCL_CUMEM_ENABLE=0 \
              -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
              -x NCCL_IB_QPS_PER_CONNECTION=16 \
              -x NCCL_IB_GID_INDEX=3 \
              -x NCCL_IB_HCA==mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17 \
              -x NCCL_IB_TC=41 \
              -x NCCL_IB_SL=0 \
              -x NCCL_IB_TIMEOUT=22 \
              -x HCOLL_ENABLE_MCAST_ALL=0 \
              -x UCX_TLS=tcp \
              -x UCX_NET_DEVICES=eth0 \
              -x RX_QUEUE_LEN=8192 \
              -x IB_RX_QUEUE_LEN=8192 \
              -x NCCL_SOCKET_IFNAME=eth0 \
              -x NCCL_IGNORE_CPU_AFFINITY=1 \
              /workspace/nccl-tests/build/all_reduce_perf -b 8 -f 2 -g 1 -e 4G -c 1
            while :; do { [[ $exit ]] && break; }; sleep 1; done
  # Worker task: two pods, each requesting 8 GPUs, that run sshd on port
  # 2222 in the foreground.
  - name: mpiworker
    replicas: 2
    # NOTE(review): 0 presumably means no worker is required for the gang to
    # be considered schedulable -- confirm against Volcano task semantics.
    minAvailable: 0
    template:
      metadata: null
      spec:
        hostNetwork: true
        dnsPolicy: ClusterFirstWithHostNet
        restartPolicy: OnFailure
        terminationGracePeriodSeconds: 15
        tolerations:
        # Tolerate any taint keyed nvidia.com/gpu, whatever its value/effect.
        - key: nvidia.com/gpu
          operator: Exists
        volumes:
        # Host InfiniBand device nodes, exposed to the container as-is.
        - name: devinf
          hostPath:
            path: /dev/infiniband
        # Memory-backed /dev/shm, capped at 32Gi.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 32Gi
        containers:
        - name: mpiworker
          image: ord.ocir.io/hpc_limited_availability/nccl-tests:pytorch-24.02-nccl-2.20.5-1
          workingDir: /workspace
          ports:
          - name: mpijob-port
            containerPort: 2222
            protocol: TCP
          resources:
            requests:
              cpu: 100
              memory: 512Gi
              ephemeral-storage: 32Gi
              nvidia.com/gpu: 8
            limits:
              ephemeral-storage: 32Gi
              nvidia.com/gpu: 8
          securityContext:
            privileged: true
            capabilities:
              add:
              - IPC_LOCK
          volumeMounts:
          - name: devinf
            mountPath: /dev/infiniband
          - name: shm
            mountPath: /dev/shm
          # Create sshd's runtime directory, then run sshd in the foreground
          # (-D) on port 2222. If sshd exits non-zero, fall back to a long
          # sleep so the container stays up instead of exiting.
          command:
          - /bin/bash
          - -c
          - mkdir -p /var/run/sshd; /usr/sbin/sshd -D -p 2222 || sleep 999999999;
 # Volcano batch Job that runs the NCCL all_reduce_perf test through MPI:
 # one launcher pod (mpimaster) plus two 8-GPU worker pods (mpiworker).
 # Nesting is expressed with spaces only; tabs are not valid YAML indentation.
 apiVersion: batch.volcano.sh/v1alpha1
 kind: Job
 metadata:
  annotations:
  name: nccl-allreduce-job0
 spec:
  # NOTE(review): 0 presumably defers the gang size to Volcano's defaulting
  # -- confirm against the Volcano version in use.
  minAvailable: 0
  plugins:
    ssh: []
    svc: []
  queue: default
  schedulerName: volcano
  tasks:
  # Launcher task: runs mpirun against the hosts in the worker hostfile.
  - name: mpimaster
    policies:
    # Complete the whole Job when this task reports TaskCompleted.
    - action: CompleteJob
      event: TaskCompleted
    replicas: 1
    template:
      metadata:
      spec:
        containers:
        # The script counts hostfile lines (sed -n '$='), multiplies by
        # NUM_GPUS to get the rank count, runs mpirun over SSH port 2222, and
        # then sleeps in a loop until SIGINT sets `exit`.
        # NOTE(review): NCCL_IB_HCA receives the value "=mlx5_0,..." (doubled
        # '='); NCCL documents a leading '=' as exact-name matching --
        # confirm this is intended.
        - command:
          - /bin/bash
          - -c
          - |
            set -e -o pipefail; trap 'exit=1' SIGINT
            NUM_GPUS=8
            NUM_HOSTS=$(sed -n '$=' /etc/volcano/mpiworker.host)
            NP=$(($NUM_HOSTS*$NUM_GPUS))
            mpirun --allow-run-as-root \
              -mca coll ^hcoll -mca plm_rsh_args "-p 2222" \
              -mca coll_hcoll_enable 0 \
              -np $NP -npernode $NUM_GPUS --bind-to numa \
              -hostfile /etc/volcano/mpiworker.host \
              -x NCCL_CROSS_NIC=2 \
              -x NCCL_SOCKET_NTHREADS=16 \
              -x NCCL_DEBUG=WARN \
              -x NCCL_CUMEM_ENABLE=0 \
              -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \
              -x NCCL_IB_QPS_PER_CONNECTION=16 \
              -x NCCL_IB_GID_INDEX=3 \
              -x NCCL_IB_HCA==mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17 \
              -x NCCL_IB_TC=41 \
              -x NCCL_IB_SL=0 \
              -x NCCL_IB_TIMEOUT=22 \
              -x HCOLL_ENABLE_MCAST_ALL=0 \
              -x UCX_TLS=tcp \
              -x UCX_NET_DEVICES=eth0 \
              -x RX_QUEUE_LEN=8192 \
              -x IB_RX_QUEUE_LEN=8192 \
              -x NCCL_SOCKET_IFNAME=eth0 \
              -x NCCL_IGNORE_CPU_AFFINITY=1 \
              /workspace/nccl-tests/build/all_reduce_perf -b 8 -f 2 -g 1 -e 4G -c 1
            while :; do { [[ $exit ]] && break; }; sleep 1; done
          ports:
          - { name: mpijob-port, containerPort: 2222, protocol: TCP }
          image: ord.ocir.io/hpc_limited_availability/nccl-tests:pytorch-24.02-nccl-2.20.5-1
          name: mpimaster
          resources:
            limits:
              ephemeral-storage: 32Gi
            requests:
              cpu: 4
              ephemeral-storage: 32Gi
              memory: 1Gi
          securityContext:
            privileged: true
            capabilities:
              add:
              - IPC_LOCK
          volumeMounts:
          - { mountPath: /dev/infiniband, name: devinf }
          - { mountPath: /dev/shm, name: shm }
          workingDir: /workspace
        dnsPolicy: ClusterFirstWithHostNet
        hostNetwork: true
        restartPolicy: OnFailure
        terminationGracePeriodSeconds: 2
        volumes:
        - { name: devinf, hostPath: { path: /dev/infiniband }}
        - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }}
  # Worker task: two pods, each requesting 8 GPUs, serving sshd on port 2222.
  - minAvailable: 0
    name: mpiworker
    replicas: 2
    template:
      metadata:
      spec:
        containers:
        # Run sshd in the foreground; if it exits non-zero, sleep instead so
        # the container stays up.
        - command:
          - /bin/bash
          - -c
          - mkdir -p /var/run/sshd; /usr/sbin/sshd -D -p 2222 || sleep 999999999;
          image: ord.ocir.io/hpc_limited_availability/nccl-tests:pytorch-24.02-nccl-2.20.5-1
          name: mpiworker
          ports:
          - { name: mpijob-port, containerPort: 2222, protocol: TCP }
          resources:
            limits:
              ephemeral-storage: 32Gi
              nvidia.com/gpu: 8
            requests:
              cpu: 100
              ephemeral-storage: 32Gi
              memory: 512Gi
              nvidia.com/gpu: 8
          securityContext:
            privileged: true
            capabilities:
              add:
              - IPC_LOCK
          volumeMounts:
          - { mountPath: /dev/infiniband, name: devinf }
          - { mountPath: /dev/shm, name: shm }
          workingDir: /workspace
        dnsPolicy: ClusterFirstWithHostNet
        hostNetwork: true
        restartPolicy: OnFailure
        terminationGracePeriodSeconds: 15
        tolerations:
        - { key: nvidia.com/gpu, operator: Exists }
        volumes:
        - { name: devinf, hostPath: { path: /dev/infiniband }}
        - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }}