# Start the Horovod container "horovod_v3" in the background.
#   - host networking, NVIDIA runtime, privileged mode
#   - bind-mounts the training data directory and /dev/infiniband into the container
#   - the container command starts sshd on port 55557 and then sleeps forever,
#     which keeps the container running
# NOTE(review): --publish-all is probably a no-op together with --net=host; confirm.
docker run \
  --net=host \
  --runtime=nvidia \
  --detach \
  --volume /search/odin/tf-code-acoustics/train-data:/search/odin/tf-code-acoustics/train-data \
  --interactive --tty \
  --publish-all \
  --privileged \
  --volume /dev/infiniband/:/dev/infiniband \
  --name horovod_v3 \
  10.142.104.73:8043/dlp/horovod:latest \
  bash -c "/usr/sbin/sshd -p 55557; sleep infinity"
# 1 host x 8 processes, TCP run: NCCL InfiniBand transport disabled, NCCL sockets
# and the Open MPI TCP BTL both pinned to eth0. The remote launcher is given
# "-p 55557" (the port sshd is started on in the container).
# Runs in the background; stdout and stderr go to test_1x8_256.log.
mpirun --allow-run-as-root \
  -np 8 \
  -H 10.141.186.118:8 \
  -x NCCL_IB_DISABLE=1 \
  -x NCCL_SOCKET_IFNAME=eth0 \
  --mca btl_tcp_if_include eth0 \
  --mca plm_rsh_args "-p 55557" \
  bash run-restore.sh ctc \
  &> test_1x8_256.log &
# 1 host x 8 processes, InfiniBand run: NCCL InfiniBand transport left enabled,
# NCCL sockets and the Open MPI TCP BTL both pinned to ib0.
# Runs in the background; stdout and stderr go to test_1x8_256_ib.log.
mpirun --allow-run-as-root \
  -np 8 \
  -H 10.141.250.15:8 \
  -x NCCL_IB_DISABLE=0 \
  -x NCCL_SOCKET_IFNAME=ib0 \
  --mca btl_tcp_if_include ib0 \
  --mca plm_rsh_args "-p 55557" \
  bash run-restore.sh ctc \
  &> test_1x8_256_ib.log &
# 2 hosts x 8 processes (16 total), TCP run: NCCL InfiniBand transport disabled,
# NCCL sockets and the Open MPI TCP BTL both pinned to eth0. The remote launcher
# is given "-p 55557" (the port sshd is started on in the container).
# Runs in the background; stdout and stderr go to test_2x8_512.log.
mpirun --allow-run-as-root \
  -np 16 \
  -H 10.141.186.118:8,10.141.186.119:8 \
  -x NCCL_IB_DISABLE=1 \
  -x NCCL_SOCKET_IFNAME=eth0 \
  --mca btl_tcp_if_include eth0 \
  --mca plm_rsh_args "-p 55557" \
  bash run-restore.sh ctc \
  &> test_2x8_512.log &
# 2 hosts x 8 processes (16 total), InfiniBand run: NCCL InfiniBand transport
# left enabled, NCCL sockets and the Open MPI TCP BTL both pinned to ib0.
# Runs in the background; stdout and stderr go to test_2x8_512_ib.log.
mpirun --allow-run-as-root \
  -np 16 \
  -H 10.141.250.15:8,10.141.250.16:8 \
  -x NCCL_IB_DISABLE=0 \
  -x NCCL_SOCKET_IFNAME=ib0 \
  --mca btl_tcp_if_include ib0 \
  --mca plm_rsh_args "-p 55557" \
  bash run-restore.sh ctc \
  &> test_2x8_512_ib.log &
# 4 hosts x 8 processes (32 total), TCP run: NCCL InfiniBand transport disabled,
# NCCL sockets and the Open MPI TCP BTL both pinned to eth0. The remote launcher
# is given "-p 55557" (the port sshd is started on in the container).
# Runs in the background; stdout and stderr go to test_4x8_1024.log.
mpirun --allow-run-as-root \
  -np 32 \
  -H 10.141.186.118:8,10.141.186.119:8,10.141.186.111:8,10.141.186.117:8 \
  -x NCCL_IB_DISABLE=1 \
  -x NCCL_SOCKET_IFNAME=eth0 \
  --mca btl_tcp_if_include eth0 \
  --mca plm_rsh_args "-p 55557" \
  bash run-restore.sh ctc \
  &> test_4x8_1024.log &
# 4 hosts x 8 processes (32 total), InfiniBand run: NCCL InfiniBand transport
# left enabled, NCCL sockets and the Open MPI TCP BTL both pinned to ib0.
# Runs in the background; stdout and stderr go to test_4x8_1024_ib.log.
mpirun --allow-run-as-root \
  -np 32 \
  -H 10.141.250.15:8,10.141.250.16:8,10.141.250.8:8,10.141.250.14:8 \
  -x NCCL_IB_DISABLE=0 \
  -x NCCL_SOCKET_IFNAME=ib0 \
  --mca btl_tcp_if_include ib0 \
  --mca plm_rsh_args "-p 55557" \
  bash run-restore.sh ctc \
  &> test_4x8_1024_ib.log &
# 8 hosts x 8 processes (64 total), TCP run: NCCL InfiniBand transport disabled,
# NCCL sockets and the Open MPI TCP BTL both pinned to eth0. The remote launcher
# is given "-p 55557" (the port sshd is started on in the container).
# Runs in the background; stdout and stderr go to test_8x8_2048.log.
mpirun --allow-run-as-root \
  -np 64 \
  -H 10.141.186.118:8,10.141.186.119:8,10.141.186.111:8,10.141.186.117:8,10.141.162.80:8,10.141.170.36:8,10.141.202.77:8,10.141.202.71:8 \
  -x NCCL_IB_DISABLE=1 \
  -x NCCL_SOCKET_IFNAME=eth0 \
  --mca btl_tcp_if_include eth0 \
  --mca plm_rsh_args "-p 55557" \
  bash run-restore.sh ctc \
  &> test_8x8_2048.log &
# 8 hosts x 8 processes (64 total), InfiniBand run: NCCL InfiniBand transport
# left enabled, NCCL sockets and the Open MPI TCP BTL both pinned to ib0.
# Runs in the background; stdout and stderr go to test_8x8_2048_ib.log.
mpirun --allow-run-as-root \
  -np 64 \
  -H 10.141.250.15:8,10.141.250.16:8,10.141.250.8:8,10.141.250.14:8,10.141.250.33:8,10.141.251.7:8,10.141.251.45:8,10.141.251.55:8 \
  -x NCCL_IB_DISABLE=0 \
  -x NCCL_SOCKET_IFNAME=ib0 \
  --mca btl_tcp_if_include ib0 \
  --mca plm_rsh_args "-p 55557" \
  bash run-restore.sh ctc \
  &> test_8x8_2048_ib.log &
| Node | GPUs | Batch Size | TCP | InfiniBand |
|------|------|------------|-----|------------|
| 1 | 8 | 256 | 304s | 304s |
| 2 | 16 | 512 | 255s | 151s |
| 4 | 32 | 1024 | 118s | 92s |
| 8 | 64 | 2048 | 83s | 50s |