Last active
March 3, 2020 11:09
-
-
Save spdin/a3716eeb95800fcbf60952f70dc9ca83 to your computer and use it in GitHub Desktop.
Docker and horovod command collection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the horovod:latest image, using horovod-docker-gpu as the build context
docker build -t horovod:latest horovod-docker-gpu
# List images
docker images
# List running containers
docker ps
# Remove a container
docker rm [container name]
# Remove an image
docker rmi [image name:tag]
# Open an interactive bash shell inside a running container
docker exec -it [container-id] bash
# Save an image to a tar archive
# Example: nvidia-docker save -o horovod.tar horovod:latest
nvidia-docker save -o [name of file.tar] [name of images:tag]
# Load an image from a tar archive
docker load -i horovod.tar
# Run a container interactively with the NVIDIA runtime
docker run --runtime=nvidia -it horovod:latest
# Run a container on the host network, mounting the host user's SSH
# directory as root's ~/.ssh inside the container
docker run --runtime=nvidia -it --network=host -v /home/kvgn-beta/.ssh:/root/.ssh horovod:latest
# Start sshd on port 12345, then sleep forever so the shell/container stays alive
# (presumably meant to be run inside the container started above)
/usr/sbin/sshd -p 12345; sleep infinity
# Same as the two steps above in one command: start the container and
# immediately launch sshd on port 12345 inside it
docker run --runtime=nvidia -it --network=host -v /home/kvgn-beta/.ssh:/root/.ssh horovod:latest \
  bash -c "/usr/sbin/sshd -p 12345; sleep infinity"
# Run on distributed machines: 4 processes across 4 machines with 1 GPU (slot) each.
# --start-timeout is important when running on multiple machines, because loading
# the distributed model and optimizer takes quite long the first time.
# -H lists each host as host:slots; -p is the SSH port used to reach the hosts.
horovodrun -np 4 --start-timeout 300 \
  -H localhost:1,192.168.99.2:1,192.168.99.12:1,192.168.99.4:1 -p 12345 \
  python pytorch_mnist.py
# Other helpful commands
# Add a user
adduser [name of username]
# Rename a network interface from eno2 to eno1:
# bring it down, rename it, then bring it back up under the new name
sudo /sbin/ip link set eno2 down
sudo /sbin/ip link set eno2 name eno1
sudo /sbin/ip link set eno1 up
# Split a file into 500 MB pieces (written as xaa, xab, ... in the current directory)
split -b 500M dynamic.tar
# Join the pieces back together
# NOTE: x* matches every file starting with "x" in the current directory,
# so make sure only the split pieces match
cat x* > dynamic.tar
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment