Created
January 29, 2020 06:36
-
-
Save spdin/46cb8abf93a2814b781e588eb9de109a to your computer and use it in GitHub Desktop.
Dockerfile for Horovod with PyTorch 1.3.0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Horovod image built specifically for PyTorch 1.3.0 on CUDA 10.0 / Ubuntu 18.04.
FROM nvidia/cuda:10.0-devel-ubuntu18.04

# Versions of the frameworks and NVIDIA libraries installed below.
# The cuDNN and NCCL values are full apt package versions (note the +cuda10.0
# suffix) and are passed verbatim to apt-get.
# Frameworks that are disabled in this image, kept for reference:
#   TENSORFLOW_VERSION=2.0.0  (tightly coupled to CUDA and cuDNN, so it must
#                              be selected carefully)
#   MXNET_VERSION=1.5.0
ENV PYTORCH_VERSION=1.3.0 \
    TORCHVISION_VERSION=0.4.1 \
    CUDNN_VERSION=7.6.0.64-1+cuda10.0 \
    NCCL_VERSION=2.4.7-1+cuda10.0

# Ubuntu Bionic supports Python 2.7 or 3.6 out of the box; choose one with
# `--build-arg python=<version>`. The value is also exported to later steps
# as PYTHON_VERSION.
ARG python=3.6
ENV PYTHON_VERSION=${python}

# Use bash for all following RUN instructions; -u turns a reference to an
# unset variable into an error.
SHELL ["/bin/bash", "-cu"]
# System packages: build toolchain, download tools, pinned cuDNN/NCCL,
# image codec headers, the selected Python interpreter with its headers, and
# the RDMA/InfiniBand userspace libraries.
# NOTE(review): --allow-downgrades / --allow-change-held-packages are
# presumably required because the base image ships (and holds) a different
# NCCL version than the one pinned here -- confirm against the base image.
RUN apt-get update \
 && apt-get install -y \
        --allow-downgrades \
        --allow-change-held-packages \
        --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        g++-4.8 \
        git \
        ibverbs-providers \
        libcudnn7=${CUDNN_VERSION} \
        libibverbs1 \
        libjpeg-dev \
        libnccl-dev=${NCCL_VERSION} \
        libnccl2=${NCCL_VERSION} \
        libpng-dev \
        librdmacm1 \
        python${PYTHON_VERSION} \
        python${PYTHON_VERSION}-dev \
        vim \
        wget
# distutils is packaged separately from the interpreter for Python 3.6, so
# install it only when that version is selected.
# The package index is refreshed in this same layer: relying on the index
# left behind by an earlier (possibly long-cached) layer can make apt request
# package versions that no longer exist on the mirror.
RUN if [[ "${PYTHON_VERSION}" == "3.6" ]]; then \
        apt-get update && \
        apt-get install -y --no-install-recommends python${PYTHON_VERSION}-distutils; \
    fi
# Make the selected interpreter available as plain `python`.
RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python

# Bootstrap pip for the selected interpreter.
# The unversioned https://bootstrap.pypa.io/get-pip.py only supports currently
# maintained Python releases and aborts on 2.7 and 3.6, so fetch the script
# published for the chosen Python version instead.
# curl flags: -f fail on HTTP errors (instead of saving an error page that
# would then be executed), -sS quiet but still report errors, -L follow
# redirects.
RUN curl -fsSL -o get-pip.py https://bootstrap.pypa.io/pip/${PYTHON_VERSION}/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py
# Install the common Python dependencies. Only the PyTorch stack is built in
# this image; the TensorFlow/Keras packages are disabled and kept for reference:
#   tensorflow-gpu==${TENSORFLOW_VERSION}
#   keras
# --no-cache-dir keeps pip's download cache out of the image layers, matching
# the Horovod install step further down.
RUN pip install --no-cache-dir future typing
RUN pip install --no-cache-dir \
        numpy \
        h5py
# Install the CUDA 10.0 builds of PyTorch and torchvision.
# The wheel file names used to be assembled by hand with `wheel.pep425tags`,
# a module that has been removed from current `wheel` releases, which makes
# that command substitution fail. Instead, pin the exact local versions
# (+cu100) and point pip at the PyTorch wheel index with -f; pip then picks
# the wheel matching the running interpreter and platform on its own.
RUN pip install --no-cache-dir \
        torch==${PYTORCH_VERSION}+cu100 \
        torchvision==${TORCHVISION_VERSION}+cu100 \
        -f https://download.pytorch.org/whl/torch_stable.html
# MXNet is disabled in this image:
#RUN pip install mxnet-cu100==${MXNET_VERSION}
# Build Open MPI 4.0.0 from the release tarball and install it.
# Download, build and cleanup share one layer so neither the tarball nor the
# build tree is kept in the image. ldconfig refreshes the dynamic linker cache
# after `make install`.
RUN mkdir /tmp/openmpi \
 && cd /tmp/openmpi \
 && wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.0.tar.gz \
 && tar zxf openmpi-4.0.0.tar.gz \
 && cd openmpi-4.0.0 \
 && ./configure --enable-orterun-prefix-by-default \
 && make -j "$(nproc)" all \
 && make install \
 && ldconfig \
 && rm -rf /tmp/openmpi
# Build and install Horovod with NCCL for GPU allreduce/broadcast and PyTorch
# support enabled.
# The CUDA stub libraries are registered with the dynamic linker only for the
# duration of the build; the trailing plain `ldconfig` rebuilds the linker
# cache without the stubs directory.
# Disabled framework switches, kept for reference:
#   HOROVOD_WITH_TENSORFLOW=1
#   HOROVOD_WITH_MXNET=1
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \
 && HOROVOD_GPU_ALLREDUCE=NCCL \
    HOROVOD_GPU_BROADCAST=NCCL \
    HOROVOD_WITH_PYTORCH=1 \
    pip install --no-cache-dir horovod \
 && ldconfig
# Install OpenSSH for MPI to communicate between containers, and create the
# /var/run/sshd directory.
# The package index is refreshed in this same layer so that a cached earlier
# layer cannot leave apt pointing at package versions that have since been
# removed from the mirror.
RUN apt-get update && \
    apt-get install -y --no-install-recommends openssh-client openssh-server && \
    mkdir -p /var/run/sshd
# Let the SSH client connect to other containers without asking for host-key
# confirmation: copy ssh_config minus any existing StrictHostKeyChecking
# lines, append an explicit "no", then swap the new file into place.
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new \
 && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
 && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
# Download the Horovod examples into /examples and make that the working
# directory.
# GitHub has shut down its Subversion bridge, so the former
# `svn checkout https://github.com/horovod/horovod/trunk/examples` no longer
# works. Take the directory from a shallow clone of the default branch
# instead, using the git package installed earlier in this file, and discard
# the rest of the clone in the same layer.
RUN git clone --depth 1 https://github.com/horovod/horovod.git /tmp/horovod && \
    mv /tmp/horovod/examples /examples && \
    rm -rf /tmp/horovod
WORKDIR "/examples"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment