bartimusprimed · November 29, 2016 23:47
diff --git a/install-tensorflow.sh b/install-tensorflow.sh
 # Note – this is not a bash script (some of the steps require reboot)
 # I named it .sh just so Github does correct syntax highlighting.
 #
 # This is also available as an AMI in us-east-1 (virginia): ami-cf5028a5
 #
 # The CUDA part is mostly based on this excellent blog post:
 # http://tleyden.github.io/blog/2014/10/25/cuda-6-dot-5-on-aws-gpu-instance-running-ubuntu-14-dot-04/

 # Install various packages
 # Refresh the package index, upgrade what is installed, then install the
 # build prerequisites (each package named exactly once).
 sudo apt-get update
 sudo apt-get upgrade -y # choose “install package maintainers version”
 sudo apt-get install -y \
   build-essential git swig zip zlib1g-dev default-jdk \
   python-pip python-dev python-numpy

 # Blacklist Nouveau, which has some kind of conflict with the NVIDIA driver.
 # Each printf argument becomes one line of the config file; the trailing ''
 # writes the final empty line.
 printf '%s\n' \
   'blacklist nouveau' \
   'blacklist lbm-nouveau' \
   'options nouveau modeset=0' \
   'alias nouveau off' \
   'alias lbm-nouveau off' \
   '' \
   | sudo tee /etc/modprobe.d/blacklist-nouveau.conf
 # Append (tee -a) the modeset option to the KMS config as well.
 printf '%s\n' 'options nouveau modeset=0' \
   | sudo tee -a /etc/modprobe.d/nouveau-kms.conf
 # Regenerate the initramfs so it picks up the new modprobe configuration.
 sudo update-initramfs -u

 # Install the linux-image-extra-virtual kernel package.
 # NOTE(review): presumably this supplies extra kernel modules needed on the
 # virtualised instance before the NVIDIA driver is installed — confirm.
 sudo apt-get install -y linux-image-extra-virtual

 # REBOOT!
 # The session ends here. Log back in and continue with the steps below by hand;
 # nothing after this line runs automatically.
 sudo reboot

 # Install the kernel sources plus the headers package that matches the
 # currently running kernel release (as reported by uname -r).
 sudo apt-get install -y linux-source "linux-headers-$(uname -r)"

 # Install CUDA 8.0
 # Download the runfile into ./install, unpack its component installers, then
 # run the extracted NVIDIA-Linux (driver) installer followed by the
 # cuda-linux64 (toolkit) installer.
 # -p lets this step be repeated without failing if ./install already exists.
 mkdir -p install
 cd ./install
 wget https://developer.nvidia.com/compute/cuda/8.0/prod/local_installers/cuda_8.0.44_linux-run
 chmod +x cuda_8.0.44_linux-run
 # The extract target is passed as an absolute path; quoting keeps it intact
 # even if the current directory contains spaces.
 ./cuda_8.0.44_linux-run -extract="$PWD/nvidia_installers"
 cd nvidia_installers
 sudo ./NVIDIA-Linux-x86_64-367.48.run
 # Load the freshly installed kernel module before installing the toolkit.
 sudo modprobe nvidia
 sudo ./cuda-linux64-rel-8.0.44-21122537.run
 # Return to the home directory for the following steps.
 cd

 # Install cuDNN v5.1 (the build for CUDA 8.0)
 # YOU NEED TO SCP THIS ONE FROM SOMEWHERE ELSE – it's not available online.
 # You need to register and get approved to get a download link. Very annoying.
 tar xvzf cudnn-8.0-linux-x64-v5.1.tgz
 sudo cp cuda/include/cudnn.h /usr/local/cuda/include
 # -P copies symbolic links as links. NVIDIA's install guide uses it so the
 # libcudnn.so link chain is preserved instead of duplicating the library.
 sudo cp -P cuda/lib64/libcudnn* /usr/local/cuda/lib64
 # Make the header and libraries readable by every user.
 sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*

 # At this point the root mount is getting a bit full
 # I had a lot of issues where the disk would fill up and then Bazel would end up in this weird state complaining about random things
 # Make sure you don't run out of disk space when building Tensorflow!
 # Move /tmp onto /mnt by replacing it with a symlink to /mnt/tmp.
 sudo mkdir -p /mnt/tmp
 # 1777 = world-writable plus the sticky bit, the usual mode for /tmp: users
 # can create files there but cannot delete or rename each other's files.
 sudo chmod 1777 /mnt/tmp
 # WARNING: this deletes everything currently in /tmp. Run it while nothing
 # important is using temporary files.
 sudo rm -rf /tmp
 sudo ln -s /mnt/tmp /tmp
 # Note that /mnt is not saved when building an AMI, so don't put anything crucial on it

 # Install Bazel
 # Latest installation manual is here:
 # https://bazel.build/versions/master/docs/install.html
 # Oracle Java 8 comes from the webupd8team PPA (the Bazel apt repository
 # registered below is the jdk1.8 flavour).
 sudo add-apt-repository ppa:webupd8team/java
 sudo apt-get update
 sudo apt-get install oracle-java8-installer

 # Register the Bazel apt repository and import its signing key.
 printf '%s\n' 'deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8' \
   | sudo tee /etc/apt/sources.list.d/bazel.list
 curl https://bazel.build/bazel-release.pub.gpg \
   | sudo apt-key add -

 # Refresh the index so the new repository is visible, then install Bazel.
 sudo apt-get update \
   && sudo apt-get install bazel
 sudo apt-get upgrade bazel

 # Install TensorFlow
 # Point the build at the CUDA 8.0 toolkit.
 export CUDA_HOME=/usr/local/cuda-8.0
 # Append the CUDA libraries to LD_LIBRARY_PATH. The ${VAR:+...} form emits the
 # old value and its ":" separator only when the variable is non-empty; a bare
 # leading ":" would add an empty entry, which the dynamic loader treats as the
 # current working directory.
 export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${CUDA_HOME}/lib64"
 # Fetch the TensorFlow sources together with their submodules and work from
 # the top of the checkout.
 git clone --recurse-submodules \
   https://github.com/tensorflow/tensorflow
 cd tensorflow

 # IMPORTANT! When the configure step asks for compute capabilities, answer 3.0
 # if you are using g2.xlarge.
 # If you are using p2.xlarge, just enter an empty string.
 # Please note that each additional compute capability significantly increases your build time and binary size.
 # Example prompt and answer:
 # [Default is: "3.5,5.2"]: 3.0
 env TF_UNOFFICIAL_SETTING=1 ./configure

 # Build Python package
 # Note: you have to specify --config=cuda here - this is not mentioned in the official docs
 # https://github.com/tensorflow/tensorflow/issues/25#issuecomment-156173717
 # To build with GPU support:
 bazel build -c opt --config=cuda //tensorflow/tools/pip_package:build_pip_package
 # Write the wheel into /tmp/tensorflow_pkg.
 bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
 # The wheel's file name embeds the version of whatever revision was cloned, so
 # match it with a glob rather than a fixed version string. This assumes
 # /tmp/tensorflow_pkg holds only the wheel that was just built.
 sudo pip install /tmp/tensorflow_pkg/tensorflow-*.whl

 # Test it!
 # Run the CIFAR-10 multi-GPU training example. The path is relative to the
 # top of the TensorFlow checkout entered earlier.
 cd tensorflow/models/image/cifar10/
 python cifar10_multi_gpu_train.py 

 # Observed throughput:
 # On a g2.2xlarge: step 100, loss = 4.50 (325.2 examples/sec; 0.394 sec/batch)
 # On a g2.8xlarge: step 100, loss = 4.49 (337.9 examples/sec; 0.379 sec/batch)
 # It doesn't seem to be able to use the 4 GPU cards, unfortunately :(
 # NOTE(review): the script is started here without arguments; it may expose an
 # option for the number of GPUs to use — check its flags before concluding
 # that multi-GPU training does not work.

 # To run TensorFlow in IPython after a session relaunch you have to export
 # these variables again: the CUDA install root, and a library path made of the
 # CUPTI directory followed by the CUDA 8.0 lib64 directory.
 export CUDA_HOME=/usr/local/cuda-8.0
 export LD_LIBRARY_PATH="/usr/local/cuda/extras/CUPTI/lib64:${CUDA_HOME}/lib64"
	# Note – this is not a bash script (some of the steps require reboot)
	# I named it .sh just so Github does correct syntax highlighting.
	#
	# This is also available as an AMI in us-east-1 (virginia): ami-cf5028a5
	#
	# The CUDA part is mostly based on this excellent blog post:
	# http://tleyden.github.io/blog/2014/10/25/cuda-6-dot-5-on-aws-gpu-instance-running-ubuntu-14-dot-04/

	# Install various packages
	# Refresh the package index, upgrade what is installed, then install the
	# build prerequisites (each package named exactly once).
	sudo apt-get update
	sudo apt-get upgrade -y # choose “install package maintainers version”
	sudo apt-get install -y \
		build-essential git swig zip zlib1g-dev default-jdk \
		python-pip python-dev python-numpy

	# Blacklist Nouveau, which has some kind of conflict with the NVIDIA driver.
	# The pipe must be a real (unescaped) "|": an escaped one is handed to the
	# first command as a literal argument and tee never runs.
	# Each printf argument becomes one line of the config file; the trailing ''
	# writes the final empty line.
	printf '%s\n' \
		'blacklist nouveau' \
		'blacklist lbm-nouveau' \
		'options nouveau modeset=0' \
		'alias nouveau off' \
		'alias lbm-nouveau off' \
		'' \
		| sudo tee /etc/modprobe.d/blacklist-nouveau.conf
	# Append (tee -a) the modeset option to the KMS config as well.
	printf '%s\n' 'options nouveau modeset=0' \
		| sudo tee -a /etc/modprobe.d/nouveau-kms.conf
	# Regenerate the initramfs so it picks up the new modprobe configuration.
	sudo update-initramfs -u

	# Install the linux-image-extra-virtual kernel package.
	# NOTE(review): presumably this supplies extra kernel modules needed on the
	# virtualised instance before the NVIDIA driver is installed — confirm.
	sudo apt-get install -y linux-image-extra-virtual

	# REBOOT!
	# The session ends here. Log back in and continue with the steps below by hand;
	# nothing after this line runs automatically.
	sudo reboot

	# Install the kernel sources plus the headers package that matches the
	# currently running kernel release (as reported by uname -r).
	sudo apt-get install -y linux-source "linux-headers-$(uname -r)"

	# Install CUDA 8.0
	# Download the runfile into ./install, unpack its component installers, then
	# run the extracted NVIDIA-Linux (driver) installer followed by the
	# cuda-linux64 (toolkit) installer.
	# -p lets this step be repeated without failing if ./install already exists.
	mkdir -p install
	cd ./install
	wget https://developer.nvidia.com/compute/cuda/8.0/prod/local_installers/cuda_8.0.44_linux-run
	chmod +x cuda_8.0.44_linux-run
	# The extract target is passed as an absolute path; quoting keeps it intact
	# even if the current directory contains spaces.
	./cuda_8.0.44_linux-run -extract="$PWD/nvidia_installers"
	cd nvidia_installers
	sudo ./NVIDIA-Linux-x86_64-367.48.run
	# Load the freshly installed kernel module before installing the toolkit.
	sudo modprobe nvidia
	sudo ./cuda-linux64-rel-8.0.44-21122537.run
	# Return to the home directory for the following steps.
	cd

	# Install cuDNN v5.1 (the build for CUDA 8.0)
	# YOU NEED TO SCP THIS ONE FROM SOMEWHERE ELSE – it's not available online.
	# You need to register and get approved to get a download link. Very annoying.
	tar xvzf cudnn-8.0-linux-x64-v5.1.tgz
	sudo cp cuda/include/cudnn.h /usr/local/cuda/include
	# -P copies symbolic links as links. NVIDIA's install guide uses it so the
	# libcudnn.so link chain is preserved instead of duplicating the library.
	sudo cp -P cuda/lib64/libcudnn* /usr/local/cuda/lib64
	# Make the header and libraries readable by every user.
	sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*

	# At this point the root mount is getting a bit full
	# I had a lot of issues where the disk would fill up and then Bazel would end up in this weird state complaining about random things
	# Make sure you don't run out of disk space when building Tensorflow!
	# Move /tmp onto /mnt by replacing it with a symlink to /mnt/tmp.
	sudo mkdir -p /mnt/tmp
	# 1777 = world-writable plus the sticky bit, the usual mode for /tmp: users
	# can create files there but cannot delete or rename each other's files.
	sudo chmod 1777 /mnt/tmp
	# WARNING: this deletes everything currently in /tmp. Run it while nothing
	# important is using temporary files.
	sudo rm -rf /tmp
	sudo ln -s /mnt/tmp /tmp
	# Note that /mnt is not saved when building an AMI, so don't put anything crucial on it

	# Install Bazel
	# Latest installation manual is here:
	# https://bazel.build/versions/master/docs/install.html
	# Oracle Java 8 comes from the webupd8team PPA (the Bazel apt repository
	# registered below is the jdk1.8 flavour).
	sudo add-apt-repository ppa:webupd8team/java
	sudo apt-get update
	sudo apt-get install oracle-java8-installer

	# Register the Bazel apt repository and import its signing key.
	# Both commands need a real (unescaped) "|" so the output actually reaches
	# tee / apt-key instead of being treated as a literal argument.
	printf '%s\n' 'deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8' \
		| sudo tee /etc/apt/sources.list.d/bazel.list
	curl https://bazel.build/bazel-release.pub.gpg \
		| sudo apt-key add -

	# Refresh the index so the new repository is visible, then install Bazel.
	sudo apt-get update && sudo apt-get install bazel
	sudo apt-get upgrade bazel

	# Install TensorFlow
	# Point the build at the CUDA 8.0 toolkit.
	export CUDA_HOME=/usr/local/cuda-8.0
	# Append the CUDA libraries to LD_LIBRARY_PATH. The ${VAR:+...} form emits the
	# old value and its ":" separator only when the variable is non-empty; a bare
	# leading ":" would add an empty entry, which the dynamic loader treats as the
	# current working directory.
	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${CUDA_HOME}/lib64"
	# Fetch the TensorFlow sources together with their submodules and work from
	# the top of the checkout.
	git clone --recurse-submodules \
		https://github.com/tensorflow/tensorflow
	cd tensorflow

	# IMPORTANT! When the configure step asks for compute capabilities, answer 3.0
	# if you are using g2.xlarge.
	# If you are using p2.xlarge, just enter an empty string.
	# Please note that each additional compute capability significantly increases your build time and binary size.
	# Example prompt and answer:
	# [Default is: "3.5,5.2"]: 3.0
	env TF_UNOFFICIAL_SETTING=1 ./configure

	# Build Python package
	# Note: you have to specify --config=cuda here - this is not mentioned in the official docs
	# https://github.com/tensorflow/tensorflow/issues/25#issuecomment-156173717
	# To build with GPU support:
	bazel build -c opt --config=cuda //tensorflow/tools/pip_package:build_pip_package
	# Write the wheel into /tmp/tensorflow_pkg.
	bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
	# The wheel's file name embeds the version of whatever revision was cloned, so
	# match it with a glob rather than a fixed version string. This assumes
	# /tmp/tensorflow_pkg holds only the wheel that was just built.
	sudo pip install /tmp/tensorflow_pkg/tensorflow-*.whl

	# Test it!
	# Run the CIFAR-10 multi-GPU training example. The path is relative to the
	# top of the TensorFlow checkout entered earlier.
	cd tensorflow/models/image/cifar10/
	python cifar10_multi_gpu_train.py

	# Observed throughput:
	# On a g2.2xlarge: step 100, loss = 4.50 (325.2 examples/sec; 0.394 sec/batch)
	# On a g2.8xlarge: step 100, loss = 4.49 (337.9 examples/sec; 0.379 sec/batch)
	# It doesn't seem to be able to use the 4 GPU cards, unfortunately :(
	# NOTE(review): the script is started here without arguments; it may expose an
	# option for the number of GPUs to use — check its flags before concluding
	# that multi-GPU training does not work.

	# To run TensorFlow in IPython after a session relaunch you have to export
	# these variables again: the CUDA install root, and a library path made of the
	# CUPTI directory followed by the CUDA 8.0 lib64 directory.
	export CUDA_HOME=/usr/local/cuda-8.0
	export LD_LIBRARY_PATH="/usr/local/cuda/extras/CUPTI/lib64:${CUDA_HOME}/lib64"