Last active
February 5, 2019 09:10
-
-
Save kingspp/1b09bf7e889ebcdc33b2f96200d7f100 to your computer and use it in GitHub Desktop.
Horovod Spark Installation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# There is no proper documentation on installing Horovod with Spark support.
# Reference: https://github.com/uber/horovod/blob/master/docs/spark.md
# Fix for a corrupt Java installation: add the webupd8team PPA, refresh the
# package index, and install the Oracle Java 8 installer package.
sudo apt-add-repository ppa:webupd8team/java
sudo apt-get update
sudo apt-get install oracle-java8-installer
# Install Spark: download the 2.4.0 / Hadoop 2.7 build, unpack it into ./spark,
# and start a standalone master.
wget http://mirrors.estointernet.in/apache/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
# The download is a .tgz archive; it has to be extracted before the directory
# it contains can be renamed.
tar -xzf spark-2.4.0-bin-hadoop2.7.tgz
mv spark-2.4.0-bin-hadoop2.7 spark
./spark/sbin/start-master.sh
# Manually copy the horovod.spark sources into the pip-installed horovod
# package, as they are not included by setup.py.
# NOTE(review): the source path is relative, so this presumably has to be run
# from the directory containing a horovod checkout - confirm.
sudo cp -R horovod/horovod/spark /usr/local/lib/python3.6/site-packages/horovod
# Install packages found by trial and error; no requirements.txt or
# dependency list is specified for these.
sudo pip3.6 install pyspark cloudpickle
# Export the environment variables that point at the Spark directory and select
# python3.6 for both PySpark variables.
# SPARK_HOME is a machine-specific absolute path; adjust it to wherever the
# spark directory actually lives.
export SPARK_HOME=/home/prathyush.sp/spark
export PYSPARK_PYTHON=python3.6
export PYSPARK_DRIVER_PYTHON=python3.6
# Create a Spark session at the start of the script.
# `pyspark` itself must be imported: `from pyspark.sql import SparkSession`
# does not bind the name `pyspark`, which the print() below relies on.
import pyspark
from pyspark.sql import SparkSession

SparkSession.builder.appName('abc').getOrCreate()
# Print the active SparkContext as a check that a context now exists.
print(pyspark.SparkContext._active_spark_context)
# Replace mpirun_command in https://github.com/uber/horovod/blob/master/horovod/spark/__init__.py at line 160
# The names used below (num_proc, host_hashes, driver, common_intfs, env,
# codec, sys) are expected to come from the surrounding Horovod function that
# this snippet is pasted into; they are not defined here.
mpirun_command = (
    'mpirun --allow-run-as-root --tag-output '
    '-np {num_proc} -H {hosts} '
    '-bind-to none -map-by slot '
    # NOTE(review): Open MPI documents btl_tcp_if_include and
    # btl_tcp_if_exclude as mutually exclusive - confirm both are really wanted.
    '-mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include {common_intfs} -mca btl_tcp_if_exclude lo,docker0,ens4d1 '
    # The trailing space matters: adjacent string literals are concatenated, and
    # without it the command would contain "UCX_MEM_MALLOC_HOOKS=no-x <KEY>".
    '-x NCCL_DEBUG=INFO -x NCCL_SOCKET_IFNAME={common_intfs} -x UCX_MEM_MALLOC_HOOKS=no '
    '{env} '  # expect a lot of environment variables
    '-mca plm_rsh_agent "{python} -m horovod.spark.driver.mpirun_rsh {encoded_driver_addresses}" '
    '{python} -m horovod.spark.task.mpirun_exec_fn {encoded_driver_addresses} '
    .format(num_proc=num_proc,
            # One "host:slots" entry per host hash; the slot count is the number
            # of task indices the driver reports for that host hash.
            hosts=','.join('%s:%d' % (host_hash, len(driver.task_host_hash_indices()[host_hash]))
                           for host_hash in host_hashes),
            common_intfs=','.join(common_intfs),
            # Forward every key of env to the MPI processes via "-x KEY".
            env=' '.join('-x %s' % key for key in env.keys()),
            python=sys.executable,
            encoded_driver_addresses=codec.dumps_base64(driver.addresses())))
# Network interface issue: all systems need to have a common network interface
# name. If the machines are connected through Ethernet and run Ubuntu 16, the
# following works.
sudo nano /etc/default/grub
# In the editor, set the following line in /etc/default/grub and save the file
# (typing it at the shell prompt would only set a shell variable, not change
# the GRUB configuration):
#   GRUB_CMDLINE_LINUX="net.ifnames=0"
sudo update-grub
sudo reboot
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment