Skip to content

Instantly share code, notes, and snippets.

@nicor88
Created April 20, 2017 10:23
Show Gist options
  • Save nicor88/5260654eb26f6118772551861880dd67 to your computer and use it in GitHub Desktop.
Save nicor88/5260654eb26f6118772551861880dd67 to your computer and use it in GitHub Desktop.
Bootstrap action to install Conda and Jupyter on EMR
#!/usr/bin/env bash
set -x -e
JUPYTER_PASSWORD=${1:-"myJupyterPassword"}
NOTEBOOK_DIR=${2:-"s3://myS3Bucket/notebooks/"}
# home backup
if [ ! -d /mnt/home_backup ]; then
sudo mkdir /mnt/home_backup
sudo cp -a /home/* /mnt/home_backup
fi
# mount home to /mnt
if [ ! -d /mnt/home ]; then
sudo mv /home/ /mnt/
sudo ln -s /mnt/home /home
fi
# Install conda
wget https://repo.continuum.io/miniconda/Miniconda3-4.2.12-Linux-x86_64.sh -O /home/hadoop/miniconda.sh \
&& /bin/bash ~/miniconda.sh -b -p $HOME/conda
echo '\nexport PATH=$HOME/conda/bin:$PATH' >> $HOME/.bashrc && source $HOME/.bashrc
conda config --set always_yes yes --set changeps1 no
conda install conda=4.2.13
conda config -f --add channels conda-forge
conda config -f --add channels defaults
conda install hdfs3 findspark ujson jsonschema toolz boto3 py4j numpy pandas==0.19.2
# cleanup
rm ~/miniconda.sh
echo bootstrap_conda.sh completed. PATH now: $PATH
export PYSPARK_PYTHON="/home/hadoop/conda/bin/python3.5"
############### -------------- master node -------------- ###############
IS_MASTER=false
if grep isMaster /mnt/var/lib/info/instance.json | grep true;
then
IS_MASTER=true
### install dependencies for s3fs-fuse to access and store notebooks
sudo yum install -y git
sudo yum install -y libcurl libcurl-devel graphviz cyrus-sasl cyrus-sasl-devel readline readline-devel gnuplot
sudo yum install -y automake fuse fuse-devel libxml2-devel
# extract BUCKET and FOLDER to mount from NOTEBOOK_DIR
NOTEBOOK_DIR="${NOTEBOOK_DIR%/}/"
BUCKET=$(python -c "print('$NOTEBOOK_DIR'.split('//')[1].split('/')[0])")
FOLDER=$(python -c "print('/'.join('$NOTEBOOK_DIR'.split('//')[1].split('/')[1:-1]))")
echo "bucket '$BUCKET' folder '$FOLDER'"
cd /mnt
git clone https://github.com/s3fs-fuse/s3fs-fuse.git
cd s3fs-fuse/
ls -alrt
./autogen.sh
./configure
make
sudo make install
sudo su -c 'echo user_allow_other >> /etc/fuse.conf'
mkdir -p /mnt/s3fs-cache
mkdir -p /mnt/$BUCKET
/usr/local/bin/s3fs -o allow_other -o iam_role=auto -o umask=0 -o url=https://s3.amazonaws.com -o no_check_certificate -o enable_noobj_cache -o use_cache=/mnt/s3fs-cache $BUCKET /mnt/$BUCKET
### Install Jupyter Notebook with conda and configure it.
echo "installing python libs in master"
# install
conda install jupyter
# install visualization libs
conda install matplotlib plotly bokeh
# install scikit-learn stable version
conda install --channel scikit-learn-contrib scikit-learn==0.18
# jupyter configs
mkdir -p ~/.jupyter
touch ls ~/.jupyter/jupyter_notebook_config.py
HASHED_PASSWORD=$(python -c "from notebook.auth import passwd; print(passwd('$JUPYTER_PASSWORD'))")
echo "c.NotebookApp.password = u'$HASHED_PASSWORD'" >> ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.open_browser = False" >> ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.ip = '*'" >> ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.notebook_dir = '/mnt/$BUCKET/$FOLDER'" >> ~/.jupyter/jupyter_notebook_config.py
echo "c.ContentsManager.checkpoints_kwargs = {'root_dir': '.checkpoints'}" >> ~/.jupyter/jupyter_notebook_config.py
### Setup Jupyter deamon and launch it
cd ~
echo "Creating Jupyter Daemon"
sudo cat <<EOF > /home/hadoop/jupyter.conf
description "Jupyter"
start on runlevel [2345]
stop on runlevel [016]
respawn
respawn limit 0 10
chdir /mnt/$BUCKET/$FOLDER
script
sudo su - hadoop > /var/log/jupyter.log 2>&1 <<BASH_SCRIPT
export PYSPARK_DRIVER_PYTHON="/home/hadoop/conda/bin/jupyter"
export PYSPARK_DRIVER_PYTHON_OPTS="notebook --log-level=INFO"
export PYSPARK_PYTHON=/home/hadoop/conda/bin/python3.5
export JAVA_HOME="/etc/alternatives/jre"
pyspark
BASH_SCRIPT
end script
EOF
sudo mv /home/hadoop/jupyter.conf /etc/init/
sudo chown root:root /etc/init/jupyter.conf
sudo initctl reload-configuration
# start jupyter daemon
echo "Starting Jupyter Daemon"
sudo initctl start jupyter
fi
@onieio
Copy link

onieio commented Feb 21, 2019

echo '\nexport PATH=$HOME/conda/bin:$PATH' >> $HOME/.bashrc && source $HOME/.bashrc out of this below one works for me
echo -e '\nexport PATH=$HOME/conda/bin:$PATH' >> $HOME/.bashrc && source $HOME/.bashrc

@naveenkumarmarri
Copy link

@nicor88 I'm seeing service failed to start when using it in the bootstrap action. What could be the reason for it?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment