AWS EMR bootstraps to install Jupyter (R, SparkR, Python 2, Python 3, PySpark)

Use these bootstraps if you want to run Jupyter notebooks at scale using Spark, or if you just want a Jupyter installation on Amazon EMR. The default bootstrap installs the following kernels:

  • Python 2
  • PySpark (Python 2)
  • Python 3
  • PySpark (Python 3)

If you want R and SparkR, you'll have to use the second bootstrap as well.
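
A bootstrap is attached at cluster creation time. Below is a minimal sketch of launching a cluster with the main bootstrap via the AWS CLI; the bucket, script name, and release label are placeholders you should adjust (the release label roughly matches the Spark 2.3.0 default used by the SparkR bootstrap):

  aws emr create-cluster \
      --name "Jupyter cluster" \
      --release-label emr-5.12.0 \
      --applications Name=Spark \
      --instance-type m4.xlarge \
      --instance-count 3 \
      --use-default-roles \
      --bootstrap-actions Path=s3://your-bucket/install-jupyter.sh,Name=InstallJupyter,Args=["--notebook-dir","/mnt/my-notebooks/"]

To get R and SparkR as well, list the second script as an additional entry under --bootstrap-actions.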

Warning

This bootstrap starts a Jupyter server without any kind of authentication. Do not use it to create EMR clusters outside a secure VPC.
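
Even inside a VPC, prefer reaching the server through an SSH tunnel to the master node rather than exposing the port directly. A sketch, where the key file and master DNS are placeholders (hadoop is the default EMR user and 8888 the default Jupyter port):

  ssh -i ~/my-key.pem -N -L 8888:localhost:8888 hadoop@ec2-XX-XX-XX-XX.compute-1.amazonaws.com

Then browse to http://localhost:8888 on your own machine.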

Other interesting material

Take a look at my other related gists:

#!/bin/bash
# Bootstrap 1: installs Miniconda, Jupyter, and the Python 2 / Python 3 / PySpark kernels.
MINICONDA_VERSION="4.3.21"
PANDAS_VERSION="0.20.3"
SCIKIT_VERSION="0.19.0"
while [[ $# -gt 1 ]]; do
key="$1"
case $key in
# Where the notebooks are located (a local path, possibly an S3-backed mount).
# This path needs to exist.
# Eg: --notebook-dir /mnt/my-notebooks/
--notebook-dir)
NOTEBOOK_DIR="$2"
shift
;;
--miniconda-version)
MINICONDA_VERSION="$2"
shift
;;
--pandas-version)
PANDAS_VERSION="$2"
shift
;;
--scikit-version)
SCIKIT_VERSION="$2"
shift
;;
*)
echo "Unknown option: ${key}"
exit 1
;;
esac
shift
done
# Install conda and friends
wget https://repo.continuum.io/miniconda/Miniconda3-$MINICONDA_VERSION-Linux-x86_64.sh -O /mnt/miniconda.sh
/bin/bash /mnt/miniconda.sh -b -p /mnt/conda
rm /mnt/miniconda.sh
echo -e '\nexport PATH=/mnt/conda/bin:$PATH' >> $HOME/.bashrc && source $HOME/.bashrc
conda config --set always_yes yes --set changeps1 no
conda config -f --add channels conda-forge
conda config -f --add channels defaults
conda install hdfs3 findspark ujson jsonschema toolz boto3 py4j numpy pandas==$PANDAS_VERSION conda=$MINICONDA_VERSION jupyterlab scikit-learn==$SCIKIT_VERSION
conda install matplotlib plotly bokeh seaborn ipywidgets ipyvolume jupyter_contrib_nbextensions
conda install libgcc opencv fastparquet h5py
# The rest of the setup only runs on the master node.
if grep -Fq "\"isMaster\": true" /mnt/var/lib/info/instance.json
then
# Deep learning libraries (both the CPU and the GPU builds are installed).
pip install --upgrade --ignore-installed mxnet
pip install --upgrade --ignore-installed mxnet-cu91
pip install --upgrade --ignore-installed tensorflow==1.5.0
pip install --upgrade --ignore-installed tensorflow-gpu==1.5.0
pip install --upgrade --ignore-installed keras
# Jupyter configuration. These settings override the defaults.
mkdir -p ~/.jupyter
touch ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.token = ''" >> ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.open_browser = False" >> ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.ip = '*'" >> ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.notebook_dir = '$NOTEBOOK_DIR'" >> ~/.jupyter/jupyter_notebook_config.py
# This will not work properly and there is no documentation for it
# echo "c.ContentsManager.checkpoints_kwargs = {'root_dir': '.checkpoints'}" >> ~/.jupyter/jupyter_notebook_config.py
# PySpark for Python 3
mkdir -p /mnt/conda/share/jupyter/kernels/python3-pyspark
cat <<EOF > /mnt/conda/share/jupyter/kernels/python3-pyspark/kernel.json
{
"argv": [
"/mnt/conda/bin/python3.6",
"-m",
"ipykernel_launcher",
"-f",
"{connection_file}"
],
"display_name": "Python 3 - PySpark",
"language": "python",
"env": {
"SPARK_HOME": "/usr/lib/spark",
"PYSPARK_PYTHON": "/mnt/conda/bin/python3.6",
"PYTHONPATH": "/usr/lib/spark/python:/usr/lib/spark/python/lib/py4j-0.10.4-src.zip",
"PYTHONSTARTUP": "/usr/lib/spark/python/pyspark/shell.py",
"PYSPARK_SUBMIT_ARGS": "--master yarn-client pyspark-shell"
}
}
EOF
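# (Optional sanity check, not part of the original bootstrap: confirm that
# Jupyter picked up the kernel spec written above.)
# /mnt/conda/bin/jupyter kernelspec list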
# Install the kernel for Python 2
/mnt/conda/bin/conda create -n ipykernel_py2 python=2 ipykernel
source activate ipykernel_py2
python -m ipykernel install --user
# Install the kernel for PySpark with Python 2
mkdir -p /mnt/conda/share/jupyter/kernels/python2-pyspark
cat <<EOF > /mnt/conda/share/jupyter/kernels/python2-pyspark/kernel.json
{
"argv": [
"/mnt/conda/envs/ipykernel_py2/bin/python",
"-m",
"ipykernel_launcher",
"-f",
"{connection_file}"
],
"display_name": "Python 2 - PySpark",
"language": "python",
"env": {
"SPARK_HOME": "/usr/lib/spark",
"PYSPARK_PYTHON": "/mnt/conda/envs/ipykernel_py2/bin/python",
"PYTHONPATH": "/usr/lib/spark/python:/usr/lib/spark/python/lib/py4j-0.10.4-src.zip",
"PYTHONSTARTUP": "/usr/lib/spark/python/pyspark/shell.py",
"PYSPARK_SUBMIT_ARGS": "--master yarn-client pyspark-shell"
}
}
EOF
# Set up the Jupyter daemon as an Upstart job. The job file is written to the
# home directory first and moved into /etc/init below.
cat <<EOF > ~/jupyter.conf
description "Jupyter"
author "Cosmin Catalin Sanda"
start on runlevel [2345]
stop on runlevel [016]
respawn
respawn limit 0 10
console output
chdir $NOTEBOOK_DIR
exec start-stop-daemon -v --start -c hadoop --exec /mnt/conda/bin/jupyter notebook
EOF
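# (Note, not in the original gist: the job file above targets Upstart, the init
# system on the Amazon Linux 1 based EMR AMIs this bootstrap was written for.
# On a systemd-based image you would need a systemd unit instead.)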
jupyter nbextension enable execute_time/ExecuteTime
jupyter nbextension enable freeze/main
jupyter nbextension enable hide_input/main
jupyter nbextension enable table_beautifier/main
jupyter nbextension enable spellchecker/main
jupyter nbextension enable python-markdown/main
jupyter nbextension enable scratchpad/main
jupyter nbextension enable varInspector/main
sudo mv ~/jupyter.conf /etc/init/
sudo chown root:root /etc/init/jupyter.conf
sudo initctl reload-configuration
# Start Jupyter daemon
sudo initctl start jupyter
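# (Optional checks, not part of the original bootstrap: confirm the daemon is
# running and that the server answers on Jupyter's default port 8888.)
# sudo initctl status jupyter
# curl -s http://localhost:8888 > /dev/null && echo "Jupyter is up"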
fi
#!/bin/bash
# Bootstrap 2: installs R, the IRkernel, and SparkR (run in addition to the main bootstrap).
SPARK="2.3.0"
while [[ $# -gt 1 ]]; do
key="$1"
case $key in
# The version of Spark to install SparkR for.
# It should match the Spark version on the cluster.
# Eg: --spark 2.3.0
--spark)
SPARK="$2"
shift
;;
*)
echo "Unknown option: ${key}"
exit 1
;;
esac
shift
done
source ~/.bashrc
# This is the user lib
mkdir -p ~/R/library
sudo yum install -y libcurl-devel openssl-devel openssl libssh2-devel
sudo R -e "install.packages('devtools', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "install.packages('git2r', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "install.packages('repr', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "install.packages('IRdisplay', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "install.packages('crayon', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "install.packages('pbdZMQ', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "install.packages('tidyverse', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "devtools::install_github('apache/spark@v${SPARK}', subdir='R/pkg')" 1>&2
sudo R -e "devtools::install_github('IRkernel/IRkernel')" 1>&2
# The kernel is only registered on the master node.
if grep -Fq "\"isMaster\": true" /mnt/var/lib/info/instance.json
then
# The stock IRkernel spec is not needed; a custom SparkR spec is written below instead.
# R -e "IRkernel::installspec()" 1>&2
mkdir -p /mnt/conda/share/jupyter/kernels/r-sparkr
cat <<EOF > /mnt/conda/share/jupyter/kernels/r-sparkr/kernel.json
{
"argv": ["/usr/lib64/R/bin/R", "--slave", "-e", "IRkernel::main()", "--args", "{connection_file}"],
"display_name":"R - SparkR",
"language":"R",
"env": {
"SPARK_HOME": "/usr/lib/spark",
"R_LIBS_USER": "~/R/library"
}
}
EOF
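# (Usage note, not part of the original bootstrap: in a notebook running the
# "R - SparkR" kernel, start Spark with the SparkR API, e.g.
#   sparkR.session(master = "yarn")
# SPARK_HOME is already provided by the kernel spec above.)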
fi