Last active
November 17, 2019 20:34
-
-
Save simicd/5955ffd3d75ce3ca967e59ef02067ee9 to your computer and use it in GitHub Desktop.
PySpark dockerfile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Source: https://hub.docker.com/r/jupyter/pyspark-notebook
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

# Base image; override at build time with
#   docker build --build-arg BASE_CONTAINER=<image>:<tag> .
# NOTE(review): the default carries no tag, so it resolves to whatever
# "latest" is at build time. Pin a tag or digest for reproducible builds.
ARG BASE_CONTAINER=jupyter/scipy-notebook
FROM $BASE_CONTAINER

# NOTE(review): the address was obfuscated in the scraped copy of this file;
# restored to the Jupyter project mailing list - confirm against upstream.
LABEL maintainer="Jupyter Project <jupyter@googlegroups.com>"

# The system package installation steps that follow require root.
USER root
# Spark dependencies
# Versions that select which Spark binary distribution is downloaded.
ENV APACHE_SPARK_VERSION=2.4.4 \
    HADOOP_VERSION=2.7

# Headless Java 8 runtime plus the Java CA certificate bundle. The apt package
# lists are removed in the same layer so they do not add to the image size.
RUN apt-get -y update && \
    apt-get install --no-install-recommends -y \
        ca-certificates-java \
        openjdk-8-jre-headless && \
    rm -rf /var/lib/apt/lists/*
# Download the Spark binary distribution, verify it against the pinned SHA-512
# digest, unpack it under /usr/local and create the version-independent
# symlink /usr/local/spark.
# - Fetched over HTTPS from the Apache release archive, which keeps old
#   releases available (third-party mirrors drop them over time).
# - The digest is tied to this exact Spark/Hadoop combination: update it
#   whenever APACHE_SPARK_VERSION or HADOOP_VERSION changes.
# - Absolute paths are used throughout, so no `cd` is needed.
RUN spark_dist="spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" && \
    wget -q -O "/tmp/${spark_dist}.tgz" \
        "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/${spark_dist}.tgz" && \
    echo "2E3A5C853B9F28C7D4525C0ADCB0D971B73AD47D5CCE138C85335B9F53A6519540D3923CB0B5CEE41E386E49AE8A409A51AB7194BA11A254E037A848D0C4A9E5 */tmp/${spark_dist}.tgz" | sha512sum -c - && \
    tar xzf "/tmp/${spark_dist}.tgz" -C /usr/local --owner root --group root --no-same-owner && \
    rm "/tmp/${spark_dist}.tgz" && \
    ln -s "${spark_dist}" /usr/local/spark
# Mesos dependencies (disabled: the install steps below are commented out)
# Install from the Xenial Mesosphere repository since there does not (yet)
# exist a Bionic repository and the dependencies seem to be compatible for now.
# COPY mesos.key /tmp/
# RUN apt-get -y update && \
#     apt-get install --no-install-recommends -y gnupg && \
#     apt-key add /tmp/mesos.key && \
#     echo "deb http://repos.mesosphere.io/ubuntu xenial main" > /etc/apt/sources.list.d/mesosphere.list && \
#     apt-get -y update && \
#     apt-get --no-install-recommends -y install mesos=1.2\* && \
#     apt-get purge --auto-remove -y gnupg && \
#     rm -rf /var/lib/apt/lists/*
# Spark and Mesos config
# SPARK_HOME is set in its own instruction: variables defined inside an ENV
# instruction are not visible to other values of that same instruction, and
# PYTHONPATH below expands $SPARK_HOME.
# NOTE(review): JAVA_HOME hard-codes the amd64 JVM directory.
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ \
    SPARK_HOME=/usr/local/spark

# NOTE(review): the py4j zip name is version-specific; presumably it has to
# match the file shipped under $SPARK_HOME/python/lib - verify when bumping
# the Spark version.
# NOTE(review): MESOS_NATIVE_LIBRARY is kept for compatibility although this
# file does not install Mesos.
ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip \
    MESOS_NATIVE_LIBRARY=/usr/local/lib/libmesos.so \
    SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
### Set the working directory to /powershell
WORKDIR /powershell

# Register the Microsoft package repository and install PowerShell.
# See also: https://docs.microsoft.com/en-us/powershell/scripting/install/installing-powershell-core-on-linux?view=powershell-6
# - No `sudo`: this step is expected to run as root already.
# - The Acquire::Check-* options disable apt's validity-date checks on the
#   repository metadata (presumably a workaround for build-host clock skew -
#   confirm whether it is still needed).
# - The package index is refreshed after each repository change, and the
#   downloaded .deb and the apt lists are removed in this same layer.
RUN wget -q https://packages.microsoft.com/config/ubuntu/18.04/packages-microsoft-prod.deb && \
    dpkg -i packages-microsoft-prod.deb && \
    rm packages-microsoft-prod.deb && \
    apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update && \
    apt-get install --no-install-recommends -y software-properties-common && \
    add-apt-repository universe && \
    apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update && \
    apt-get install --no-install-recommends -y powershell && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Switch from root/superuser to the normal notebook user and change to that
# user's home directory. NB_USER and NB_UID are not defined in this file; they
# are expected to be provided by the base image.
WORKDIR /home/${NB_USER}
USER ${NB_UID}
# Install and activate Jupyter extensions.
# - Both packages are installed with a single pip call and no pip cache, so
#   downloaded wheels are not left behind in the layer.
# - Each extension is enabled for the current user (--user); the loop stops
#   the build at the first extension that fails to enable.
# - fix-permissions is not defined in this file; it is expected to be a helper
#   provided by the base image.
# NOTE(review): package versions are unpinned - pin them for reproducible builds.
RUN pip install --no-cache-dir jupyter_contrib_nbextensions RISE && \
    jupyter contrib nbextension install --user && \
    jupyter nbextensions_configurator enable --user && \
    jupyter-nbextension enable rise --py --user && \
    for ext in \
        collapsible_headings/main \
        scratchpad/main \
        hide_input_all/main \
        hide_input/main \
        comment-uncomment/main \
        code_prettify/code_prettify \
        toc2/main \
    ; do \
        jupyter-nbextension enable "${ext}" --user || exit 1; \
    done && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"
# Install additional Python packages.
# One pip invocation resolves all requirements together, and --no-cache-dir
# keeps pip's download cache out of the layer.
# NOTE(review): only pyarrow is pinned; pin the others for reproducible builds.
RUN pip install --no-cache-dir \
        koalas \
        pyarrow==0.12.1 \
        pytest-cov \
        requests-kerberos \
        sphinx \
        streamlit
# Add custom styling for Jupyter notebook: create the per-user custom folder
# and download custom.css into it from a GitHub gist. The URL references a
# fixed gist revision, and the file is written to an absolute path so no
# `cd` is required.
RUN mkdir -p "/home/${NB_USER}/.jupyter/custom" && \
    wget -q -O "/home/${NB_USER}/.jupyter/custom/custom.css" \
        https://gist.githubusercontent.com/simicd/20d40cae61305aecdf372c4ac0cffc4a/raw/082096fe90fd58095f84aab9030fa8a643713ec3/custom.css
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment