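# Dockerfile for a Jupyter scipy-notebook image with Spark, PySpark, and an Almond Scala kernel.
# Example build (image tag is illustrative), assuming setup_spark.py and
# ipython_kernel_config.py sit next to this Dockerfile:
#   docker build --build-arg spark_version=3.5.1 --build-arg scala_version=2.13 -t my-all-spark-notebook .
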
ARG REGISTRY=quay.io
ARG OWNER=jupyter
ARG BASE_CONTAINER=$REGISTRY/$OWNER/scipy-notebook
FROM $BASE_CONTAINER

# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# Spark dependencies
# Default values can be overridden at build time
# (ARGS are in lowercase to distinguish them from ENV)
ARG openjdk_version="17"

RUN apt-get update --yes && \
    apt-get install --yes --no-install-recommends \
    "openjdk-${openjdk_version}-jre-headless" \
    ca-certificates-java && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# If spark_version is not set, the latest stable Spark will be installed
ARG spark_version="3.5.1"
ARG hadoop_version="3"
# If scala_version is not set, Spark without Scala will be installed
ARG scala_version="2.13"
# URL to use for Spark downloads
# Use the https://archive.apache.org/dist/spark/ site to download old Spark versions,
# but it tends to be slower, so we default to the recommended download site
ARG spark_download_url="https://dlcdn.apache.org/spark/"

ENV SPARK_HOME=/usr/local/spark
ENV PATH="${PATH}:${SPARK_HOME}/bin"
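# Default JVM options for the Spark driver: 1 GiB initial heap, 4 GiB max heap, info-level log4j logging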
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"

COPY setup_spark.py /opt/setup-scripts/

# Setup Spark
RUN /opt/setup-scripts/setup_spark.py \
    --spark-version="${spark_version}" \
    --hadoop-version="${hadoop_version}" \
    --scala-version="${scala_version}" \
    --spark-download-url="${spark_download_url}"

# Configure IPython system-wide
COPY ipython_kernel_config.py "/etc/ipython/"
RUN fix-permissions "/etc/ipython/"
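
# Install coursier (a JVM/Scala artifact fetcher and launcher), used below to install the Almond Scala kernel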
RUN curl -Lo /bin/coursier https://git.io/coursier-cli && \
    chmod +x /bin/coursier

USER ${NB_UID}
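
# Install the Almond Scala 2.13 kernel globally under the Jupyter kernels directory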
RUN coursier launch --fork almond:0.14.0-RC14 --scala 2.13.12 \
    -- --install --id scala2 --display-name "Scala 2" \
    --jupyter-path /opt/conda/share/jupyter/kernels \
    --global

# Install pyarrow and a pinned pandas
# NOTE: It's important to ensure compatibility between Pandas versions.
# The pandas version in this Dockerfile should match the version
# on which the Pandas API for Spark is built.
# To find the right version:
# 1. Check out the Spark branch you are on: <https://github.com/apache/spark>
# 2. Find the pandas version in the file `dev/infra/Dockerfile`.
RUN mamba install --yes \
    'grpcio-status' \
    'grpcio' \
    'pandas=2.0.3' \
    'pyarrow' && \
    mamba clean --all -f -y && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

WORKDIR "${HOME}"
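
# 4040 is the default port for the Spark application web UI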
EXPOSE 4040