Base image spark-runner Dockerfile
# Information
# Main Spark Docker file code: https://github.com/apache/spark/blob/master/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile
ARG SPARK_IMAGE=gcr.io/spark-operator/spark:v2.4.4
FROM ${SPARK_IMAGE}

ARG VERSION
ARG VCS_REF
ARG BUILD_DATE

ENV SCALA_VERSION 2.12
ENV SPARK_VERSION 2.4.4
ENV HADOOP_VERSION 3.2.1
ENV KUBERNETES_CLIENT_VERSION 4.6.4
ENV REPO_URL http://central.maven.org/maven2
ENV ARCHIVE_URL http://archive.apache.org/dist
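# NOTE: central.maven.org has since been retired; if the ADD steps below stop
# resolving, repo1.maven.org/maven2 is the current Maven Central endpoint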

# Adhere to opencontainers image spec https://github.com/opencontainers/image-spec/blob/master/annotations.md
LABEL org.opencontainers.image.title="graphiq-spark-runner" \
    org.opencontainers.image.created=${BUILD_DATE} \
    org.opencontainers.image.description="base image for Scala Spark jobs that need to run on kubernetes via the spark-operator" \
    org.opencontainers.image.source="https://github.com/TomLous/medium-spark-k8s" \
    org.opencontainers.image.documentation="https://github.com/TomLous/medium-spark-k8s/blob/master/README.md" \
    org.opencontainers.image.revision=${VCS_REF} \
    org.opencontainers.image.vendor="GraphIQ" \
    org.opencontainers.image.version=${VERSION} \
    org.opencontainers.image.authors="Tom Lous <[email protected]>" \
    version.scala=${SCALA_VERSION} \
    version.spark=${SPARK_VERSION} \
    version.hadoop=${HADOOP_VERSION} \
    version.kubernetes-client=${KUBERNETES_CLIENT_VERSION}

# Install Tools
RUN apk --no-cache add curl

# Hadoop Config
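# Fetch the Hadoop tarball into /opt and symlink it to the version-agnostic
# ${HADOOP_HOME}; docs and *-sources.jar files are pruned to keep the image small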
ENV HADOOP_HOME "/opt/hadoop"
RUN rm -rf ${HADOOP_HOME}/ \
    && cd /opt \
    && curl -sL --retry 3 "${ARCHIVE_URL}/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" | tar xz \
    && chown -R root:root hadoop-${HADOOP_VERSION} \
    && ln -sfn hadoop-${HADOOP_VERSION} hadoop \
    && rm -rf ${HADOOP_HOME}/share/doc \
    && find /opt/ -name '*-sources.jar' -delete
ENV PATH="${HADOOP_HOME}/bin:${PATH}" | |
ENV HADOOP_CONF_DIR "${HADOOP_HOME}/etc/hadoop" | |
# Spark Config
# The conf/ folder gets mounted over by the spark-operator, so we move spark-env.sh
# to another folder and source it from entrypoint.sh. There is no good way to merge
# the original conf folder with the volumeMount.
RUN rm -rf ${SPARK_HOME}/ \
    && cd /opt \
    && curl -sL --retry 3 "${ARCHIVE_URL}/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop-scala-${SCALA_VERSION}.tgz" | tar xz \
    && mv spark-${SPARK_VERSION}-bin-without-hadoop-scala-${SCALA_VERSION} spark-${SPARK_VERSION} \
    && chown -R root:root spark-${SPARK_VERSION} \
    && ln -sfn spark-${SPARK_VERSION} spark \
    && mkdir -p ${SPARK_HOME}/conf-org/ \
    && mv ${SPARK_HOME}/conf/spark-env.sh.template ${SPARK_HOME}/conf-org/spark-env.sh \
    && rm -rf ${SPARK_HOME}/examples ${SPARK_HOME}/data ${SPARK_HOME}/tests ${SPARK_HOME}/conf \
    && echo 'export SPARK_DIST_CLASSPATH=$(hadoop classpath)' >> ${SPARK_HOME}/conf-org/spark-env.sh \
    && echo 'export SPARK_EXTRA_CLASSPATH=$(hadoop classpath)' >> ${SPARK_HOME}/conf-org/spark-env.sh
ENV PATH="${SPARK_HOME}/bin:${PATH}" | |
# Kubernetes Client <= TODO Remove when this issue is fixed https://andygrove.io/2019/08/apache-spark-regressions-eks/ | |
RUN rm -rf ${SPARK_HOME}/jars/kubernetes-*.jar | |
ADD ${REPO_URL}/io/fabric8/kubernetes-client/${KUBERNETES_CLIENT_VERSION}/kubernetes-client-${KUBERNETES_CLIENT_VERSION}.jar ${SPARK_HOME}/jars | |
ADD ${REPO_URL}/io/fabric8/kubernetes-model/${KUBERNETES_CLIENT_VERSION}/kubernetes-model-${KUBERNETES_CLIENT_VERSION}.jar ${SPARK_HOME}/jars | |
ADD ${REPO_URL}/io/fabric8/kubernetes-model-common/${KUBERNETES_CLIENT_VERSION}/kubernetes-model-common-${KUBERNETES_CLIENT_VERSION}.jar ${SPARK_HOME}/jars | |
# Edit entrypoint to source spark-env.sh before running spark-submit | |
RUN sed -i '30i #CUSTOM\n' /opt/entrypoint.sh \ | |
&& sed -i '/#CUSTOM/a source ${SPARK_HOME}/conf-org/spark-env.sh\n' /opt/entrypoint.sh | |
ENTRYPOINT ["/opt/entrypoint.sh"]
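A minimal sketch of a build invocation, assuming the Dockerfile above sits in the current directory; the tag and VERSION value are illustrative:

docker build \
  --build-arg VERSION=1.0.0 \
  --build-arg VCS_REF=$(git rev-parse --short HEAD) \
  --build-arg BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
  -t graphiq-spark-runner:1.0.0 .

The three build args feed the OCI labels at the top of the file; BUILD_DATE uses the RFC 3339 date-time format the opencontainers annotation spec expects for org.opencontainers.image.created.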