Skip to content

Instantly share code, notes, and snippets.

@Voronenko
Last active November 6, 2020 19:59
Show Gist options
  • Save Voronenko/36b8ab5d2165612092a4ae54c069b60a to your computer and use it in GitHub Desktop.
Save Voronenko/36b8ab5d2165612092a4ae54c069b60a to your computer and use it in GitHub Desktop.
Pairing session notes — local Spark 3 + Jupyter stack (docker-compose file and Dockerfile below).
Build the image first: docker build -t spark3:local .
# Local Spark 3 cluster: one master, one worker, and a Jupyter notebook
# server, all running the locally built `spark3:local` image
# (see the accompanying Dockerfile).
version: "3.7"

services:
  spark-master:
    # image: elopezdelara/miniconda3-spark:latest
    image: spark3:local
    expose:
      - "8080"
    ports:
      # Master web UI: container port 8080 published on host 9080
      # to avoid clashing with other local services.
      - "9080:8080"
    command:
      - start-master.sh
    environment:
      # Keep Spark in the foreground so the container does not exit.
      - SPARK_NO_DAEMONIZE=1
      - SPARK_PUBLIC_DNS=localhost
    networks:
      - spark
    volumes:
      # Shared event-log directory (also mounted by worker and notebook).
      - ./events:/tmp/spark-events

  spark-worker:
    depends_on:
      - spark-master
    # image: elopezdelara/miniconda3-spark:latest
    image: spark3:local
    expose:
      - "8081"
    ports:
      # Worker web UI.
      - "8081:8081"
    command:
      # Spark 3.0 script name; renamed to start-worker.sh in Spark 3.1+.
      - start-slave.sh
      - spark://spark-master:7077
    environment:
      - SPARK_NO_DAEMONIZE=1
      - SPARK_PUBLIC_DNS=localhost
    networks:
      - spark
    volumes:
      - ./events:/tmp/spark-events

  jupyter-notebook:
    depends_on:
      - spark-master
    container_name: jupyter-notebook
    image: spark3:local
    expose:
      - "4040"
      - "8888"
    ports:
      - "4040:4040"  # Spark application UI
      - "8888:8888"  # Jupyter
    networks:
      - spark
    volumes:
      - ./notebooks:/opt/notebooks
      - ./events:/tmp/spark-events
    command:
      - jupyter
      - notebook
      # NOTE(review): list-form commands bypass the shell, so quotes in
      # arguments like --ip='*' are passed to Jupyter verbatim; use the
      # unambiguous 0.0.0.0 and a bare empty token instead.
      - --ip=0.0.0.0
      - --port=8888
      - --no-browser
      - --notebook-dir=/opt/notebooks
      - --allow-root
      # Empty token = no auth; acceptable only for this local dev setup.
      - --NotebookApp.token=
# Spark 3.0.0 + conda scientific stack + Jupyter, on top of miniconda3.
FROM continuumio/miniconda3

ENV BASE_DIR=/opt
ENV NOTEBOOKS_HOME=${BASE_DIR}/notebooks
ENV SPARK_HOME=${BASE_DIR}/spark
ENV PYSPARK_DRIVER_PYTHON=ipython
ENV PATH=${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${PATH}

# Python packages.
# `conda clean -tipsy` contains the removed `-s` flag and fails on
# conda >= 4.4; `-afy` (all caches, force, yes) is the supported form.
RUN conda update -n base conda && \
    conda install numpy pandas matplotlib scikit-learn jupyter ipython pyspark=3.0.0 -y --quiet && \
    conda clean -afy && \
    mkdir -p ${NOTEBOOKS_HOME}

# openjdk's postinst script expects this directory to exist on slim
# Debian images — presumably why it was pre-created here; verify if the
# base image changes.
RUN mkdir -p /usr/share/man/man1
# Spark event-log directory (bind-mounted by docker-compose); -p keeps
# the step idempotent on rebuilds.
RUN mkdir -p /tmp/spark-events

# OpenJDK 11 plus the download tools used below.
RUN apt-get update --fix-missing && \
    apt-get install -y openjdk-11-jdk-headless procps curl wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Spark. `-f` makes curl fail on HTTP errors instead of silently piping
# an error page into tar; `-L` follows archive.apache.org redirects.
RUN curl -fsSL https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz | tar xz -C /tmp && \
    mv /tmp/spark-3.0.0-bin-hadoop2.7 ${SPARK_HOME}

# Google Cloud Storage connector.
RUN wget -q https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar -P ${SPARK_HOME}/jars

# MySQL JDBC driver. Single layer so the tarball and the unpacked
# directory are removed in the same layer they were created in —
# the original left the .tar.gz baked into the image forever.
RUN wget -q https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-5.1.45.tar.gz && \
    tar xf mysql-connector-java-5.1.45.tar.gz && \
    mv mysql-connector-java-5.1.45/mysql-connector-java-5.1.45-bin.jar ${SPARK_HOME}/jars && \
    rm -rf mysql-connector-java-5.1.45 mysql-connector-java-5.1.45.tar.gz

WORKDIR ${BASE_DIR}/work
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment