Compile and run Spark JobServer for Amazon EMR
Dockerfile
FROM ubuntu:16.04

# Config
ARG SBT_VERSION=0.13.12
ARG SJS_VERSION=0.8.0
ARG EMR_VERSION=5.13.0
ARG SPARK_VERSION=2.3.0

# Install JDK 8 and some dependencies
RUN apt-get update -qq && apt-get install -y -qq openjdk-8-jdk wget python python-pip
RUN pip install py4j pyhocon pep8 pyspark

# Ensure the default Java max memory is greater than ~600MB, otherwise
# the Python SubprocessSpec.scala tests won't run
RUN if [ $(java -XX:+PrintFlagsFinal -version 2> /dev/null | grep MaxHeapSize | awk '{ print $4 }') -lt 629145600 ]; then \
        echo "Please run Docker with more memory (e.g. on OSX you can increase it from Docker Preferences)"; \
        exit 1; \
    fi

# Install sbt (dependency)
RUN wget https://dl.bintray.com/sbt/debian/sbt-${SBT_VERSION}.deb && \
    dpkg -i sbt-${SBT_VERSION}.deb && \
    rm -f sbt-${SBT_VERSION}.deb

# Define the SPARK_VERSION to compile against, read in project/Versions.scala
ENV SPARK_VERSION="${SPARK_VERSION}"

# Download Spark JobServer sources
RUN wget https://github.com/spark-jobserver/spark-jobserver/archive/v${SJS_VERSION}.tar.gz
RUN tar -zxf v${SJS_VERSION}.tar.gz

# Cherry-pick PR #1008 to fix the context info route when "context-per-jvm = true"
RUN cd spark-jobserver-${SJS_VERSION} && \
    wget https://github.com/spark-jobserver/spark-jobserver/pull/1008.diff && \
    patch -p1 < 1008.diff

# Compile Spark JobServer from sources. We run assembly in a dedicated
# step because it's likely to fail, yet we don't want to lose the benefits
# of Docker layer caching, given "sbt package" is very slow
RUN cd spark-jobserver-${SJS_VERSION} && sbt clean update package
RUN cd spark-jobserver-${SJS_VERSION} && sbt assembly

# Install config files
ADD config/emr-${EMR_VERSION}.conf spark-jobserver-${SJS_VERSION}/config/emr-${EMR_VERSION}.conf
ADD config/emr-${EMR_VERSION}.sh spark-jobserver-${SJS_VERSION}/config/emr-${EMR_VERSION}.sh
ADD config/shiro.ini spark-jobserver-${SJS_VERSION}/config/shiro.ini

# Package
RUN cd spark-jobserver-${SJS_VERSION} && ./bin/server_package.sh emr-${EMR_VERSION}

# Rename the package, adding the EMR version
RUN mv /tmp/job-server/job-server.tar.gz /tmp/job-server/spark-job-server-${SJS_VERSION}-emr-${EMR_VERSION}.tar.gz

# Copy the tests jar into the job-server package output dir so that it's easier to pick up
RUN cp spark-jobserver-${SJS_VERSION}/job-server-tests/target/scala-2.11/job-server-tests_2.11-${SJS_VERSION}.jar /tmp/job-server/spark-job-server-tests-${SJS_VERSION}.jar
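The Dockerfile expects the two EMR files below and shiro.ini to sit in a config/ directory next to it, matching the ADD instructions above. A minimal build-and-extract sketch follows; the sjs-emr-build image tag and the dist/ output directory are arbitrary names chosen for this example, not part of the gist:

# Build the image: compiling and packaging happen entirely at build time
docker build -t sjs-emr-build .
# Copy the packaged tarball and tests jar out of the image into ./dist
mkdir -p dist
docker run --rm -v "$PWD/dist:/dist" sjs-emr-build sh -c 'cp /tmp/job-server/* /dist/'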
config/emr-5.13.0.conf
spark {
  # spark.master will be passed to each job's JobContext
  master = "yarn-client"

  # This needs to match SPARK_HOME for cluster SparkContexts to be created successfully
  home = "/usr/lib/spark"

  jobserver {
    port = 8090

    # Required when master = "yarn-client"
    context-per-jvm = true

    # Note: JobFileDAO is deprecated from v0.7.0 because of issues in
    # production and will be removed in future, now defaults to H2 file.
    jobdao = spark.jobserver.io.JobSqlDAO

    filedao {
      rootdir = /mnt/tmp/spark-jobserver/filedao/data
    }

    sqldao {
      # Slick database driver, full classpath
      slick-driver = slick.driver.H2Driver

      # JDBC driver, full classpath
      jdbc-driver = org.h2.Driver

      # Directory where default H2 driver stores its data. Only needed for H2.
      rootdir = /mnt/tmp/spark-jobserver/sqldao/data

      # Full JDBC URL / init string, along with username and password. Sorry, needs to match above.
      # Substitutions may be used to launch job-server, but leave it out here in the default or tests won't pass
      jdbc {
        url = "jdbc:h2:file:/mnt/tmp/spark-jobserver/sqldao/data/h2-db"
        user = ""
        password = ""
      }

      # DB connection pool settings
      dbcp {
        enabled = false
      }
    }
  }

  # Predefined Spark contexts
  contexts {}

  # Default context settings (if not overridden by the specific context)
  context-settings {
    num-cpu-cores = 1
    memory-per-node = 1G
    spark.executor.instances = 2

    # If you wish to pass any settings directly to the sparkConf as-is, add them here in passthrough,
    # such as hadoop connection settings that don't use the "spark." prefix
    passthrough {
      #es.nodes = "192.1.1.1"
    }
  }
}
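The context-settings block only provides defaults: individual values can be overridden per context when creating it through the JobServer REST API on port 8090. A hedged example against a running server (the context name and resource values are illustrative):

# Create a context named "test-context" with more resources than the defaults
curl -d "" "http://localhost:8090/contexts/test-context?num-cpu-cores=2&memory-per-node=2G"
# List the currently running contexts
curl "http://localhost:8090/contexts"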
config/emr-5.13.0.sh
#
# Default values
#
appdir=${appdir:-}
conffile=${conffile:-}

if [ -z "$appdir" ]; then
  # Set to the directory where this script is located
  appdir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
fi

if [ -z "$conffile" ]; then
  # Set the config file to be located in the same directory as this
  # script, with the same name, but ending with .conf instead of .sh
  conffile="$appdir/emr-5.13.0.conf"
fi

#
# Customizations
#
APP_USER=hadoop
APP_GROUP=hadoop
INSTALL_DIR=/mnt/lib/spark-jobserver
LOG_DIR=/mnt/var/log/spark-jobserver
PIDFILE=spark-jobserver.pid
JOBSERVER_MEMORY=1G
SPARK_VERSION=2.3.0
SPARK_HOME=/usr/lib/spark
SPARK_CONF_DIR=/etc/spark/conf
HADOOP_CONF_DIR=/etc/hadoop/conf
YARN_CONF_DIR=/etc/hadoop/conf
SCALA_VERSION=2.11.8
MANAGER_JAR_FILE="$appdir/spark-job-server.jar"
MANAGER_CONF_FILE="$(basename $conffile)"
MANAGER_EXTRA_JAVA_OPTIONS=
MANAGER_EXTRA_SPARK_CONFS="spark.yarn.submit.waitAppCompletion=false|spark.files=$appdir/log4jcluster.properties,$conffile"
MANAGER_LOGGING_OPTS="-Dlog4j.configuration=log4j-cluster.properties"
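These variables are consumed by the Spark JobServer deploy/start scripts bundled in the package. A rough deployment sketch on the EMR master node, assuming the tarball produced by the Dockerfile has already been copied there and that it unpacks flat (the paths and user follow INSTALL_DIR, LOG_DIR and APP_USER above; adjust if your layout differs):

# Run on the EMR master node
sudo mkdir -p /mnt/lib/spark-jobserver /mnt/var/log/spark-jobserver
sudo chown hadoop:hadoop /mnt/lib/spark-jobserver /mnt/var/log/spark-jobserver
tar -zxf spark-job-server-0.8.0-emr-5.13.0.tar.gz -C /mnt/lib/spark-jobserver
cd /mnt/lib/spark-jobserver && sudo -u hadoop ./server_start.sh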
config/shiro.ini
# Shiro is used for authentication. Authentication is disabled in our
# cluster, so we don't need it, but the file needs to exist in order
# for server_package.sh to run successfully.