A GoCD agent image with EMR goodies baked in: it pulls the Hadoop, Hive, Spark, and Sqoop binaries and configs off an EMR master so the agent can connect to the cluster and execute Spark jobs.
build_gocd_agent_with_emr_support.sh:
#!/usr/bin/env bash
set -e

usage() {
  cat <<USAGE
usage:
  build_gocd_agent_with_emr_support.sh --ssh_key <SSH key used to log into EMR master> --cluster_ip <IP of the EMR master>
USAGE
}

if [ $# -eq 0 ]; then
  usage
  exit 1
fi

ssh_key=""
cluster_ip=""
git_revision=$(git rev-parse HEAD)

# parse the arguments
while : ; do
  case "$1" in
    --ssh_key)
      [ -n "${ssh_key}" ] && { usage; exit 1; }
      ssh_key="$2"
      shift 2 ;;
    --cluster_ip)
      [ -n "${cluster_ip}" ] && { usage; exit 1; }
      cluster_ip="$2"
      shift 2 ;;
    *)
      break ;;
  esac
done

if [ -z "${ssh_key}" ] || [ -z "${cluster_ip}" ]; then
  usage
  exit 1
fi

# remove the downloaded tarball on exit, even on failure
cleanup() {
  find . -name "hadoop-binaries-config*" -delete
}
trap cleanup EXIT

# bundle the Hadoop/Hive/Spark/Sqoop binaries and configs on the EMR master
ssh -i "${ssh_key}" "ec2-user@${cluster_ip}" <<'EOF'
rm -rf /tmp/hadoop-binaries-configs
echo "creating dirs..."
mkdir -p /tmp/hadoop-binaries-configs/configs/{hadoop,hive,spark,sqoop}
echo "copying configs now"
cp -rL /etc/hadoop/conf /tmp/hadoop-binaries-configs/configs/hadoop
cp -rL /etc/hive/conf /tmp/hadoop-binaries-configs/configs/hive
cp -rL /etc/spark/conf /tmp/hadoop-binaries-configs/configs/spark
cp -rL /etc/sqoop/conf /tmp/hadoop-binaries-configs/configs/sqoop
cp -r /usr/lib/hadoop /tmp/hadoop-binaries-configs
cp -r /usr/lib/hadoop-mapreduce /tmp/hadoop-binaries-configs
cp -r /usr/lib/hadoop-lzo /tmp/hadoop-binaries-configs
cp -r /usr/lib/spark /tmp/hadoop-binaries-configs
cp -r /usr/lib/sqoop /tmp/hadoop-binaries-configs
cp -r /usr/lib/hadoop-hdfs /tmp/hadoop-binaries-configs
cp -r /usr/lib/hadoop-yarn /tmp/hadoop-binaries-configs
cp -r /usr/share/aws /tmp/hadoop-binaries-configs
cp -r /usr/share/java /tmp/hadoop-binaries-configs
cd /tmp
rm -rf hadoop-binaries-configs.tar.gz
echo "creating archive..."
tar -zcf hadoop-binaries-configs.tar.gz hadoop-binaries-configs
EOF

# fetch the archive and bake it into the agent image
repo_prefix="<REPLACE_WITH_YOUR_DOCKER_REPOSITORY_PREFIX>"
scp -i "${ssh_key}" "ec2-user@${cluster_ip}:/tmp/hadoop-binaries-configs.tar.gz" .
docker build -t "${repo_prefix}-${cluster_ip}:latest" .
docker tag "${repo_prefix}-${cluster_ip}:latest" "${repo_prefix}-${cluster_ip}:${git_revision}"
# log in to ECR (aws CLI v1 style login)
$(aws ecr get-login --no-include-email --region eu-west-1)
docker push "${repo_prefix}-${cluster_ip}:${git_revision}"
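For reference, a typical invocation (with a hypothetical key path and master IP, substitute your own) looks like:

./build_gocd_agent_with_emr_support.sh \
    --ssh_key ~/.ssh/emr-master.pem \
    --cluster_ip 10.0.12.34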
Dockerfile:
ARG GOCD_VERSION=19.5.0
FROM gocd/gocd-agent-centos-7:v$GOCD_VERSION

# install java and friends
RUN yum -y update && \
    yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \
    yum install -y java-1.8.0-openjdk-devel xmlstarlet which

# setup EMR shenanigans
RUN mkdir /tmp/emr-hadoop-downloads
COPY hadoop-binaries-configs.tar.gz /tmp/emr-hadoop-downloads

### Copy hadoop, hive, and spark configurations
### Copy EMR jars to the right locations
RUN tar xzf /tmp/emr-hadoop-downloads/hadoop-binaries-configs.tar.gz -C /tmp/emr-hadoop-downloads/ && \
    cp -r /tmp/emr-hadoop-downloads/hadoop-binaries-configs/configs/hadoop /etc/hadoop && \
    cp -r /tmp/emr-hadoop-downloads/hadoop-binaries-configs/configs/hive /etc/hive && \
    cp -r /tmp/emr-hadoop-downloads/hadoop-binaries-configs/configs/spark /etc/spark && \
    cp -r /tmp/emr-hadoop-downloads/hadoop-binaries-configs/configs/sqoop /etc/sqoop && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/aws /usr/share/aws && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/hadoop /usr/lib/hadoop && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/hadoop-hdfs /usr/lib/hadoop-hdfs && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/hadoop-mapreduce /usr/lib/hadoop-mapreduce && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/hadoop-yarn /usr/lib/hadoop-yarn && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/hadoop-lzo /usr/lib/hadoop-lzo && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/spark /usr/lib/spark && \
    mv /tmp/emr-hadoop-downloads/hadoop-binaries-configs/sqoop /usr/lib/sqoop && \
    cp -R /tmp/emr-hadoop-downloads/hadoop-binaries-configs/java/* /usr/share/java/ && \
    rm -rf /tmp/emr-hadoop-downloads

### Update SPARK and HADOOP environment variables.
ENV JAVA_HOME=/usr/lib/jvm/java-1.8.0/
ENV HADOOP_HOME=/usr/lib/hadoop
ENV HADOOP_HDFS_HOME=/usr/lib/hadoop-hdfs
ENV HADOOP_MAPREDUCE_HOME=/usr/lib/hadoop-mapreduce
ENV HADOOP_YARN_HOME=/usr/lib/hadoop-yarn
ENV HADOOP_LIBEXEC_DIR=/usr/lib/hadoop/libexec
ENV SPARK_HOME=/usr/lib/spark
ENV SQOOP_HOME=/usr/lib/sqoop
ENV PATH=${PATH:-}:${SPARK_HOME:-}/bin:${HADOOP_HOME:-}/bin:${SQOOP_HOME:-}/bin:/usr/lib/hadoop-hdfs/bin
# ENV PYTHONPATH=${PYTHONPATH:-}:${SPARK_HOME:-}/python/
# ENV PYTHONPATH=${PYTHONPATH:-}:${SPARK_HOME:-}/python/lib/py4j-0.10.7-src.zip
ENV HADOOP_CONF_DIR=/etc/hadoop/conf
ENV YARN_CONF_DIR=/etc/hadoop/conf
ENV SPARK_CONF_DIR=/etc/spark/conf
ENV HADOOP_USER_NAME=hdfs

#### Temp directories for hadoop
RUN mkdir /mnt1 /mnt2 /mnt3 && \
    chown go:root -R /mnt /mnt1 /mnt2 /mnt3 && \
    mkdir /var/log/spark && \
    chown go:root -R /var/log/spark

### Update mapred-site.xml with the DynamoDB endpoint and credential placeholders
RUN xmlstarlet edit -L --omit-decl \
    -s '//configuration' -t elem -n "property" \
    -s "//configuration/property[last()]" -t elem -n "name" -v "fs.s3.consistent.dynamodb.endpoint" \
    -s "//configuration/property[last()]" -t elem -n "value" -v "dynamodb.eu-west-1.amazonaws.com" \
    -s '//configuration' -t elem -n "property" \
    -s "//configuration/property[last()]" -t elem -n "name" -v "fs.s3.awsAccessKeyId" \
    -s "//configuration/property[last()]" -t elem -n "value" -v "_ACCESS_KEY_ID_" \
    -s '//configuration' -t elem -n "property" \
    -s "//configuration/property[last()]" -t elem -n "name" -v "fs.s3.awsSecretAccessKey" \
    -s "//configuration/property[last()]" -t elem -n "value" -v "_SECRET_KEY_" \
    $HADOOP_CONF_DIR/mapred-site.xml
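### For reference, each -s triple above appends a block of this shape to
### mapred-site.xml (shown here for the DynamoDB endpoint property):
###   <property>
###     <name>fs.s3.consistent.dynamodb.endpoint</name>
###     <value>dynamodb.eu-west-1.amazonaws.com</value>
###   </property>
### _ACCESS_KEY_ID_ and _SECRET_KEY_ remain placeholders until the
### entrypoint substitutes them at container start.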
### set some default spark config variables
RUN echo "spark.yarn.am.nodeLabelExpression " >> /etc/spark/conf/spark-defaults.conf && \
    echo "spark.dynamicAllocation.enabled true" >> /etc/spark/conf/spark-defaults.conf

# override entrypoint to supply credentials at runtime
ADD pimped_entrypoint.sh /
ENTRYPOINT ["/pimped_entrypoint.sh"]
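As a quick smoke test of the resulting image (a sketch; the tag below is hypothetical and follows the ${repo_prefix}-${cluster_ip} convention from the build script), you can check that the bundled Spark binaries and JAVA_HOME resolve inside the container:

docker run --rm --entrypoint spark-submit my-repo-10.0.12.34:latest --version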
pimped_entrypoint.sh:
#!/bin/bash
# Inject the AWS credentials passed in at runtime into mapred-site.xml,
# replacing the placeholders baked in at image build time
sed -i "s/_ACCESS_KEY_ID_/${ACCESS_KEY_ID}/g" "$HADOOP_CONF_DIR/mapred-site.xml"
sed -i "s~_SECRET_KEY_~${SECRET_KEY}~g" "$HADOOP_CONF_DIR/mapred-site.xml"
# hand off to the stock GoCD agent entrypoint
exec /docker-entrypoint.sh "$@"
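Since the placeholders are filled in at container start, the credentials (along with the usual GoCD agent settings) are passed as environment variables when the agent is launched. A minimal sketch, assuming a hypothetical GoCD server URL and the image tag from above:

docker run -d \
    -e ACCESS_KEY_ID=AKIAXXXXXXXXXXXX \
    -e SECRET_KEY=xxxxxxxx \
    -e GO_SERVER_URL=https://gocd.example.com:8154/go \
    my-repo-10.0.12.34:latest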