# Building Spark 1.0.0 for Amazon EMR from scratch
# Get the old Amazon-provided Spark 0.8.1 EMR build (we only need its metrics.properties.aws later)
s3cmd get s3://elasticmapreduce/samples/spark/0.8.1/spark-0.8.1-emr.tgz
tar -xvf spark-0.8.1-emr.tgz
# Get the newer prebuilt Spark
curl "http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-hadoop1.tgz" -o "spark-1.0.0-bin-hadoop1.tgz" # Grab the prebuilt Spark jars; we grabbed Spark 1.0.0 built for Hadoop 1 (spark-1.0.0-bin-hadoop1.tgz) from the Spark downloads page
tar -xvf spark-1.0.0-bin-hadoop1.tgz
# Reuse the old metrics properties file
cp spark-0.8.1-emr/conf/metrics.properties.aws spark-1.0.0-bin-hadoop1/conf/metrics.properties.aws # AWS requires metrics.properties.aws (ours is a copy of the one in the spark-0.8.1-emr.tgz that Amazon posted at s3://elasticmapreduce/samples/spark/0.8.1/spark-0.8.1-emr.tgz)
# Move the lib folder to a jars folder
mv spark-1.0.0-bin-hadoop1/lib spark-1.0.0-bin-hadoop1/jars # AWS looks for the Spark jars in the "jars" folder
# Move the sbin scripts into the bin folder
mv spark-1.0.0-bin-hadoop1/sbin/* spark-1.0.0-bin-hadoop1/bin # AWS looks for scripts in the "bin" folder
# Remove the now-empty sbin directory and the unneeded ec2 directory
rmdir spark-1.0.0-bin-hadoop1/sbin # AWS looks for scripts in the "bin" folder
rm -rf spark-1.0.0-bin-hadoop1/ec2 # The "ec2" directory is not needed
# Modify spark-1.0.0-bin-hadoop1/compute-classpath.sh by removing the following lines:
###########
# # Build up classpath
# CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf"
#
###########
# Modify spark-1.0.0-bin-hadoop1/compute-classpath.sh by adding the following lines in their place to include the lzo path
# (based on the answer here: http://grokbase.com/t/cloudera/cdh-user/144bv47zb0/cdh5-0-spark-shell-cannot-work-when-enable-lzo-in-core-site-xml):
###########
# # add hadoop home
# export HADOOP_HOME=/home/hadoop/
# # Build up classpath
# CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf:$HADOOP_HOME/lib/hadoop-lzo.jar"
# export JAVA_LIBRARY_PATH=$JAVA_LIBRARY_PATH:$HADOOP_HOME/native/Linux-i386-32
# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/native/Linux-i386-32
###########
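# One way to script the compute-classpath.sh change above with sed, covering both the removal and the
# addition (a sketch only: it assumes the stock 1.0.0 file still contains the exact CLASSPATH line being
# removed, and that the file sits under bin/ in the extracted tarball; adjust the path if yours differs):
sed -i 's|^CLASSPATH="\$SPARK_CLASSPATH:\$SPARK_SUBMIT_CLASSPATH:\$FWDIR/conf"$|export HADOOP_HOME=/home/hadoop/\nCLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf:$HADOOP_HOME/lib/hadoop-lzo.jar"\nexport JAVA_LIBRARY_PATH=$JAVA_LIBRARY_PATH:$HADOOP_HOME/native/Linux-i386-32\nexport LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/native/Linux-i386-32|' spark-1.0.0-bin-hadoop1/bin/compute-classpath.sh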
# Add the following lines to spark-1.0.0-bin-hadoop1/bin/spark-class after the section marked "# Find the java binary":
###########
# # set spark_library_path
# export HADOOP_HOME=/home/hadoop/
# SPARK_LIBRARY_PATH=$SPARK_LIBRARY_PATH:$HADOOP_HOME/native/Linux-i386-32
###########
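# The spark-class change above can also be scripted with GNU sed (a sketch only: it assumes the literal
# "# Find the java binary" marker is present in bin/spark-class as in the stock 1.0.0 release):
sed -i '/# Find the java binary/a\
export HADOOP_HOME=/home/hadoop/\
SPARK_LIBRARY_PATH=$SPARK_LIBRARY_PATH:$HADOOP_HOME/native/Linux-i386-32' spark-1.0.0-bin-hadoop1/bin/spark-class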
tar -zcvf spark-1.0.0-bin-hadoop1-emr.tgz spark-1.0.0-bin-hadoop1 # Tar up the modified directory
s3cmd put spark-1.0.0-bin-hadoop1-emr.tgz s3://[your-s3-bucket]/spark-1.0.0-bin-hadoop1-emr.tgz # Copy the tgz file to your s3 bucket
s3cmd get s3://intentmedia-spark/install-spark-shark.sh install-spark-shark.sh # Grab the existing install-spark-shark.sh bootstrap script as a starting point
# Modify install-spark-shark.sh to work with your lzo libraries on s3 and your spark tgz on s3: remove the following lines:
###########
# hadoop fs -copyToLocal s3://intentmedia-spark/hadoop-lzo.jar /home/hadoop/lib
# wget http://intentmedia-spark.s3.amazonaws.com/spark-1.0.0-bin-hadoop1.tgz
#
###########
# Modify install-spark-shark.sh to work with your lzo libraries on s3 and your spark tgz on s3: add the following lines
# (note: make sure the filename in the wget line matches the tgz you actually uploaded to your bucket above):
###########
# hadoop fs -copyToLocal s3://[your-s3-bucket]/hadoop-lzo.jar /home/hadoop/lib
# wget http://[your-s3-bucket].s3.amazonaws.com/spark-1.0.0-bin-hadoop1.tgz
###########
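# One way to script the bucket substitution above with sed (a sketch only: it assumes the stock script still
# references the intentmedia-spark bucket exactly as in the two removed lines):
sed -i 's|s3://intentmedia-spark/hadoop-lzo.jar|s3://[your-s3-bucket]/hadoop-lzo.jar|; s|http://intentmedia-spark.s3.amazonaws.com/spark-1.0.0-bin-hadoop1.tgz|http://[your-s3-bucket].s3.amazonaws.com/spark-1.0.0-bin-hadoop1.tgz|' install-spark-shark.sh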
s3cmd put install-spark-shark.sh s3://[your-s3-bucket]/install-spark-shark.sh # Copy the install-spark-shark.sh script to your s3 bucket so it can be referenced when invoking the elastic-mapreduce command line tool. Note: your s3 bucket may need to be public for your EMR instances to have permission to access it.
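# For reference, a hypothetical cluster launch with the old elastic-mapreduce CLI that references this
# bootstrap action (illustrative only; the name, instance type, and instance count are placeholders to adjust):
#   elastic-mapreduce --create --alive --name "spark-1.0.0" \
#     --instance-type m1.large --instance-count 3 \
#     --bootstrap-action s3://[your-s3-bucket]/install-spark-shark.sh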
###########
# Steps for compiling lzo on AWS EMR machine images (not necessary if you already have a hadoop-lzo.jar that works with the EMR AMIs)
###########
# The lzo codec depends on native libraries, and the jar needs to be built against them. One option is to ssh
# into a running EMR cluster and follow the instructions here: https://github.com/kevinweil/hadoop-lzo#building-and-configuring.
# Specifically, do:
#   git clone https://github.com/twitter/hadoop-lzo
#   cd hadoop-lzo
#   mvn clean install
# Then scp the hadoop-lzo.jar file from the "target" directory to your local machine and copy it to [your-s3-bucket],
# maybe in a directory that reflects the architecture it was compiled for:
#   s3cmd put hadoop-lzo.jar s3://[your-s3-bucket]/bootstrap/lzo/i386/hadoop-lzo.jar
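# For example, to pull the built jar down from the cluster before the s3cmd put above (a sketch; the key file
# and master public DNS are placeholders):
#   scp -i your-emr-key.pem hadoop@[emr-master-public-dns]:hadoop-lzo/target/hadoop-lzo-[version].jar ./hadoop-lzo.jar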
# Note: when attempting to build the hadoop-lzo project you may run into an error with a jersey jar, which can
# be fixed by adding the following mirror in /etc/maven2/settings.xml:
#   <mirror>
#     <id>glassfish-repository</id>
#     <mirrorOf>glassfish-repository</mirrorOf>
#     <name>Repository for Glassfish</name>
#     <url>https://maven.java.net/content/groups/glassfish/</url>
#   </mirror>
# and also deleting the files with the wrong content from "~/.m2/repository/net/java/jvnet-parent",
# based on the answer here: https://answers.atlassian.com/questions/174059/problems-in-plugin-tutorial
# FYI (no action necessary): the lzo-related changes made to the install-spark-shark.sh script were based on an AWS forums thread about installing lzo: https://forums.aws.amazon.com/thread.jspa?threadID=93856