Install Spark 1.0.2 with an updated Guava version on EMR
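The script below is meant to run as an EMR bootstrap action on every node at cluster launch. As a rough, untested sketch of wiring it up with the AWS CLI (the bucket name, AMI version and instance settings are placeholders, not part of this gist):

aws emr create-cluster --name "spark-1.0.2-guava14" --ami-version 3.1.0 \
  --instance-type m1.large --instance-count 3 \
  --bootstrap-actions Path=s3://<your-bucket>/install-spark-1.0.2.rb,Name=InstallSpark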
#!/usr/bin/ruby
require 'json'
require 'emr/common'
require 'digest'
require 'socket'
# Run a shell command, raising if it exits with a non-zero status.
def run(cmd)
  raise "Command failed: #{cmd}" unless system(cmd)
end

# Run a shell command through sudo.
def sudo(cmd)
  run("sudo #{cmd}")
end

# Print the arguments followed by a newline.
def println(*args)
  print(*args)
  puts
end
# Determine this node's local IP by connecting a UDP socket to an external
# address (no traffic is actually sent).
def local_ip
  orig, Socket.do_not_reverse_lookup = Socket.do_not_reverse_lookup, true # turn off reverse DNS resolution temporarily
  UDPSocket.open do |s|
    s.connect '64.233.187.99', 1
    s.addr.last
  end
ensure
  Socket.do_not_reverse_lookup = orig
end
job_flow = Emr::JsonInfoFile.new('job-flow')
instance_info = Emr::JsonInfoFile.new('instance')

@hadoop_home = "/home/hadoop"
@hadoop_apps = "/home/hadoop/.versions"
@s3_spark_base_url = "https://s3.amazonaws.com/elasticmapreduce/samples/spark"
@spark_url = "http://d3kbcqa49mib13.cloudfront.net/spark-1.0.2-bin-hadoop2.tgz"
# @spark_version is only used to build the S3 samples path from which the Shark
# and Scala tarballs are downloaded; Spark itself (1.0.2) comes from @spark_url.
@spark_version = "1.0.0"
@shark_version = "0.9.1"
@scala_version = "2.10.3"
@hadoop = "hadoop2"
@local_dir = `mount`.split(' ').grep(/mnt/)[0] << "/spark/"
@hadoop_version = job_flow['hadoopVersion']
@is_master = instance_info['isMaster'].to_s == 'true'
@master_dns = job_flow['masterPrivateDnsName']
@master_ip = @is_master ? local_ip : `host #{@master_dns}`.scan(/[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}/)[0]
# Download the Spark, Shark and Scala tarballs.
def download_from_s3
  println "downloading spark from #{@spark_url}"
  sudo "curl -L --silent --show-error --fail --connect-timeout 60 --max-time 720 --retry 5 -O #{@spark_url}"
  println "downloading shark from #{@s3_spark_base_url}/#{@spark_version}/shark-#{@shark_version}-bin-#{@hadoop}.tgz"
  sudo "curl -L --silent --show-error --fail --connect-timeout 60 --max-time 720 --retry 5 -O #{@s3_spark_base_url}/#{@spark_version}/shark-#{@shark_version}-bin-#{@hadoop}.tgz"
  println "downloading scala from #{@s3_spark_base_url}/#{@spark_version}/scala-#{@scala_version}.tgz"
  sudo "curl -L --silent --show-error --fail --connect-timeout 60 --max-time 720 --retry 5 -O #{@s3_spark_base_url}/#{@spark_version}/scala-#{@scala_version}.tgz"
end
# Extract the tarballs into the EMR versions directory and delete the archives.
def untar_all
  sudo "tar xzf spark-1.0.2-bin-hadoop2.tgz -C #{@hadoop_apps} && rm -f spark-1.0.2-bin-hadoop2.tgz"
  sudo "tar xzf shark-#{@shark_version}-bin-#{@hadoop}.tgz -C #{@hadoop_apps} && rm -f shark-#{@shark_version}-bin-#{@hadoop}.tgz"
  sudo "tar xzf scala-#{@scala_version}.tgz -C #{@hadoop_apps} && rm -f scala-#{@scala_version}.tgz"
end
# Symlink the unpacked Spark and Shark directories to stable paths under the hadoop home.
def create_symlinks
  sudo "ln -sf #{@hadoop_apps}/spark-1.0.2-bin-hadoop2 #{@hadoop_home}/spark"
  sudo "ln -sf #{@hadoop_apps}/shark-#{@shark_version}-bin-#{@hadoop} #{@hadoop_home}/shark"
end
# Append SCALA_HOME to the hadoop user's .bashrc (not invoked in the main flow below).
def write_to_bashrc
  File.open('/home/hadoop/.bashrc', 'a') do |file_w|
    file_w.write("export SCALA_HOME=#{@hadoop_apps}/scala-#{@scala_version}\n")
  end
end
# Create the per-instance local scratch directory used as SPARK_LOCAL_DIRS.
def mk_local_dir
  sudo "mkdir #{@local_dir}"
end
# Replace Hadoop's bundled Guava 11.0.2 with the Guava 14.0.1 that Spark 1.0.x uses.
def update_guava
  sudo "rm -f #{@hadoop_home}/share/hadoop/common/lib/guava-11.0.2.jar"
  sudo "curl -L --silent --show-error --fail --connect-timeout 60 --max-time 720 --retry 5 -O http://search.maven.org/remotecontent?filepath=com/google/guava/guava/14.0.1/guava-14.0.1.jar"
  sudo "mv guava-14.0.1.jar #{@hadoop_home}/share/hadoop/common/lib/"
end
# Write spark-env.sh, locating the EMR LZO jar for the classpath.
def create_spark_env
  lzo_jar = Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/**/hadoop-*lzo.jar")[0]
  if lzo_jar.nil?
    lzo_jar = Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/**/hadoop-*lzo*.jar")[0]
  end
  if lzo_jar.nil?
    println "lzo not found inside #{@hadoop_apps}/#{@hadoop_version}/share/"
  end
  File.open('/tmp/spark-env.sh', 'w') do |file_w|
    file_w.write("export SPARK_MASTER_IP=#{@master_ip}\n")
    file_w.write("export SCALA_HOME=#{@hadoop_apps}/scala-#{@scala_version}\n")
    file_w.write("export SPARK_LOCAL_DIRS=#{@local_dir}\n")
    file_w.write("export SPARK_CLASSPATH=\"/usr/share/aws/emr/emr-fs/lib/*:/usr/share/aws/emr/lib/*:#{@hadoop_home}/share/hadoop/common/lib/*:#{lzo_jar}\"\n")
    file_w.write("export SPARK_DAEMON_JAVA_OPTS=\"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps\"\n")
  end
  sudo "mv /tmp/spark-env.sh #{@hadoop_home}/spark/conf/spark-env.sh"
end
# Write spark-defaults.conf pointing jobs at the standalone master.
def create_spark_defaults
  File.open('/tmp/spark-defaults.conf', 'w') do |file_w|
    file_w.write("spark.master spark://#{@master_ip}:7077\n")
  end
  sudo "mv /tmp/spark-defaults.conf #{@hadoop_home}/spark/conf/spark-defaults.conf"
end
# Write shark-env.sh with memory settings, sourcing the spark-env.sh written above.
def create_shark_env
  File.open('/tmp/shark-env.sh', 'w') do |file_w|
    file_w.write("export SPARK_HOME=/home/hadoop/spark\n")
    file_w.write("export SPARK_MEM=1g\n")
    file_w.write("export SHARK_MASTER_MEM=1g\n")
    file_w.write("export _JAVA_OPTIONS=\"-Xmx2g\"\n")
    file_w.write("source /home/hadoop/spark/conf/spark-env.sh\n")
  end
  sudo "mv /tmp/shark-env.sh #{@hadoop_home}/shark/conf/shark-env.sh"
end
# Copy the EMR/Hadoop jars Shark needs into its lib_managed directory, and the
# cluster's core-site.xml into both Spark's and Shark's conf directories.
def copy_files_to_spark_shark
  gson_jar = Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/hadoop/common/**/gson*jar")[0]
  aws_sdk_jar = Dir.glob("/usr/share/aws/emr/hadoop-state-pusher/**/aws-java-sdk*.jar")[0]
  core_site_xml = Dir.glob("#{@hadoop_home}/conf/**/core-site.xml")[0]
  hadoop_common_jar = Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/hadoop/common/hadoop-common-#{@hadoop_version}.jar")[0]
  emr_metrics_jar = Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/hadoop/common/**/EmrMetrics-*.jar")[0]
  shark_jars = "#{@hadoop_home}/shark/lib_managed/jars/"
  sudo "cp #{gson_jar} #{shark_jars}"
  sudo "cp #{aws_sdk_jar} #{shark_jars}"
  sudo "cp #{emr_metrics_jar} #{shark_jars}"
  sudo "cp #{hadoop_common_jar} #{shark_jars}"
  # copy core-site.xml to spark and shark
  sudo "cp #{core_site_xml} #{@hadoop_home}/spark/conf/"
  sudo "cp #{core_site_xml} #{@hadoop_home}/shark/conf/"
end
# Poll the Spark master port (7077) for up to 20 attempts, 5 seconds apart.
# Returns true once the port is reachable, false if it never becomes reachable.
def test_connection_with_master
  attempt = 0
  until system("nc -z #{@master_ip} 7077")
    attempt += 1
    return false if attempt >= 20
    sleep(5)
  end
  true
end
# Install and configure everything on this node.
download_from_s3
untar_all
create_symlinks
mk_local_dir
update_guava
create_spark_env
create_shark_env
create_spark_defaults
copy_files_to_spark_shark

# remove hadoop-core from Shark's bundled jars
hadoop_core_jar = Dir.glob("/home/hadoop/shark/lib_managed/jars/**/hadoop-core*jar")[0]
sudo "rm -rf #{hadoop_core_jar}"
if @is_master then
  # On the master node, start the standalone Spark master.
  sudo "#{@hadoop_home}/spark/sbin/start-master.sh"
else
  # On worker nodes, wait until the master's port 7077 is reachable, then
  # register a standalone worker in the background.
  if test_connection_with_master
    sudo "#{@hadoop_home}/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://#{@master_ip}:7077 >> /tmp/registeringWorkerLog.log 2>&1 &"
  else
    raise RuntimeError, 'Worker not able to connect to master'
  end
end
This comment suggests that lib/native needs to be added to the path: http://stackoverflow.com/a/10038290/541202
I tried adding it to SPARK_CLASSPATH, but that didn't help:
file_w.write("export SPARK_CLASSPATH=\"/usr/share/aws/emr/emr-fs/lib/*:/usr/share/aws/emr/lib/*:#{@hadoop_home}/share/hadoop/common/lib/*:#{lzo_jar}:#{@hadoop_apps}/#{@hadoop_version}/lib/native/*\"\n")
Did you try using the commands I describe in my SO answer?
http://stackoverflow.com/questions/25420861/aws-emr-and-spark-1-0-0/25420862#25420862
If I remember correctly, I had issues just exporting SPARK_CLASSPATH; that's why I used those arguments!
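For reference, and without reproducing the linked answer here: in Spark 1.0.x, spark-shell forwards its options to spark-submit, so the same paths can be passed per invocation instead of through spark-env.sh. A hedged sketch (the classpath glob, native directory and <hadoop-version> are placeholders):

/home/hadoop/spark/bin/spark-shell \
  --driver-class-path "/home/hadoop/share/hadoop/common/lib/*" \
  --driver-library-path /home/hadoop/.versions/<hadoop-version>/lib/native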
Thank you for doing this. I gave it a try and ran the test example from the article: https://aws.amazon.com/articles/Elastic-MapReduce/4926593393724923
The example gives the expected output, but on the last line of the example, I get the traceback below.
Any reason lzo isn't in the build?
scala> val sortedList = reducedList.map(x => (x._2, x._1)).sortByKey(false).take(50)
14/09/04 18:51:43 INFO spark.SparkContext: Starting job: sortByKey at <console>:17
14/09/04 18:51:45 ERROR lzo.GPLNativeCodeLoader: Could not load native gpl library
java.lang.UnsatisfiedLinkError: no gplcompression in java.library.path
at java.lang.ClassLoader.loadLibrary(ClassLoader.java:1886)
at java.lang.Runtime.loadLibrary0(Runtime.java:849)
...
14/09/04 18:51:45 ERROR lzo.LzoCodec: Cannot load native-lzo without native-hadoop
14/09/04 18:51:47 INFO mapred.FileInputFormat: Total input paths to process : 1
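If it helps narrow this down, a small standalone check (search locations are assumptions, not taken from the gist) to confirm whether the native GPL compression library exists on the node at all before pointing LD_LIBRARY_PATH / java.library.path at its directory:

#!/usr/bin/ruby
# Hedged sketch: look for libgplcompression in a few plausible locations on the EMR node.
candidates = Dir.glob("/home/hadoop/.versions/**/lib/native/**/libgplcompression*") +
             Dir.glob("/usr/lib/hadoop*/lib/native/**/libgplcompression*")
if candidates.empty?
  puts "libgplcompression not found - the UnsatisfiedLinkError is expected"
else
  puts "found: #{candidates.join(', ')} - add its directory to LD_LIBRARY_PATH"
end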