@tnbred
Last active August 29, 2015 14:05
Install Spark 1.0.2 with an updated Guava version on EMR
#!/usr/bin/ruby
require 'json'
require 'emr/common'
require 'digest'
require 'socket'
def run(cmd)
  if ! system(cmd) then
    raise "Command failed: #{cmd}"
  end
end
def sudo(cmd)
  run("sudo #{cmd}")
end
def println(*args)
  print(*args)
  puts
end
def local_ip
  orig, Socket.do_not_reverse_lookup = Socket.do_not_reverse_lookup, true # turn off reverse DNS resolution temporarily
  UDPSocket.open do |s|
    s.connect '64.233.187.99', 1
    s.addr.last
  end
ensure
  Socket.do_not_reverse_lookup = orig
end
job_flow = Emr::JsonInfoFile.new('job-flow')
instance_info = Emr::JsonInfoFile.new('instance')
@hadoop_home="/home/hadoop"
@hadoop_apps="/home/hadoop/.versions"
@s3_spark_base_url="https://s3.amazonaws.com/elasticmapreduce/samples/spark"
@spark_url="http://d3kbcqa49mib13.cloudfront.net/spark-1.0.2-bin-hadoop2.tgz"
@spark_version="1.0.0" # only used to build the S3 paths for the Shark and Scala downloads; Spark itself comes from @spark_url
@shark_version="0.9.1"
@scala_version="2.10.3"
@hadoop="hadoop2"
@local_dir= `mount`.split(' ').grep(/mnt/)[0] << "/spark/"
@hadoop_version= job_flow['hadoopVersion']
@is_master = instance_info['isMaster'].to_s == 'true'
@master_dns=job_flow['masterPrivateDnsName']
@master_ip=@is_master ? local_ip : `host #{@master_dns}`.scan(/[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}/)[0]
def download_from_s3
  println "downloading spark from #{@spark_url}"
  sudo "curl -L --silent --show-error --fail --connect-timeout 60 --max-time 720 --retry 5 -O #{@spark_url}"
  println "downloading shark from #{@s3_spark_base_url}/#{@spark_version}/shark-#{@shark_version}-bin-#{@hadoop}.tgz"
  sudo "curl -L --silent --show-error --fail --connect-timeout 60 --max-time 720 --retry 5 -O #{@s3_spark_base_url}/#{@spark_version}/shark-#{@shark_version}-bin-#{@hadoop}.tgz"
  println "downloading scala from #{@s3_spark_base_url}/#{@spark_version}/scala-#{@scala_version}.tgz"
  sudo "curl -L --silent --show-error --fail --connect-timeout 60 --max-time 720 --retry 5 -O #{@s3_spark_base_url}/#{@spark_version}/scala-#{@scala_version}.tgz"
end
def untar_all
  sudo "tar xzf spark-1.0.2-bin-hadoop2.tgz -C #{@hadoop_apps} && rm -f spark-1.0.2-bin-hadoop2.tgz"
  sudo "tar xzf shark-#{@shark_version}-bin-#{@hadoop}.tgz -C #{@hadoop_apps} && rm -f shark-#{@shark_version}-bin-#{@hadoop}.tgz"
  sudo "tar xzf scala-#{@scala_version}.tgz -C #{@hadoop_apps} && rm -f scala-#{@scala_version}.tgz"
end
def create_symlinks
  sudo "ln -sf #{@hadoop_apps}/spark-1.0.2-bin-hadoop2 #{@hadoop_home}/spark"
  sudo "ln -sf #{@hadoop_apps}/shark-#{@shark_version}-bin-#{@hadoop} #{@hadoop_home}/shark"
end
# note: defined here but never invoked in the main sequence below
def write_to_bashrc
  File.open('/home/hadoop/.bashrc','a') do |file_w|
    file_w.write("export SCALA_HOME=#{@hadoop_apps}/scala-#{@scala_version}\n")
  end
end
def mk_local_dir
  sudo "mkdir #{@local_dir}"
end
def update_guava
  sudo "rm -f #{@hadoop_home}/share/hadoop/common/lib/guava-11.0.2.jar"
  # quote the URL so the shell does not try to glob the '?'
  sudo "curl -L --silent --show-error --fail --connect-timeout 60 --max-time 720 --retry 5 -O 'http://search.maven.org/remotecontent?filepath=com/google/guava/guava/14.0.1/guava-14.0.1.jar'"
  sudo "mv guava-14.0.1.jar #{@hadoop_home}/share/hadoop/common/lib/"
end
def create_spark_env
  lzo_jar=Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/**/hadoop-*lzo.jar")[0]
  if lzo_jar.nil? then
    lzo_jar=Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/**/hadoop-*lzo*.jar")[0]
  end
  if lzo_jar.nil? then
    println "lzo not found inside #{@hadoop_apps}/#{@hadoop_version}/share/"
  end
  File.open('/tmp/spark-env.sh','w') do |file_w|
    file_w.write("export SPARK_MASTER_IP=#{@master_ip}\n")
    file_w.write("export SCALA_HOME=#{@hadoop_apps}/scala-#{@scala_version}\n")
    file_w.write("export SPARK_LOCAL_DIRS=#{@local_dir}\n")
    file_w.write("export SPARK_CLASSPATH=\"/usr/share/aws/emr/emr-fs/lib/*:/usr/share/aws/emr/lib/*:#{@hadoop_home}/share/hadoop/common/lib/*:#{lzo_jar}\"\n")
    file_w.write("export SPARK_DAEMON_JAVA_OPTS=\"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps\"\n")
  end
  sudo "mv /tmp/spark-env.sh #{@hadoop_home}/spark/conf/spark-env.sh"
end
def create_spark_defaults
  File.open('/tmp/spark-defaults.conf','w') do |file_w|
    file_w.write("spark.master spark://#{@master_ip}:7077\n")
  end
  sudo "mv /tmp/spark-defaults.conf #{@hadoop_home}/spark/conf/spark-defaults.conf"
end
def create_shark_env
  File.open('/tmp/shark-env.sh','w') do |file_w|
    file_w.write("export SPARK_HOME=/home/hadoop/spark\n")
    file_w.write("export SPARK_MEM=1g\n")
    file_w.write("export SHARK_MASTER_MEM=1g\n")
    file_w.write("export _JAVA_OPTIONS=\"-Xmx2g\"\n")
    file_w.write("source /home/hadoop/spark/conf/spark-env.sh\n")
  end
  sudo "mv /tmp/shark-env.sh #{@hadoop_home}/shark/conf/shark-env.sh"
end
def copy_files_to_spark_shark
  gson_jar=Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/hadoop/common/**/gson*jar")[0]
  aws_sdk_jar=Dir.glob("/usr/share/aws/emr/hadoop-state-pusher/**/aws-java-sdk*.jar")[0]
  core_site_xml=Dir.glob("#{@hadoop_home}/conf/**/core-site.xml")[0]
  hadoop_common_jar=Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/hadoop/common/hadoop-common-#{@hadoop_version}.jar")[0]
  emr_metrics_jar=Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/hadoop/common/**/EmrMetrics-*.jar")[0]
  shark_jars="#{@hadoop_home}/shark/lib_managed/jars/"
  sudo "cp #{gson_jar} #{shark_jars}"
  sudo "cp #{aws_sdk_jar} #{shark_jars}"
  sudo "cp #{emr_metrics_jar} #{shark_jars}"
  sudo "cp #{hadoop_common_jar} #{shark_jars}"
  # copy core-site.xml to the spark and shark conf dirs
  sudo "cp #{core_site_xml} #{@hadoop_home}/spark/conf/"
  sudo "cp #{core_site_xml} #{@hadoop_home}/shark/conf/"
end
# polls the Spark master port (7077), retrying up to 20 times at 5-second intervals
def test_connection_with_master
  attempt=0
  until system("nc -z #{@master_ip} 7077")
    attempt += 1
    if attempt < 20 then
      sleep(5)
    else
      break
    end
  end
  if attempt == 20 then
    return false
  else
    return true
  end
end
download_from_s3
untar_all
create_symlinks
mk_local_dir
update_guava
create_spark_env
create_shark_env
create_spark_defaults
copy_files_to_spark_shark
#remove hadoop-core
hadoop_core_jar=Dir.glob("/home/hadoop/shark/lib_managed/jars/**/hadoop-core*jar")[0]
sudo "rm -rf #{hadoop_core_jar}"
if @is_master then
  sudo "#{@hadoop_home}/spark/sbin/start-master.sh"
else
  if test_connection_with_master then
    sudo "#{@hadoop_home}/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://#{@master_ip}:7077 >> /tmp/registeringWorkerLog.log 2>&1 &"
  else
    raise RuntimeError, 'Worker not able to connect to master'
  end
end

maneta commented Aug 21, 2014

Hello, could you tell me which EMR AMI you are using? I'm trying to use your script, but in my AMI version I don't have the "hadoop-common-#{@hadoop_version}.jar".

In fact I don't have the share/hadoop/common directory at all...

Thanks and best regards


maneta commented Aug 21, 2014

Don't worry about it, I was trying to install on a Hadoop 1.x AMI. Thanks!


tnbred commented Aug 21, 2014

Indeed, I should specify that this is meant for AMI 3.0.3 or 3.0.4 (which is the latest AMI with Hadoop 2.2.0 at the time of this writing).
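
For anyone else who runs this on a Hadoop 1.x AMI by mistake, a guard near the top of the script (just after @hadoop_version is read from the job-flow info) could fail fast instead of dying later on missing paths. This is only a sketch, assuming hadoopVersion is reported in the usual "2.2.0" form:

# hypothetical guard, not part of the original script: the share/hadoop/common
# layout used below only exists on the Hadoop 2.x AMIs (3.0.3 / 3.0.4)
unless @hadoop_version.to_s =~ /^2\./
  raise "This bootstrap action targets Hadoop 2.x AMIs; found hadoopVersion=#{@hadoop_version}"
end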

@jkleckner

Thank you for doing this. I gave it a try and ran the test example from the article: https://aws.amazon.com/articles/Elastic-MapReduce/4926593393724923

The example gives the expected output, but on the last line of the example, I get the traceback below.
Any reason lzo isn't in the build?

scala> val sortedList = reducedList.map(x => (x._2, x._1)).sortByKey(false).take(50)
14/09/04 18:51:43 INFO spark.SparkContext: Starting job: sortByKey at <console>:17
14/09/04 18:51:45 ERROR lzo.GPLNativeCodeLoader: Could not load native gpl library
java.lang.UnsatisfiedLinkError: no gplcompression in java.library.path
at java.lang.ClassLoader.loadLibrary(ClassLoader.java:1886)
at java.lang.Runtime.loadLibrary0(Runtime.java:849)
...

14/09/04 18:51:45 ERROR lzo.LzoCodec: Cannot load native-lzo without native-hadoop
14/09/04 18:51:47 INFO mapred.FileInputFormat: Total input paths to process : 1
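
The UnsatisfiedLinkError means the JVM cannot locate libgplcompression.so on java.library.path; the lzo jar on the classpath only provides the Java side of the codec. One thing to try, sketched here under the assumption that the native Hadoop libraries live under #{@hadoop_apps}/#{@hadoop_version}/lib/native on this AMI, is to have create_spark_env also export a dynamic-linker path (on Linux the JVM seeds java.library.path from LD_LIBRARY_PATH):

# hypothetical lines to add inside create_spark_env's File.open block; the
# lib/native location is an assumption and should be checked on the AMI
native_dir = "#{@hadoop_apps}/#{@hadoop_version}/lib/native"
if File.directory?(native_dir)
  file_w.write("export LD_LIBRARY_PATH=#{native_dir}:$LD_LIBRARY_PATH\n")
end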

@jkleckner

This comment suggests that lib/native needs to be added to the PATH: http://stackoverflow.com/a/10038290/541202

I tried adding it to the SPARK_CLASSPATH but that didn't help.

file_w.write("export SPARK_CLASSPATH=\"/usr/share/aws/emr/emr-fs/lib/*:/usr/share/aws/emr/lib/*:#{@hadoop_home}/share/hadoop/common/lib/*:#{lzo_jar}:#{@hadoop_apps}/#{@hadoop_version}/lib/native/*\"\n")
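
SPARK_CLASSPATH only controls where the JVM looks for classes and jars, not where it resolves JNI .so files, so adding lib/native there would not help. A hedged alternative would be to push a library path through spark-defaults.conf instead; spark.driver.extraLibraryPath and spark.executor.extraLibraryPath exist in the 1.0 line, though whether they resolve this particular AMI layout is untested here:

# hypothetical variant of create_spark_defaults, not from the original script;
# the lib/native location is an assumption
def create_spark_defaults_with_native_path
  native_dir = "#{@hadoop_apps}/#{@hadoop_version}/lib/native"
  File.open('/tmp/spark-defaults.conf','w') do |file_w|
    file_w.write("spark.master spark://#{@master_ip}:7077\n")
    file_w.write("spark.driver.extraLibraryPath #{native_dir}\n")
    file_w.write("spark.executor.extraLibraryPath #{native_dir}\n")
  end
  sudo "mv /tmp/spark-defaults.conf #{@hadoop_home}/spark/conf/spark-defaults.conf"
end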


tnbred commented Sep 24, 2014

Did you try using the commands I describe in my SO answer?
http://stackoverflow.com/questions/25420861/aws-emr-and-spark-1-0-0/25420862#25420862
If I remember correctly, I had issues just exporting SPARK_CLASSPATH, which is why I used those arguments!
