#!/usr/bin/ruby
require 'json'
require 'emr/common'
require 'digest'
require 'socket'
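# Helpers: run a shell command (failing loudly), run it via sudo, and print a line.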
def run(cmd)
  if ! system(cmd) then
    raise "Command failed: #{cmd}"
  end
end
def sudo(cmd)
  run("sudo #{cmd}")
end
def println(*args)
  print(*args)
  puts
end
def local_ip
  orig, Socket.do_not_reverse_lookup = Socket.do_not_reverse_lookup, true # turn off reverse DNS resolution temporarily
  UDPSocket.open do |s|
    s.connect '64.233.187.99', 1
    s.addr.last
  end
ensure
  Socket.do_not_reverse_lookup = orig
end
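# EMR-provided job-flow/instance metadata, install locations, component versions,
# and the Spark master's IP (resolved from DNS on non-master nodes).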
job_flow = Emr::JsonInfoFile.new('job-flow')
instance_info = Emr::JsonInfoFile.new('instance')
@hadoop_home="/home/hadoop"
@hadoop_apps="/home/hadoop/.versions"
@s3_spark_base_url="https://s3.amazonaws.com/elasticmapreduce/samples/spark"
@spark_url="http://d3kbcqa49mib13.cloudfront.net/spark-1.0.2-bin-hadoop2.tgz"
@spark_version="1.0.0"
@shark_version="0.9.1"
@scala_version="2.10.3"
@hadoop="hadoop2"
@local_dir= `mount`.split(' ').grep(/mnt/)[0] << "/spark/"
@hadoop_version= job_flow['hadoopVersion']
@is_master = instance_info['isMaster'].to_s == 'true'
@master_dns=job_flow['masterPrivateDnsName']
@master_ip=@is_master ? local_ip : `host #{@master_dns}`.scan(/[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}/)[0]
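# Download the Spark 1.0.2 binaries from CloudFront, and Shark/Scala from the EMR
# samples bucket (whose path still uses the 1.0.0 prefix in @spark_version).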
def download_from_s3
  println "downloading spark from #{@spark_url}"
  sudo "curl -L --silent --show-error --fail --connect-timeout 60 --max-time 720 --retry 5 -O #{@spark_url}"
  println "downloading shark from #{@s3_spark_base_url}/#{@spark_version}/shark-#{@shark_version}-bin-#{@hadoop}.tgz"
  sudo "curl -L --silent --show-error --fail --connect-timeout 60 --max-time 720 --retry 5 -O #{@s3_spark_base_url}/#{@spark_version}/shark-#{@shark_version}-bin-#{@hadoop}.tgz"
  println "downloading scala from #{@s3_spark_base_url}/#{@spark_version}/scala-#{@scala_version}.tgz"
  sudo "curl -L --silent --show-error --fail --connect-timeout 60 --max-time 720 --retry 5 -O #{@s3_spark_base_url}/#{@spark_version}/scala-#{@scala_version}.tgz"
end
def untar_all
  sudo "tar xzf spark-1.0.2-bin-hadoop2.tgz -C #{@hadoop_apps} && rm -f spark-1.0.2-bin-hadoop2.tgz"
  sudo "tar xzf shark-#{@shark_version}-bin-#{@hadoop}.tgz -C #{@hadoop_apps} && rm -f shark-#{@shark_version}-bin-#{@hadoop}.tgz"
  sudo "tar xzf scala-#{@scala_version}.tgz -C #{@hadoop_apps} && rm -f scala-#{@scala_version}.tgz"
end
def create_symlinks
  sudo "ln -sf #{@hadoop_apps}/spark-1.0.2-bin-hadoop2 #{@hadoop_home}/spark"
  sudo "ln -sf #{@hadoop_apps}/shark-#{@shark_version}-bin-#{@hadoop} #{@hadoop_home}/shark"
end
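# Export SCALA_HOME for login shells. Note: this helper is never called from the main sequence below.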
def write_to_bashrc
  File.open('/home/hadoop/.bashrc','a') do |file_w|
    file_w.write("export SCALA_HOME=#{@hadoop_apps}/scala-#{@scala_version}\n")
  end
end
def mk_local_dir
  sudo "mkdir #{@local_dir}"
end
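# Replace Hadoop's bundled Guava 11.0.2 with the newer Guava 14.0.1 that Spark expects.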
def update_guava
  sudo "rm -f #{@hadoop_home}/share/hadoop/common/lib/guava-11.0.2.jar"
  sudo "curl -L --silent --show-error --fail --connect-timeout 60 --max-time 720 --retry 5 -O http://search.maven.org/remotecontent?filepath=com/google/guava/guava/14.0.1/guava-14.0.1.jar"
  sudo "mv guava-14.0.1.jar #{@hadoop_home}/share/hadoop/common/lib/"
end
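# Write spark-env.sh: master IP, Scala home, local scratch dirs, a classpath covering
# the EMR/Hadoop libs plus the LZO jar, and GC logging options for the daemons.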
def create_spark_env
  lzo_jar=Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/**/hadoop-*lzo.jar")[0]
  if lzo_jar.nil?
    lzo_jar=Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/**/hadoop-*lzo*.jar")[0]
  end
  if lzo_jar.nil?
    println "lzo not found inside #{@hadoop_apps}/#{@hadoop_version}/share/"
  end
  File.open('/tmp/spark-env.sh','w') do |file_w|
    file_w.write("export SPARK_MASTER_IP=#{@master_ip}\n")
    file_w.write("export SCALA_HOME=#{@hadoop_apps}/scala-#{@scala_version}\n")
    file_w.write("export SPARK_LOCAL_DIRS=#{@local_dir}\n")
    file_w.write("export SPARK_CLASSPATH=\"/usr/share/aws/emr/emr-fs/lib/*:/usr/share/aws/emr/lib/*:#{@hadoop_home}/share/hadoop/common/lib/*:#{lzo_jar}\"\n")
    file_w.write("export SPARK_DAEMON_JAVA_OPTS=\"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps\"\n")
  end
  sudo "mv /tmp/spark-env.sh #{@hadoop_home}/spark/conf/spark-env.sh"
end
def create_spark_defaults
  File.open('/tmp/spark-defaults.conf','w') do |file_w|
    file_w.write("spark.master spark://#{@master_ip}:7077\n")
  end
  sudo "mv /tmp/spark-defaults.conf #{@hadoop_home}/spark/conf/spark-defaults.conf"
end
def create_shark_env
  File.open('/tmp/shark-env.sh','w') do |file_w|
    file_w.write("export SPARK_HOME=/home/hadoop/spark\n")
    file_w.write("export SPARK_MEM=1g\n")
    file_w.write("export SHARK_MASTER_MEM=1g\n")
    file_w.write("export _JAVA_OPTIONS=\"-Xmx2g\"\n")
    file_w.write("source /home/hadoop/spark/conf/spark-env.sh\n")
  end
  sudo "mv /tmp/shark-env.sh #{@hadoop_home}/shark/conf/shark-env.sh"
end
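# Copy the EMR-specific jars and core-site.xml into Shark/Spark so they pick up the cluster configuration.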
def copy_files_to_spark_shark
  gson_jar=Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/hadoop/common/**/gson*jar")[0]
  aws_sdk_jar=Dir.glob("/usr/share/aws/emr/hadoop-state-pusher/**/aws-java-sdk*.jar")[0]
  core_site_xml=Dir.glob("#{@hadoop_home}/conf/**/core-site.xml")[0]
  hadoop_common_jar=Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/hadoop/common/hadoop-common-#{@hadoop_version}.jar")[0]
  emr_metrics_jar=Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/share/hadoop/common/**/EmrMetrics-*.jar")[0]
  shark_jars="#{@hadoop_home}/shark/lib_managed/jars/"
  sudo "cp #{gson_jar} #{shark_jars}"
  sudo "cp #{aws_sdk_jar} #{shark_jars}"
  sudo "cp #{emr_metrics_jar} #{shark_jars}"
  sudo "cp #{hadoop_common_jar} #{shark_jars}"
  # copy core-site.xml to Spark and Shark
  sudo "cp #{core_site_xml} #{@hadoop_home}/spark/conf/"
  sudo "cp #{core_site_xml} #{@hadoop_home}/shark/conf/"
end
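# Poll the Spark master on port 7077 (up to 20 attempts, 5 seconds apart); returns false if it never comes up.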
def test_connection_with_master
  attempt=0
  until system("nc -z #{@master_ip} 7077")
    attempt += 1
    if attempt < 20
      sleep(5)
    else
      break
    end
  end
  if attempt == 20
    return false
  else
    return true
  end
end
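# Main sequence: install and configure everything, then start the master or a worker.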
download_from_s3
untar_all
create_symlinks
mk_local_dir
update_guava
create_spark_env
create_shark_env
create_spark_defaults
copy_files_to_spark_shark
# remove hadoop-core from Shark's managed jars
hadoop_core_jar=Dir.glob("/home/hadoop/shark/lib_managed/jars/**/hadoop-core*jar")[0]
sudo "rm -rf #{hadoop_core_jar}"
if @is_master then
  sudo "#{@hadoop_home}/spark/sbin/start-master.sh"
else
  if test_connection_with_master
    sudo "#{@hadoop_home}/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://#{@master_ip}:7077 >> /tmp/registeringWorkerLog.log 2>&1 &"
  else
    raise RuntimeError, 'Worker not able to connect to master'
  end
end
Indeed, I should specify that this is meant for AMI 3.0.3 or 3.0.4 (which is the latest AMI with Hadoop 2.2.0 at the time of this writing).
Thank you for doing this. I gave it a try and ran the test example from the article: https://aws.amazon.com/articles/Elastic-MapReduce/4926593393724923
The example gives the expected output, but on the last line of the example, I get the traceback below.
Any reason lzo isn't in the build?
scala> val sortedList = reducedList.map(x => (x._2, x._1)).sortByKey(false).take(50)
14/09/04 18:51:43 INFO spark.SparkContext: Starting job: sortByKey at <console>:17
14/09/04 18:51:45 ERROR lzo.GPLNativeCodeLoader: Could not load native gpl library
java.lang.UnsatisfiedLinkError: no gplcompression in java.library.path
at java.lang.ClassLoader.loadLibrary(ClassLoader.java:1886)
at java.lang.Runtime.loadLibrary0(Runtime.java:849)
...
14/09/04 18:51:45 ERROR lzo.LzoCodec: Cannot load native-lzo without native-hadoop
14/09/04 18:51:47 INFO mapred.FileInputFormat: Total input paths to process : 1
This answer suggests that lib/native needs to be added to the native library path: http://stackoverflow.com/a/10038290/541202
I tried adding it to the SPARK_CLASSPATH but that didn't help.
file_w.write("export SPARK_CLASSPATH=\"/usr/share/aws/emr/emr-fs/lib/*:/usr/share/aws/emr/lib/*:#{@hadoop_home}/share/hadoop/common/lib/*:#{lzo_jar}:#{@hadoop_apps}/#{@hadoop_version}/lib/native/*\"\n")
Did you try using the commands I describe in my SO answer?
http://stackoverflow.com/questions/25420861/aws-emr-and-spark-1-0-0/25420862#25420862
If I remember correctly, I had issues just exporting SPARK_CLASSPATH; that's why I used those arguments!
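For anyone who would rather bake the native-LZO fix into the bootstrap script instead of passing library-path arguments to spark-shell, here is a rough, untested sketch. It assumes the native libraries (libgplcompression and friends) live under #{@hadoop_apps}/#{@hadoop_version}/lib/native on this AMI; the helper name and the temp-file path are made up for illustration, and it would have to be called after create_spark_env and before the master/worker is started.
def append_native_lib_path
  # Assumed location of the Hadoop native libraries on this AMI; adjust the glob if it differs.
  native_dir = Dir.glob("#{@hadoop_apps}/#{@hadoop_version}/lib/native*")[0]
  if native_dir.nil?
    println "no native library dir found under #{@hadoop_apps}/#{@hadoop_version}/lib"
    return
  end
  File.open('/tmp/spark-env-native.sh','w') do |file_w|
    # On Linux JVMs, LD_LIBRARY_PATH entries end up on java.library.path,
    # which is where GPLNativeCodeLoader looks for libgplcompression.
    file_w.write("export LD_LIBRARY_PATH=#{native_dir}:$LD_LIBRARY_PATH\n")
  end
  sudo "bash -c 'cat /tmp/spark-env-native.sh >> #{@hadoop_home}/spark/conf/spark-env.sh'"
end
Since both the standalone daemons and spark-shell source spark-env.sh, that should be enough to get libgplcompression onto java.library.path.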
Don't worry about it, I was trying to install on Hadoop 1.x. Thanks!