#!/bin/bash -ex

if [ "$(cat /mnt/var/lib/info/instance.json | jq -r .isMaster)" == "true" ]; then

# Install Git
sudo yum -y install git

# Install Maven
wget -P /tmp http://apache.mirrors.spacedump.net/maven/maven-3/3.3.3/binaries/apache-maven-3.3.3-bin.tar.gz
sudo mkdir /opt/apache-maven
sudo tar -xvzf /tmp/apache-maven-3.3.3-bin.tar.gz -C /opt/apache-maven

cat <<EOF >> /home/hadoop/.bashrc
# Maven
export MAVEN_HOME=/opt/apache-maven/apache-maven-3.3.3
export PATH=\$MAVEN_HOME/bin:\$PATH
EOF
source /home/hadoop/.bashrc

# Install Zeppelin
git clone https://github.com/apache/incubator-zeppelin.git /home/hadoop/zeppelin
cd /home/hadoop/zeppelin
mvn clean package -Pspark-1.4 -Dhadoop.version=2.6.0 -Phadoop-2.6 -Pyarn -DskipTests
# Configure Zeppelin
SPARK_DEFAULTS=/usr/lib/spark/conf/spark-defaults.conf
declare -a ZEPPELIN_JAVA_OPTS
if [ -f $SPARK_DEFAULTS ]; then
  # Forward the cluster-sized Spark settings to Zeppelin as -D system properties
  for property in spark.executor.instances spark.executor.cores spark.executor.memory \
      spark.default.parallelism spark.yarn.executor.memoryOverhead; do
    ZEPPELIN_JAVA_OPTS+=($(grep $property $SPARK_DEFAULTS | awk '{print "-D" $1 "=" $2}'))
  done
fi
echo "${ZEPPELIN_JAVA_OPTS[@]}"
cp conf/zeppelin-env.sh.template conf/zeppelin-env.sh
cat <<EOF >> conf/zeppelin-env.sh
export MASTER=yarn-client
export HADOOP_HOME=/usr/lib/hadoop
export HADOOP_CONF_DIR=/etc/hadoop/conf
export ZEPPELIN_SPARK_USEHIVECONTEXT=false
export ZEPPELIN_JAVA_OPTS="${ZEPPELIN_JAVA_OPTS[@]}"
export PYTHONPATH=\$PYTHONPATH:/usr/lib/spark/python
EOF
cat <<'EOF' > 0001-Add-Hadoop-libraries-and-EMRFS-to-Zeppelin-classpath.patch
From 2b0226e45207758d526522bd22d497c9def7c008 Mon Sep 17 00:00:00 2001
From: Anders Hammar <[email protected]>
Date: Fri, 18 Sep 2015 10:24:18 +0000
Subject: [PATCH] Add Hadoop libraries and EMRFS to Zeppelin classpath

---
 bin/interpreter.sh | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/bin/interpreter.sh b/bin/interpreter.sh
index e03a13b..de458f2 100755
--- a/bin/interpreter.sh
+++ b/bin/interpreter.sh
@@ -89,8 +89,21 @@ if [[ "${INTERPRETER_ID}" == "spark" ]]; then
     # CDH
     addJarInDir "${HADOOP_HOME}"
     addJarInDir "${HADOOP_HOME}/lib"
+
+    # Hadoop libraries
+    addJarInDir "${HADOOP_HOME}/../hadoop-hdfs"
+    addJarInDir "${HADOOP_HOME}/../hadoop-mapreduce"
+    addJarInDir "${HADOOP_HOME}/../hadoop-yarn"
+
+    # Hadoop LZO
+    addJarInDir "${HADOOP_HOME}/../hadoop-lzo/lib"
   fi
 
+  # Add EMRFS libraries
+  addJarInDir "/usr/share/aws/emr/emrfs/conf"
+  addJarInDir "/usr/share/aws/emr/emrfs/lib"
+  addJarInDir "/usr/share/aws/emr/emrfs/auxlib"
+
   addJarInDir "${INTERPRETER_DIR}/dep"
 
   PYSPARKPATH="${ZEPPELIN_HOME}/interpreter/spark/pyspark/pyspark.zip:${ZEPPELIN_HOME}/interpreter/spark/pyspark/py4j-0.8.2.1-src.zip"
--
2.1.0
EOF

git config user.email "[email protected]"
git config user.name "Your Name"
git am 0001-Add-Hadoop-libraries-and-EMRFS-to-Zeppelin-classpath.patch

# Start the Zeppelin daemon
bin/zeppelin-daemon.sh start

fi
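For context on how a script like this is used: it is typically handed to Amazon EMR as a bootstrap action (the isMaster check at the top is the usual way to run master-only steps). A minimal sketch with the AWS CLI; the bucket, key name, and instance settings are placeholders, not part of the original gist:

aws emr create-cluster \
  --name "Spark cluster with Zeppelin" \
  --release-label emr-4.0.0 \
  --applications Name=Spark \
  --ec2-attributes KeyName=my-key-pair \
  --use-default-roles \
  --instance-type m3.xlarge \
  --instance-count 3 \
  --bootstrap-actions Path=s3://my-bucket/install-zeppelin.sh,Name=InstallZeppelin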
Hi Anders!
I launched an EMR cluster with the same software versions as shown here and applied your file manually.
When I open Zeppelin and try to actually run something in Scala in a notebook, I get the error below. Have you ever bumped into this?
val people = sc.textFile("s3://mybucket/storage-archive/run=2015-08-15*")
people.take(10)
people: org.apache.spark.rdd.RDD[String] = s3://mybucket/storage-archive/run=2015-08-15* MapPartitionsRDD[3] at textFile at <console>:23
java.lang.RuntimeException: Error in configuring object
at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:109)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:75)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:133)
at org.apache.spark.rdd.HadoopRDD.getInputFormat(HadoopRDD.scala:186)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:199)
...
Caused by: java.lang.ClassNotFoundException: Class com.hadoop.compression.lzo.LzoCodec not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:1801)
at org.apache.hadoop.io.compress.CompressionCodecFactory.getCodecClasses(CompressionCodecFactory.java:128)
... 59 more
I think you can add the lines below to common.sh:
+ addJarInDir "${SPARK_HOME}/classpath/emr"
+ addJarInDir "${SPARK_HOME}/classpath/emrfs"
+ addJarInDir "${HADOOP_HOME}/share/hadoop/common/lib"
+ addJarInDir "${HADOOP_HOME}/share/hadoop/common/lib/hadoop-lzo.jar"
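Before wiring those into common.sh, it may be worth confirming where the LZO jar actually lives on the master node; a quick check (the search roots are assumptions about the EMR filesystem layout):

# Locate the Hadoop LZO jar so the directory passed to addJarInDir is the right one
find /usr/lib /usr/share/aws -name 'hadoop-lzo*.jar' 2>/dev/null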
Thanks for all your comments. I recently updated the script so that it works with Amazon EMR release 4.x clusters.
Note that the script still uses Zeppelin's bundled Spark installation, since I haven't yet been able to get Zeppelin to work with Amazon's Spark installation (via SPARK_HOME/HADOOP_HOME).
It doesn't look like a good idea to patch master this way: Zeppelin keeps updating this file, so the patch stops applying...
Anyway, I'm stuck with "org.apache.spark.SparkException: Found both spark.driver.extraClassPath and SPARK_CLASSPATH. Use only the former."
@mwacc,
I had the same problem and fixed it by changing ./bin/interpreter.sh: append Zeppelin's jars to the plain Java classpath,
CLASSPATH+=":${ZEPPELIN_CLASSPATH}"
instead of exporting them through SPARK_CLASSPATH, which is the line that triggers "SparkException: Found both spark.driver.extraClassPath and SPARK_CLASSPATH. Use only the former":
export SPARK_CLASSPATH+=":${CLASSPATH}"
spark-submit refuses to run when both SPARK_CLASSPATH and spark.driver.extraClassPath are set, so dropping the export avoids the conflict. The launch block itself stays unchanged:
if [[ -n "${SPARK_SUBMIT}" ]]; then
  ${SPARK_SUBMIT} --class ${ZEPPELIN_SERVER} --driver-java-options "${JAVA_INTP_OPTS}" ${SPARK_SUBMIT_OPTIONS} ${SPARK_APP_JAR} ${PORT} &
else
  ${ZEPPELIN_RUNNER} ${JAVA_INTP_OPTS} -cp ${CLASSPATH} ${ZEPPELIN_SERVER} ${PORT} &
fi
Hello guys,
I followed the instructions here and got Zeppelin working on an AWS EMR cluster with Spark, release label 4.4.0.
But now I've hit an issue when trying to add the R interpreter to Zeppelin. The Maven command I used is
mvn clean package -Pyarn -Pr -DskipTests
I am trying to use the Spark provided by the EMR cluster, so in zeppelin-env.sh I have this configuration (i.e. I added the SPARK_HOME variable):
export MASTER=yarn-client
export HADOOP_HOME=/usr/lib/hadoop
export SPARK_HOME=/usr/lib/spark
export HADOOP_CONF_DIR=/etc/hadoop/conf
export ZEPPELIN_SPARK_USEHIVECONTEXT=false
export ZEPPELIN_JAVA_OPTS="${ZEPPELIN_JAVA_OPTS[@]}"
export PYTHONPATH=$PYTHONPATH:/usr/lib/spark/python
But now it doesn't work even for the Spark interpreter.
There is another small issue I want to ask about: when I change something in zeppelin-env.sh, save the file, and restart zeppelin-daemon, I should expect the change to be applied immediately, right? Or do I need to do something else? Right now I don't see changes made in zeppelin-env.sh take effect after restarting Zeppelin.
Thanks in advance
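For what it's worth, conf/zeppelin-env.sh is only sourced when the daemon starts, so a full stop/start (or restart) is needed after editing it; a minimal sketch, assuming the install location used in the script above:

cd /home/hadoop/zeppelin
# stop and start the daemon so conf/zeppelin-env.sh is re-read
bin/zeppelin-daemon.sh restart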
Hello guys, can this be run on an existing EMR cluster?
Hi, I am using Zeppelin on the new EMR, as it's now a built-in option. I notice there's no %dep interpreter. How are we supposed to add external packages?
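One possible workaround, assuming Zeppelin is configured with SPARK_HOME so the Spark interpreter is launched through spark-submit: pull extra Maven artifacts in via SPARK_SUBMIT_OPTIONS in conf/zeppelin-env.sh. The coordinate below is only an example:

# Sketch: have spark-submit fetch an extra package for the Spark interpreter
export SPARK_SUBMIT_OPTIONS="--packages com.databricks:spark-csv_2.10:1.5.0"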
Has anyone got it working on EMR 5.8.0?
Since there were no answers, I've created a simple bootstrap script to install Zeppelin 0.8 on EMR (tested with EMR 5.16.0).
Thanks, Anders. It was very helpful for setting up Apache Zeppelin on an AWS EMR cluster. I just added the installation of nodejs and npm to your script:
sudo yum -y install nodejs npm --enablerepo=epel
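If you fold that into the gist, it slots in next to the existing yum install near the top of the script; a sketch (the reason given in the comment, Zeppelin's web UI build, is my assumption):

# Install Git, plus Node.js and npm from EPEL for Zeppelin's web UI build
sudo yum -y install git
sudo yum -y install nodejs npm --enablerepo=epel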