@howardatwork
Last active June 29, 2016 09:02
install spark
#!/bin/bash
sudo apt-get -y update
sudo apt-get -y install software-properties-common
# Installation of Java. Install only one of the JDK options below!
#default JDK
sudo apt-get -y install default-jdk
java -version
#Optional: OpenJDK 7
#sudo apt-get install openjdk-7-jdk
#This is for oracle JDK optional
#sudo add-apt-repository -y ppa:webupd8team/java
#sudo apt-get -y update
#sudo apt-get -y install oracle-java7-installer
#Manually run this later if you want to change JDK
#sudo update-alternatives --config java|javac|...
# Installation of commonly used python scipy tools
#sudo apt-get -y install python-numpy python-scipy python-matplotlib ipython ipython-notebook python-pandas python-sympy python-nose
#Add a new group "hadoop" and a dedicated hadoop user "hduser". It is not mandatory, but keeping the Hadoop installation under its own user is recommended:
#sudo addgroup hadoop
#sudo adduser --ingroup hadoop hduser
#sudo adduser hduser sudo
#Set up password-less SSH access to the local machine
ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa
cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys
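# Optional sanity check (not in the original script): confirm password-less SSH works and
# accept the localhost host key now, since start-dfs.sh/start-yarn.sh will ssh to localhost.
#ssh -o StrictHostKeyChecking=no localhost true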
#wget http://mirror.tcpdiag.net/apache/hive/stable/apache-hive-1.2.1-bin.tar.gz
wget http://apache.osuosl.org/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz
tar zxvf hadoop-2.7.1.tar.gz
cat >>~/.bashrc <<EOF
export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64
export HADOOP_INSTALL=\$HOME/hadoop-2.7.1
export HADOOP_HOME=\$HOME/hadoop-2.7.1
export PATH=\$PATH:\$HADOOP_INSTALL/bin:\$HADOOP_INSTALL/sbin
export HADOOP_MAPRED_HOME=\$HADOOP_INSTALL
export HADOOP_COMMON_HOME=\$HADOOP_INSTALL
export HADOOP_HDFS_HOME=\$HADOOP_INSTALL
export YARN_HOME=\$HADOOP_INSTALL
export HADOOP_COMMON_LIB_NATIVE_DIR=\$HADOOP_INSTALL/lib/native
export HADOOP_OPTS="-Djava.library.path=\$HADOOP_INSTALL/lib/native"
export HADOOP_CONF_DIR=\$HADOOP_INSTALL/etc/hadoop
EOF
export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64
export HADOOP_INSTALL=$HOME/hadoop-2.7.1
export HADOOP_HOME=$HOME/hadoop-2.7.1
export PATH=$PATH:$HADOOP_INSTALL/bin:$HADOOP_INSTALL/sbin
export HADOOP_MAPRED_HOME=$HADOOP_INSTALL
export HADOOP_COMMON_HOME=$HADOOP_INSTALL
export HADOOP_HDFS_HOME=$HADOOP_INSTALL
export YARN_HOME=$HADOOP_INSTALL
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_INSTALL/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_INSTALL/lib/native"
export HADOOP_CONF_DIR=$HADOOP_INSTALL/etc/hadoop
#install snappy compression
sudo apt-get install -y libsnappy-dev
cp /usr/lib/libsnappy* $HADOOP_HOME/lib/native/
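# Optional check (not in the original script): "hadoop checknative -a" lists the native
# libraries Hadoop can load, so it can confirm the snappy library copied above is picked up.
#hadoop checknative -a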
sed -i 's/\${JAVA_HOME}/\/usr\/lib\/jvm\/java-7-openjdk-amd64/g' $HADOOP_CONF_DIR/hadoop-env.sh
cat >$HADOOP_CONF_DIR/core-site.xml <<EOF
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
EOF
cat >$HADOOP_CONF_DIR/yarn-site.xml <<EOF
<configuration>
<!-- Site specific YARN configuration properties -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
EOF
cat >$HADOOP_CONF_DIR/hdfs-site.xml <<EOF
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.name.dir</name>
<value>file:///home/vagrant/hadoopinfra/hdfs/namenode</value>
</property>
<property>
<name>dfs.data.dir</name>
<value>file:///home/vagrant/hadoopinfra/hdfs/datanode</value>
</property>
</configuration>
EOF
cat >$HADOOP_CONF_DIR/mapred-site.xml <<EOF
<configuration>
<property>
<name>mapred.job.tracker</name>
<value>localhost:54311</value>
<description>The host and port that the MapReduce job tracker runs
at. If "local", then jobs are run in-process as a single map
and reduce task.
</description>
</property>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
EOF
#format the HDFS
hadoop namenode -format
# start Hadoop. Note that start-all.sh is deprecated.
#start-dfs.sh
#start-yarn.sh
#Accessing Hadoop in the browser
#The NameNode web UI listens on port 50070 by default. Use the following URL to reach it:
#http://localhost:50070/
#check the Yarn UI: http://localhost:8088/cluster
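# Optional check (not in the original script): after starting the daemons, "jps" (ships with
# the JDK) should list NameNode, DataNode, SecondaryNameNode, ResourceManager and NodeManager.
#jps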
# Installation of scala
wget http://www.scala-lang.org/files/archive/scala-2.10.6.deb
sudo dpkg -i scala-2.10.6.deb
sudo apt-get install -y -f
# Installation of sbt
wget https://dl.bintray.com/sbt/debian/sbt-0.13.8.deb
sudo dpkg -i sbt-0.13.8.deb
# Downloading spark
#wget http://d3kbcqa49mib13.cloudfront.net/spark-1.5.1.tgz
#tar -zxf spark-1.5.1.tgz
#cd spark-1.5.1
wget http://d3kbcqa49mib13.cloudfront.net/spark-1.5.2-bin-hadoop2.6.tgz
tar -zxf spark-1.5.2-bin-hadoop2.6.tgz
cp spark-1.5.2-bin-hadoop2.6/conf/spark-env.sh.template spark-1.5.2-bin-hadoop2.6/conf/spark-env.sh
cp spark-1.5.2-bin-hadoop2.6/conf/spark-defaults.conf.template spark-1.5.2-bin-hadoop2.6/conf/spark-defaults.conf
cp spark-1.5.2-bin-hadoop2.6/conf/log4j.properties.template spark-1.5.2-bin-hadoop2.6/conf/log4j.properties
cp spark-1.5.2-bin-hadoop2.6/conf/slaves.template spark-1.5.2-bin-hadoop2.6/conf/slaves
mkdir spark-1.5.2-bin-hadoop2.6/logs
cat >>~/.bashrc <<EOF
export SPARK_HOME=\$HOME/spark-1.5.2-bin-hadoop2.6
export SPARK_CONF_DIR=\$SPARK_HOME/conf
export PATH=\$PATH:\$SPARK_HOME/bin
export LIVY_HOME=\$HOME/hue/apps/spark/java
EOF
export SPARK_HOME=$HOME/spark-1.5.2-bin-hadoop2.6
export SPARK_CONF_DIR=$SPARK_HOME/conf
export PATH=$PATH:$SPARK_HOME/bin
export LIVY_HOME=$HOME/hue/apps/spark/java
cat >>~/spark-1.5.2-bin-hadoop2.6/conf/spark-env.sh <<EOF
export SPARK_MASTER_IP=localhost
export SPARK_WORKER_CORES=1
export SPARK_WORKER_MEMORY=800m
export SPARK_WORKER_INSTANCES=1
EOF
cat >>~/spark-1.5.2-bin-hadoop2.6/conf/spark-defaults.conf <<EOF
spark.driver.extraClassPath $HOME/hadoop-2.7.1/share/hadoop/tools/lib/hadoop-azure-2.7.1.jar:$HOME/hadoop-2.7.1/share/hadoop/tools/lib/azure-storage-2.0.0.jar
EOF
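# Optional smoke test (not in the original script): run the SparkPi example bundled with the
# binary distribution to confirm the Spark install works locally.
#~/spark-1.5.2-bin-hadoop2.6/bin/run-example SparkPi 10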
#mvn -Pyarn -Phadoop-2.6 -Dhadoop.version=2.6.0 -Phive -Phive-thriftserver -Dscala-2.10.0 -DskipTests clean package
#Livy server
sudo apt-get install -y maven
sudo apt-get install -y git
git clone https://github.com/cloudera/hue.git
cd hue/apps/spark/java
mvn -DskipTests -Dspark.version=1.5.2 clean package
#./bin/livy-server &
cd
cp ~/hue/apps/spark/java/conf/livy-defaults.conf.template ~/hue/apps/spark/java/conf/livy-defaults.conf
cat >>~/hue/apps/spark/java/conf/livy-defaults.conf <<EOF
livy.server.session.factory = yarn
EOF
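# Optional smoke test (not in the original script), assuming the standard Livy REST API of this
# Hue build: start the server, create a Spark session, then list sessions.
#$LIVY_HOME/bin/livy-server &
#curl -X POST -H 'Content-Type: application/json' -d '{"kind":"spark"}' http://localhost:8998/sessions
#curl http://localhost:8998/sessions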
sudo apt-get install -y zookeeperd
wget http://mirrors.sonic.net/apache/kafka/0.8.2.2/kafka_2.10-0.8.2.2.tgz
tar zxvf kafka_2.10-0.8.2.2.tgz
cat >>~/kafka_2.10-0.8.2.2/config/server.properties <<EOF
delete.topic.enable = true
EOF
#start kafka with nohup
# ~/kafka_2.10-0.8.2.2/bin/kafka-server-start.sh -daemon ~/kafka_2.10-0.8.2.2/config/server.properties
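# Optional smoke test (not in the original script): with ZooKeeper and Kafka running, create and
# list a test topic using the scripts bundled with Kafka 0.8.2.
#~/kafka_2.10-0.8.2.2/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test
#~/kafka_2.10-0.8.2.2/bin/kafka-topics.sh --list --zookeeper localhost:2181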
wget http://apache.claz.org/hive/stable/apache-hive-1.2.1-bin.tar.gz
tar zxvf apache-hive-1.2.1-bin.tar.gz
cat >>~/.bashrc <<EOF
export HIVE_HOME=\$HOME/apache-hive-1.2.1-bin
export HCAT_HOME=~/apache-hive-1.2.1-bin/hcatalog
export PATH=\$PATH:\$HIVE_HOME/bin:\$HCAT_HOME/bin
#export CLASSPATH=\$CLASSPATH:\$HADOOP_HOME/lib/*:\$HIVE_HOME/lib/*
EOF
export HIVE_HOME=$HOME/apache-hive-1.2.1-bin
export HCAT_HOME=~/apache-hive-1.2.1-bin/hcatalog
export PATH=$PATH:$HIVE_HOME/bin:$HCAT_HOME/bin
#export CLASSPATH=$CLASSPATH:$HADOOP_HOME/lib/*:$HIVE_HOME/lib/*
cp $HIVE_HOME/conf/hive-env.sh.template $HIVE_HOME/conf/hive-env.sh
cat >>$HIVE_HOME/conf/hive-env.sh <<EOF
export HADOOP_HOME=$HADOOP_HOME
EOF
#use default derby db
wget http://archive.apache.org/dist/db/derby/db-derby-10.10.2.0/db-derby-10.10.2.0-bin.tar.gz
tar zxvf db-derby-10.10.2.0-bin.tar.gz
cat >>~/.bashrc <<EOF
export DERBY_HOME=~/db-derby-10.10.2.0-bin
export PATH=\$PATH:\$DERBY_HOME/bin
export CLASSPATH=\$CLASSPATH:\$DERBY_HOME/lib/derbyclient.jar:\$DERBY_HOME/lib/derbytools.jar
EOF
export DERBY_HOME=~/db-derby-10.10.2.0-bin
export PATH=$PATH:$DERBY_HOME/bin
export CLASSPATH=$CLASSPATH:$DERBY_HOME/lib/derbyclient.jar:$DERBY_HOME/lib/derbytools.jar
mkdir $DERBY_HOME/data
#copy the derby jdbc driver to hadoop and hive
cp $DERBY_HOME/lib/derbyclient.jar $HIVE_HOME/lib/
#config HIVE metastore to use derby
cat >$HIVE_HOME/conf/hive-site.xml <<EOF
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:derby://localhost:1527/metastore_db;create=true</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
</configuration>
EOF
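# Optional smoke test (not in the original script): with HDFS up and the Derby network server
# running (startNetworkServer -h 0.0.0.0 &), Hive should be able to create its metastore.
#hive -e 'SHOW DATABASES;'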
#create a symlink for spark to hive-site.xml
ln -s $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/hive-site.xml
cat >$HIVE_HOME/conf/jpox.properties <<EOF
javax.jdo.PersistenceManagerFactoryClass = org.jpox.PersistenceManagerFactoryImpl
org.jpox.autoCreateSchema = false
org.jpox.validateTables = false
org.jpox.validateColumns = false
org.jpox.validateConstraints = false
org.jpox.storeManagerType = rdbms
org.jpox.autoCreateSchema = true
org.jpox.autoStartMechanismMode = checked
org.jpox.transactionIsolation = read_committed
javax.jdo.option.DetachAllOnCommit = true
javax.jdo.option.NontransactionalRead = true
javax.jdo.option.ConnectionDriverName = org.apache.derby.jdbc.ClientDriver
javax.jdo.option.ConnectionURL = jdbc:derby://localhost:1527/metastore_db;create=true
javax.jdo.option.ConnectionUserName = APP
javax.jdo.option.ConnectionPassword = mine
EOF
#Run these manually if something goes wrong with Hive; Hive should auto-create them.
# $HADOOP_HOME/bin/hadoop fs -mkdir /tmp
# $HADOOP_HOME/bin/hadoop fs -mkdir /user/hive/warehouse
# $HADOOP_HOME/bin/hadoop fs -chmod g+w /tmp
# $HADOOP_HOME/bin/hadoop fs -chmod g+w /user/hive/warehouse
wget http://mirrors.ocf.berkeley.edu/apache/hbase/0.98.17/hbase-0.98.17-hadoop2-bin.tar.gz
tar zxvf hbase-0.98.17-hadoop2-bin.tar.gz
#Set JAVA_HOME for HBase and tell HBase not to manage or start its own ZooKeeper; zookeeperd is already installed.
cat >>~/hbase-0.98.17-hadoop2/conf/hbase-env.sh <<EOF
export JAVA_HOME=$JAVA_HOME
export HBASE_MANAGES_ZK=false
EOF
cat >>~/.bashrc <<EOF
export HBASE_HOME=~/hbase-0.98.17-hadoop2
export PATH=\$PATH:\$HBASE_HOME/bin
EOF
export HBASE_HOME=~/hbase-0.98.17-hadoop2
export PATH=$PATH:$HBASE_HOME/bin
cat >~/hbase-0.98.17-hadoop2/conf/hbase-site.xml <<EOF
<configuration>
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<!-- Set the path where you want HBase to store its files. -->
<property>
<name>hbase.rootdir</name>
<value>hdfs://localhost:9000/hbase</value>
</property>
<!-- Set the path where you want HBase's built-in ZooKeeper to store its files. -->
<property>
<name>hbase.zookeeper.property.dataDir</name>
<value>/home/vagrant/zookeeperdata</value>
</property>
</configuration>
EOF
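# Optional smoke test (not in the original script): with HDFS, ZooKeeper and HBase running,
# the "status" command in the HBase shell should report the running servers.
#echo "status" | $HBASE_HOME/bin/hbase shell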
wget https://dist.apache.org/repos/dist/release/kylin/apache-kylin-1.2/apache-kylin-1.2-bin.tar.gz
tar zxvf apache-kylin-1.2-bin.tar.gz
cat >>~/.bashrc <<EOF
export KYLIN_HOME=~/apache-kylin-1.2-bin
export PATH=\$PATH:\$KYLIN_HOME/bin
EOF
export KYLIN_HOME=~/apache-kylin-1.2-bin
export PATH=$PATH:$KYLIN_HOME/bin
#Add this to kylin.properties for Kylin 1.2 because the HBase binary distribution is built against Hadoop 2.2, not 2.4+.
#Kylin uses this URL to check cube build status (per the Kylin forum).
cat >>$KYLIN_HOME/conf/kylin.properties <<EOF
kylin.job.yarn.app.rest.check.status.url=http://localhost:8088/ws/v1/cluster/apps/\${job_id}?anonymous=true
EOF
#Need to create /kylin manually in HDFS if it's not there
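# For example (not in the original script):
#$HADOOP_HOME/bin/hadoop fs -mkdir -p /kylin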
#create handy startall.sh and stopall.sh. Modify as needed
cat >>$HOME/startall.sh <<EOF
start-dfs.sh
start-yarn.sh
startNetworkServer -h 0.0.0.0 &
start-hbase.sh
EOF
chmod a+x $HOME/startall.sh
cat >>$HOME/stopall.sh <<EOF
stop-dfs.sh
stop-yarn.sh
stopNetworkServer -h 0.0.0.0
stop-hbase.sh
EOF
chmod a+x $HOME/stopall.sh
#Useful info:
#ports
#Livy - 8998
#hdfs name node - 9000
#namenode UI - 50070
#Yarn cluster UI - 8088
#zookeeper - 2181
#kafka - 9092
#hbase UI - http://localhost:60010
#kylin UI = 7070
#start services
#start-dfs.sh
#start-yarn.sh
#local mode for livy:
#./hue/apps/spark/java/bin/livy-server &
#env \
# LIVY_SERVER_JAVA_OPTS="-Dlivy.server.session.factory=yarn" \
# CLASSPATH=`hadoop classpath` \
# $LIVY_HOME/bin/livy-server &
# ~/kafka_2.10-0.8.2.2/bin/kafka-server-start.sh -daemon ~/kafka_2.10-0.8.2.2/config/server.properties
#start derby
#startNetworkServer -h 0.0.0.0
# ./hbase-0.98.17-hadoop2/bin/start-hbase.sh
#run $KYLIN_HOME/bin/sample.sh to create sample kylin project
# $KYLIN_HOME/bin/kylin.sh start
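#Kylin web UI after startup: http://localhost:7070/kylin (default credentials are ADMIN/KYLIN per the Kylin docs; confirm for your version)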