Skip to content

Instantly share code, notes, and snippets.

@sblack4
Last active September 10, 2018 16:51
Show Gist options
  • Select an option

  • Save sblack4/b8d904acb0a3d7a256a7ff2d53e58cb0 to your computer and use it in GitHub Desktop.

Select an option

Save sblack4/b8d904acb0a3d7a256a7ff2d53e58cb0 to your computer and use it in GitHub Desktop.
Bootstrap files for the Realtime Analytics workshop
#!/bin/bash
#
# Bootstrap script for the Realtime Analytics workshop.
#
# by http://sblack4.github.io - [email protected]
# updated 2018-07-30
# credit goes to David Bayard for providing the majority of this script
# see his journey 2 (where I got a lot of this) below, I highly recommend it!
# https://github.com/oracle/learning-library/tree/master/workshops/journey2-new-data-lake
echo "Command being run: $0"

# Global variables: where this script lives and which host it runs on.
DIRNAME=$(dirname "$0")          # directory component of the invocation path
BASENAME=$(basename "$0")        # script file name
BASEDIR=$(cd "$DIRNAME" && pwd)  # absolute path of the script's directory
_HOSTNAME=$(hostname -f)         # FQDN; used later to pick the Zeppelin node
function check_root() {
  # Abort unless running as root (uid 0); the yum and /etc/sudoers
  # steps later in the script require it.
  local euid
  euid=$(id -u)
  if [ "$euid" -ne 0 ]; then
    echo "This script must be run as root."
    echo "please try 'sudo ./bootstrap.sh' "
    exit 1
  fi
}
function update_yum() {
  # Refresh yum metadata, then pull in a few convenience tools.
  echo "cleaning up yum metadata just in case"
  yum clean metadata

  # mlocate provides `locate`; refresh its database right after install.
  echo "installing my favorite tools"
  yum install -y mlocate git vim wget
  updatedb
}
function get_obj_store_url() {
  # Resolve the base object-store URL into the global objectStoreURL.
  #
  # When run as part of instance creation this script lives under the
  # vm-scripts directory and the helper functions are already available;
  # when run by hand we must source the helpers ourselves first.
  local vm_scripts_dir="/u01/app/oracle/tools/bdce/bdcsce/impl-20/vm-scripts"
  if [ "${BASEDIR}" = "${vm_scripts_dir}" ]; then
    echo This is being run automatically.
  else
    echo This is being run manually.
    BASEDIR="${vm_scripts_dir}"
    # shellcheck disable=SC1090 — environment-provided helper scripts
    source "${BASEDIR}/constants.sh"
    source "${BASEDIR}/bdcsce_bootstrap_helper.sh" --source_only
  fi
  # getBaseObjectStoreUrl is defined by bdcsce_bootstrap_helper.sh
  objectStoreURL=$(getBaseObjectStoreUrl)
}
function sudo_zeppelin() {
  # Grant the zeppelin user passwordless sudo so notebook shell
  # paragraphs can run privileged commands.
  # Idempotent: only append the rule if it is not already present
  # (the original appended unconditionally, duplicating it on reruns).
  echo "setting up sudoers for zeppelin"
  if ! grep -q '^zeppelin ALL=(ALL) NOPASSWD: ALL$' /etc/sudoers; then
    echo 'zeppelin ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers
  fi
  echo "last 10 lines of /etc/sudoers"
  tail -10 /etc/sudoers
}
function import_notebooks() {
# Import the workshop's Zeppelin notebooks and tune interpreter settings.
# Intended to run only on the node whose FQDN matches an Ambari server
# node. Uses globals: _HOSTNAME (set at top of script), objectStoreURL
# (set by get_obj_store_url); getAmbariServerNodes comes from the sourced
# bdcsce helper scripts.
for i in $(getAmbariServerNodes); do
if [ ${_HOSTNAME} = $i ]; then
echo "running singleton Zeppelin section"
echo "downloading lab notebooks"
cd /tmp
git clone https://gist.github.com/7206c7bb83f505a3450844310d4e3f4d.git notebooks
# make sure not to use proxy server for this stuff
export no_proxy='127.0.0.1'
export NO_PROXY='127.0.0.1'
# import notebooks
# https://zeppelin.apache.org/docs/0.7.0/rest-api/rest-notebook.html#import-a-note
echo "importing lab notebooks"
# unzip -o Notes.zip
# Rewrite object-store references in notebook JSON to this instance's URL.
# NOTE(review): these sed calls run in /tmp, but the clone above placed
# the notebooks in /tmp/notebooks — confirm the *.json glob here targets
# the intended files.
sed -i -- "s~swift://\$CONTAINER.default~$objectStoreURL~g" *.json
sed -i -- "s~swift://journeyC.default~$objectStoreURL~g" *.json
# POST each notebook to the local Zeppelin REST API.
for note in /tmp/notebooks/*.json
do
echo $note
curl -X POST -d @"$note" -H "Content-Type: application/json" http://127.0.0.1:9995/api/notebook/import
done
# fix sh interpreter timeout and spark kafka dependency
# https://zeppelin.apache.org/docs/0.7.0/rest-api/rest-interpreter.html
echo "fixing sh interpreter timeout"
# Generate a Python 2 helper that PUTs updated interpreter settings to
# Zeppelin: raises the sh timeout to 3000000 ms and builds (but leaves
# commented out) the spark-streaming-kafka dependency update.
# Unquoted heredoc delimiter is intentional: the body contains no shell
# expansions that would be clobbered.
cat <<EOF > /tmp/sh_settings.py
#!/usr/local/bin/python
#based on https://community.hortonworks.com/articles/36031/sample-code-to-automate-interacting-with-zeppelin.html by Ali Bajwa
import time
def post_request(url, body):
import json, urllib2
encoded_body = json.dumps(body)
req = urllib2.Request(str(url), encoded_body)
req.get_method = lambda: 'PUT'
try:
response = urllib2.urlopen(req, encoded_body).read()
except urllib2.HTTPError, error:
print 'Exception: ' + error.read()
jsonresp = json.loads(response.decode('utf-8'))
print jsonresp['status']
import json, urllib2
zeppelin_int_url = 'http://127.0.0.1:9995/api/interpreter/setting/'
data = json.load(urllib2.urlopen(zeppelin_int_url))
for body in data['body']:
if body['group'] == 'sh':
shbody = body
elif body['group'] == 'spark':
sparkbody = body
shbody['properties']['shell.command.timeout.millisecs'] = '3000000'
post_request(zeppelin_int_url + shbody['id'], shbody)
#time.sleep(120)
my_dict = {'groupArtifactVersion': 'org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0', 'local': False}
sparkbody['dependencies'].append(my_dict)
#post_request(zeppelin_int_url + sparkbody['id'], sparkbody)
EOF
# cat /tmp/sh_settings.py
python /tmp/sh_settings.py
fi
# NOTE(review): this break sits outside the if, so only the FIRST node
# returned by getAmbariServerNodes is ever compared against _HOSTNAME —
# confirm this is intended (it works only if the match is the first node).
break
done
# end of the do this section only on 1 zeppelin server
}
function install_kafka_producer() {
  # Install Anaconda into $HOME/anaconda and fetch the sample Kafka
  # producer. NOTE(review): the log message says "ubuntu" but the rest
  # of this script uses yum; message kept as-is for log continuity.
  echo "starting bootstrap script to create a kafka producer on ubuntu"
  cd /opt || return 1
  # download and run the anaconda installer in silent batch mode (-b),
  # installing under $HOME/anaconda (-p)
  curl -O https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh
  chmod +x Anaconda3-5.0.1-Linux-x86_64.sh
  ./Anaconda3-5.0.1-Linux-x86_64.sh -b -p "$HOME/anaconda"
  export PATH="$HOME/anaconda/bin:$PATH"
  # clone git repo with kafka producer
  # (original URL was gist.github.com — a link-mangled gist.github.com)
  git clone https://gist.github.com/7ceb0b255baa1030f9df789f14702489.git kafka_producer
  chmod -R 777 /opt/kafka_producer
  # run it only after you've updated the config file — you can do this
  # from the notebooks:
  #   (cd kafka_producer && ./run.sh &)
}
function get_streaming_files() {
  # Download the Spark/Kafka consumer jars and push them into HDFS.
  #
  # BUG FIX: the original ran a bare `su hdfs` (which opens a new shell
  # and does NOT run the following commands as hdfs), then `exit` (which
  # would terminate the entire script once the su shell returned) and
  # `cd last_dir` (missing the `$`). The hadoop commands now run
  # explicitly as the hdfs user and the working directory is restored.
  echo "downloading streaming files"
  local last_dir
  last_dir=$(pwd)
  cd /tmp || return 1
  # (original URL was github.com — a link-mangled github.com)
  wget -O lib.zip 'https://github.com/sblack4/spark-scala-kafka-consumer/blob/master/lib.zip?raw=true'
  unzip -o lib.zip
  su hdfs -c 'hadoop fs -put lib/spark-kafka-consumer_2.11-1.0.jar /spark'
  su hdfs -c 'hadoop fs -put lib/spark-streaming-kafka-0-8-assembly_2.11-2.1.1.jar /spark'
  cd "$last_dir" || return 1
}
function main() {
  # Orchestrate the bootstrap steps in order.
  # NOTE(review): get_streaming_files is defined above but never invoked.
  echo "Running bootstrap for Realtime Analytics with OAC Datalake"

  check_root
  update_yum
  get_obj_store_url
  sudo_zeppelin
  import_notebooks
  install_kafka_producer

  echo "done with bootstrap for realtime analytics with OAC datalake"
  echo "Log file when this runs will be copied up to your default container."
  echo "Can also be viewed on the bdcsce server via: cat /u01/bdcsce/data/var/log/bootstrap.*"
}
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment