Skip to content

Instantly share code, notes, and snippets.

@sblack4
Last active September 10, 2018 16:51
Show Gist options
  • Select an option

  • Save sblack4/b8d904acb0a3d7a256a7ff2d53e58cb0 to your computer and use it in GitHub Desktop.

Select an option

Save sblack4/b8d904acb0a3d7a256a7ff2d53e58cb0 to your computer and use it in GitHub Desktop.
Bootstrap files for the Realtime Analytics workshop
#!/bin/bash
#
# Bootstrap script for the Realtime Analytics workshop.
#
# by http://sblack4.github.io - [email protected]
# updated 2018-07-30
# credit goes to David Bayard for providing the majority of this script
# see his journey 2 (where I got a lot of this) below, I highly recommend it!
# https://github.com/oracle/learning-library/tree/master/workshops/journey2-new-data-lake
echo "Command being run: $0"

# Global variables: where this script lives and which host it runs on.
DIRNAME=$(dirname "$0")          # directory component of the invocation path
BASENAME=$(basename "$0")        # script file name
BASEDIR=$(cd "$DIRNAME" && pwd)  # absolute path of the script's directory
_HOSTNAME=$(hostname -f)         # FQDN; used later to pick the Zeppelin node
function check_root() {
  # Abort unless running as root (uid 0); the yum and /etc/sudoers
  # steps later in the script require it.
  local euid
  euid=$(id -u)
  if [ "$euid" -ne 0 ]; then
    echo "This script must be run as root."
    echo "please try 'sudo ./bootstrap.sh' "
    exit 1
  fi
}
function update_yum() {
  # Refresh yum metadata, then pull in a few convenience tools.
  echo "cleaning up yum metadata just in case"
  yum clean metadata

  # mlocate provides `locate`; refresh its database right after install.
  echo "installing my favorite tools"
  yum install -y mlocate git vim wget
  updatedb
}
function get_obj_store_url() {
  # Resolve the base object-store URL into the global objectStoreURL.
  #
  # When run as part of instance creation this script lives under the
  # vm-scripts directory and the helper functions are already available;
  # when run by hand we must source the helpers ourselves first.
  local vm_scripts_dir="/u01/app/oracle/tools/bdce/bdcsce/impl-20/vm-scripts"
  if [ "${BASEDIR}" = "${vm_scripts_dir}" ]; then
    echo This is being run automatically.
  else
    echo This is being run manually.
    BASEDIR="${vm_scripts_dir}"
    # shellcheck disable=SC1090 — environment-provided helper scripts
    source "${BASEDIR}/constants.sh"
    source "${BASEDIR}/bdcsce_bootstrap_helper.sh" --source_only
  fi
  # getBaseObjectStoreUrl is defined by bdcsce_bootstrap_helper.sh
  objectStoreURL=$(getBaseObjectStoreUrl)
}
function sudo_zeppelin() {
  # Grant the zeppelin user passwordless sudo so notebook shell
  # paragraphs can run privileged commands.
  # Idempotent: only append the rule if it is not already present
  # (the original appended unconditionally, duplicating it on reruns).
  echo "setting up sudoers for zeppelin"
  if ! grep -q '^zeppelin ALL=(ALL) NOPASSWD: ALL$' /etc/sudoers; then
    echo 'zeppelin ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers
  fi
  echo "last 10 lines of /etc/sudoers"
  tail -10 /etc/sudoers
}
function import_notebooks() {
# Import the workshop's Zeppelin notebooks and tune interpreter settings.
# Intended to run only on the node whose FQDN matches an Ambari server
# node. Uses globals: _HOSTNAME (set at top of script), objectStoreURL
# (set by get_obj_store_url); getAmbariServerNodes comes from the sourced
# bdcsce helper scripts.
for i in $(getAmbariServerNodes); do
if [ ${_HOSTNAME} = $i ]; then
echo "running singleton Zeppelin section"
echo "downloading lab notebooks"
cd /tmp
git clone https://gist.github.com/7206c7bb83f505a3450844310d4e3f4d.git notebooks
# make sure not to use proxy server for this stuff
export no_proxy='127.0.0.1'
export NO_PROXY='127.0.0.1'
# import notebooks
# https://zeppelin.apache.org/docs/0.7.0/rest-api/rest-notebook.html#import-a-note
echo "importing lab notebooks"
# unzip -o Notes.zip
# Rewrite object-store references in notebook JSON to this instance's URL.
# NOTE(review): these sed calls run in /tmp, but the clone above placed
# the notebooks in /tmp/notebooks — confirm the *.json glob here targets
# the intended files.
sed -i -- "s~swift://\$CONTAINER.default~$objectStoreURL~g" *.json
sed -i -- "s~swift://journeyC.default~$objectStoreURL~g" *.json
# POST each notebook to the local Zeppelin REST API.
for note in /tmp/notebooks/*.json
do
echo $note
curl -X POST -d @"$note" -H "Content-Type: application/json" http://127.0.0.1:9995/api/notebook/import
done
# fix sh interpreter timeout and spark kafka dependency
# https://zeppelin.apache.org/docs/0.7.0/rest-api/rest-interpreter.html
echo "fixing sh interpreter timeout"
# Generate a Python 2 helper that PUTs updated interpreter settings to
# Zeppelin: raises the sh timeout to 3000000 ms and builds (but leaves
# commented out) the spark-streaming-kafka dependency update.
# Unquoted heredoc delimiter is intentional: the body contains no shell
# expansions that would be clobbered.
cat <<EOF > /tmp/sh_settings.py
#!/usr/local/bin/python
#based on https://community.hortonworks.com/articles/36031/sample-code-to-automate-interacting-with-zeppelin.html by Ali Bajwa
import time
def post_request(url, body):
import json, urllib2
encoded_body = json.dumps(body)
req = urllib2.Request(str(url), encoded_body)
req.get_method = lambda: 'PUT'
try:
response = urllib2.urlopen(req, encoded_body).read()
except urllib2.HTTPError, error:
print 'Exception: ' + error.read()
jsonresp = json.loads(response.decode('utf-8'))
print jsonresp['status']
import json, urllib2
zeppelin_int_url = 'http://127.0.0.1:9995/api/interpreter/setting/'
data = json.load(urllib2.urlopen(zeppelin_int_url))
for body in data['body']:
if body['group'] == 'sh':
shbody = body
elif body['group'] == 'spark':
sparkbody = body
shbody['properties']['shell.command.timeout.millisecs'] = '3000000'
post_request(zeppelin_int_url + shbody['id'], shbody)
#time.sleep(120)
my_dict = {'groupArtifactVersion': 'org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0', 'local': False}
sparkbody['dependencies'].append(my_dict)
#post_request(zeppelin_int_url + sparkbody['id'], sparkbody)
EOF
# cat /tmp/sh_settings.py
python /tmp/sh_settings.py
fi
# NOTE(review): this break sits outside the if, so only the FIRST node
# returned by getAmbariServerNodes is ever compared against _HOSTNAME —
# confirm this is intended (it works only if the match is the first node).
break
done
# end of the do this section only on 1 zeppelin server
}
function install_kafka_producer() {
  # Install Anaconda into $HOME/anaconda and fetch the sample Kafka
  # producer. NOTE(review): the log message says "ubuntu" but the rest
  # of this script uses yum; message kept as-is for log continuity.
  echo "starting bootstrap script to create a kafka producer on ubuntu"
  cd /opt || return 1
  # download and run the anaconda installer in silent batch mode (-b),
  # installing under $HOME/anaconda (-p)
  curl -O https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh
  chmod +x Anaconda3-5.0.1-Linux-x86_64.sh
  ./Anaconda3-5.0.1-Linux-x86_64.sh -b -p "$HOME/anaconda"
  export PATH="$HOME/anaconda/bin:$PATH"
  # clone git repo with kafka producer
  # (original URL was gist.github.com — a link-mangled gist.github.com)
  git clone https://gist.github.com/7ceb0b255baa1030f9df789f14702489.git kafka_producer
  chmod -R 777 /opt/kafka_producer
  # run it only after you've updated the config file — you can do this
  # from the notebooks:
  #   (cd kafka_producer && ./run.sh &)
}
function get_streaming_files() {
  # Download the Spark/Kafka consumer jars and push them into HDFS.
  #
  # BUG FIX: the original ran a bare `su hdfs` (which opens a new shell
  # and does NOT run the following commands as hdfs), then `exit` (which
  # would terminate the entire script once the su shell returned) and
  # `cd last_dir` (missing the `$`). The hadoop commands now run
  # explicitly as the hdfs user and the working directory is restored.
  echo "downloading streaming files"
  local last_dir
  last_dir=$(pwd)
  cd /tmp || return 1
  # (original URL was github.com — a link-mangled github.com)
  wget -O lib.zip 'https://github.com/sblack4/spark-scala-kafka-consumer/blob/master/lib.zip?raw=true'
  unzip -o lib.zip
  su hdfs -c 'hadoop fs -put lib/spark-kafka-consumer_2.11-1.0.jar /spark'
  su hdfs -c 'hadoop fs -put lib/spark-streaming-kafka-0-8-assembly_2.11-2.1.1.jar /spark'
  cd "$last_dir" || return 1
}
function main() {
  # Orchestrate the bootstrap steps in order.
  # NOTE(review): get_streaming_files is defined above but never invoked.
  echo "Running bootstrap for Realtime Analytics with OAC Datalake"

  check_root
  update_yum
  get_obj_store_url
  sudo_zeppelin
  import_notebooks
  install_kafka_producer

  echo "done with bootstrap for realtime analytics with OAC datalake"
  echo "Log file when this runs will be copied up to your default container."
  echo "Can also be viewed on the bdcsce server via: cat /u01/bdcsce/data/var/log/bootstrap.*"
}
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment