@mplourde
Last active March 10, 2016 20:27
0 to cluster with notebook
# prerequisites: create an instance (or work locally), create an EC2 key pair, install awscli, run aws configure
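# grab a Spark binary if needed -- assuming the 1.4.1 / Hadoop 2.6 build from the
# Apache archive, to match SPARK_HOME below
wget https://archive.apache.org/dist/spark/spark-1.4.1/spark-1.4.1-bin-hadoop2.6.tgz
tar xzf spark-1.4.1-bin-hadoop2.6.tgz -C ~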
export AWS_ACCESS_KEY_ID=XX
export AWS_SECRET_ACCESS_KEY=XX
export SPARK_HOME=~/spark-1.4.1-bin-hadoop2.6
$SPARK_HOME/ec2/spark-ec2 --key-pair=XX \
--instance-type=m1.large \
--zone us-east-1a \
--identity-file=XX.pem \
--copy-aws-credentials \
launch my-spark-cluster -s 2
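# optional: print the master's public hostname once launch finishes
$SPARK_HOME/ec2/spark-ec2 get-master my-spark-cluster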
# ... takes a while
# ssh to master node
$SPARK_HOME/ec2/spark-ec2 -k XX -i XX.pem login my-spark-cluster
# $SPARK_HOME/ec2/spark-ec2 destroy my-spark-cluster
# $SPARK_HOME/ec2/spark-ec2 stop/start my-spark-cluster
# ~/.bashrc
#-------------------
export PYSPARK_PYTHON=python27
export PYSPARK_DRIVER_PYTHON=python27
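# workers and driver must run the same Python version, hence python27 on both sides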
export IPYTHON=1
export SPARK_HOME=/root/spark/
#--------------------
source ~/.bashrc
# install modules across workers
wget https://bootstrap.pypa.io/get-pip.py
python27 get-pip.py
yum install -y python27-devel
pip2.7 install numpy
/root/spark-ec2/copy-dir /usr/local/lib/python2.7/site-packages/numpy
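# optional sanity check (a sketch): submit a throwaway job that imports numpy on the
# workers, reading the master URL from the cluster-url file spark-ec2 writes
cat > /tmp/check_numpy.py <<'EOF'
from pyspark import SparkContext
sc = SparkContext()
# one task per partition; this fails loudly if any worker is missing numpy
print(sc.parallelize(range(2), 2).map(lambda _: __import__('numpy').__version__).collect())
sc.stop()
EOF
/root/spark/bin/spark-submit --master "$(cat /root/spark-ec2/cluster-url)" /tmp/check_numpy.py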
pip2.7 install virtualenv
virtualenv millionsongs
source ./millionsongs/bin/activate
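# notebook dependencies (IPython 3.x-era stack: kernel messaging, templating, web server)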
pip install ipython
pip install pyzmq
pip install jinja2
pip install tornado
pip install jsonschema
pip install terminado
pip install py4j
yum -y install python27-matplotlib
yum -y install freetype-devel
pip install matplotlib
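# in a notebook cell, enable inline figures with the %matplotlib inline magic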
ipython profile create pyspark
# edit ~/.ipython/profile_pyspark/ipython_notebook_config.py
# -------------
c = get_config()
c.NotebookApp.ip = '*'
c.NotebookApp.open_browser = False
c.NotebookApp.port = 8888
PWDFILE="/root/.ipython/profile_pyspark/nbpasswd.txt"
c.NotebookApp.password = open(PWDFILE).read().strip()
# ------------
# set password
python -c 'from IPython.lib import passwd; print passwd()' > ~/.ipython/profile_pyspark/nbpasswd.txt
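# nbpasswd.txt holds a salted hash (sha1:<salt>:<digest>), not the plaintext password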
# ~/.ipython/profile_pyspark/startup/00-pyspark-setup.py
#-------------
import glob
import os
import sys
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
# Spark 1.4.1 ships py4j-0.8.2.1, not 0.8.1; glob so the path survives version changes
sys.path.insert(0, glob.glob(os.path.join(spark_home, 'python/lib/py4j-*-src.zip'))[0])
# standalone master URL (spark://<master>:7077), handy for building extra contexts
CLUSTER_URL = open('/root/spark-ec2/cluster-url').read().strip()
# run pyspark's shell bootstrap, which creates the SparkContext `sc`
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))
#-------------
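# smoke test once the notebook is up: sc is already defined by shell.py, so a cell like
# sc.parallelize(range(1000)).sum()   # expect 499500
# confirms the cluster is wired up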
# get some data
wget http://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip
unzip YearPredictionMSD.txt.zip
mkdir data && mv YearPredictionMSD.txt ./data
# add data to HDFS
~/ephemeral-hdfs/bin/hadoop fs -put /root/data/YearPredictionMSD.txt /root/data/YearPredictionMSD.txt
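# verify the upload
~/ephemeral-hdfs/bin/hadoop fs -ls /root/data
# then in the notebook (a sketch; ephemeral HDFS is the default filesystem):
# raw = sc.textFile('/root/data/YearPredictionMSD.txt')
# raw.count()  # YearPredictionMSD has 515,345 rows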
ipython notebook --profile=pyspark
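# the notebook listens on port 8888 of the master: either open 8888 in the cluster
# security group and browse to http://<master-public-dns>:8888 (password from above),
# or tunnel with: ssh -i XX.pem -L 8888:localhost:8888 root@<master-public-dns>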
# OTHER NOTES
# see the HDFS name node UI at http://<master-public-dns>:50070 (or tunnel 50070 to localhost)
# to add more slaves
# stop the cluster
# delete existing slaves
# recreate cluster with spark-ec2 using --use-existing-master
# log into master, ephemeral-hdfs/bin/stop-all.sh
# ephemeral-hdfs/bin/start-all.sh
# ephemeral-hdfs/bin/hadoop-daemon.sh start datanode
# reload data onto ephemeral hdfs as above
# jps (should see NameNode, DataNode)
# you will then have to recopy any necessary packages to the slaves with
# /root/spark-ec2/copy-dir /usr/local/lib/python2.7/site-packages/numpy
# as above.