Last active
March 10, 2016 20:27
-
-
Save mplourde/f068ee466f5b7af906f8 to your computer and use it in GitHub Desktop.
0 to cluster with notebook
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Prereqs: create an EC2 key pair, install awscli, run `aws configure`.
export AWS_ACCESS_KEY_ID=XX
export AWS_SECRET_ACCESS_KEY=XX
export SPARK_HOME=~/spark-1.4.1-bin-hadoop2.6

# Launch a cluster with 2 slaves (-s 2). --copy-aws-credentials pushes the
# AWS keys above to the cluster so jobs can read from S3.
"$SPARK_HOME"/ec2/spark-ec2 --key-pair=XX \
  --instance-type=m1.large \
  --zone=us-east-1a \
  --identity-file=XX.pem \
  --copy-aws-credentials \
  launch my-spark-cluster -s 2
# ... takes a while

# ssh to the master node
"$SPARK_HOME"/ec2/spark-ec2 -k XX -i XX.pem login my-spark-cluster

# Teardown / pause when done:
# "$SPARK_HOME"/ec2/spark-ec2 destroy my-spark-cluster
# "$SPARK_HOME"/ec2/spark-ec2 stop my-spark-cluster   # later: start
# Append to ~/.bashrc on the cluster master:
#-------------------
export PYSPARK_PYTHON=python27         # workers run Python 2.7
export PYSPARK_DRIVER_PYTHON=python27  # driver uses the same interpreter
export IPYTHON=1                       # pyspark shell starts under IPython
export SPARK_HOME=/root/spark/
#--------------------
# Reload (use the absolute path so this works from any cwd):
source ~/.bashrc
# Install Python 2.7 modules the workers need.
wget https://bootstrap.pypa.io/get-pip.py
python27 get-pip.py
yum -y install python27-devel   # headers required to compile numpy
pip2.7 install numpy
# Replicate the numpy install to every slave node.
/root/spark-ec2/copy-dir /usr/local/lib/python2.7/site-packages/numpy

# Notebook-server dependencies live in a virtualenv on the master only.
pip2.7 install virtualenv
virtualenv millionsongs
source ./millionsongs/bin/activate
pip install ipython
pip install pyzmq
pip install jinja2
pip install tornado
pip install jsonschema
pip install terminado
pip install py4j
# matplotlib needs freetype headers to build under pip.
yum -y install python27-matplotlib
yum -y install freetype-devel
pip install matplotlib
ipython profile create pyspark
# edit ~/.ipython/profile_pyspark/ipython_notebook_config.py
# -------------
c = get_config()
c.NotebookApp.ip = '*'               # listen on all interfaces (EC2 access)
c.NotebookApp.open_browser = False   # headless server: never open a browser
c.NotebookApp.port = 8888
PWDFILE="/root/.ipython/profile_pyspark/nbpasswd.txt"
c.NotebookApp.password = open(PWDFILE).read().strip()
# ------------

# Generate the hashed notebook password (IPython passwd(), Python 2 syntax):
python -c 'from IPython.lib import passwd; print passwd()' > ~/.ipython/profile_pyspark/nbpasswd.txt
# ~/.ipython/profile_pyspark/startup/00-pyspark-setup.py
# Runs automatically when a kernel starts under the pyspark profile and
# makes `sc` available in every notebook.
#-------------
import os
import sys

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Put pyspark and its bundled py4j on the interpreter path.
# NOTE(review): the py4j zip version must match the Spark distribution --
# check $SPARK_HOME/python/lib if imports fail.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.1-src.zip'))

# Master URL written out by the spark-ec2 launcher.
CLUSTER_URL = open('/root/spark-ec2/cluster-url').read().strip()

# Run pyspark's interactive bootstrap to create the SparkContext (Python 2).
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))
#-------------
# get some data
wget http://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip
unzip YearPredictionMSD.txt.zip
# (the original fused these two commands into one broken `mkdir` line)
mkdir data
mv YearPredictionMSD.txt ./data

# add data to the ephemeral HDFS shipped by spark-ec2
~/ephemeral-hdfs/bin/hadoop fs -put /root/data/YearPredictionMSD.txt /root/data/YearPredictionMSD.txt
# NOTE(review): after the mv above the file is no longer in the cwd, so this
# second put likely fails -- confirm which HDFS path the notebooks read.
~/ephemeral-hdfs/bin/hadoop fs -put YearPredictionMSD.txt YearPredictionMSD.txt

# start the password-protected notebook server on port 8888
ipython notebook --profile=pyspark
# OTHER NOTES
# see the HDFS name node UI at http://localhost:50070
#
# To add more slaves:
#   stop the cluster
#   delete existing slaves
#   recreate the cluster with spark-ec2 using --use-existing-master
#   log into the master, then: ephemeral-hdfs/bin/stop-all.sh
#   ephemeral-hdfs/bin/start-all.sh
#   ephemeral-hdfs/bin/hadoop-daemon.sh start datanode
#   reload data onto ephemeral hdfs as above
#   jps (should see NameNode, DataNode)
#   you will then have to recopy any necessary packages to the slaves with
#   /root/spark-ec2/copy-dir /usr/local/lib/python2.7/site-packages/numpy
#   as above.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment