On master node
wget http://archive.cloudera.com/cdh5/one-click-install/redhat/6/x86_64/cloudera-cdh-5-0.x86_64.rpm
sudo yum --nogpgcheck localinstall cloudera-cdh-5-0.x86_64.rpm
sudo yum clean all
sudo yum install hadoop-hdfs-namenode
sudo yum install R git
sudo yum install spark-core spark-master spark-python
cd
wget http://cran.cnr.berkeley.edu/src/contrib/rJava_0.9-6.tar.gz
sudo R CMD INSTALL rJava_0.9-6.tar.gz
git clone https://github.com/amplab-extras/SparkR-pkg.git
cd SparkR-pkg
./install-dev.sh
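If the build succeeds, install-dev.sh leaves the built package under lib/ in the repo; a quick sanity check (the path is an assumption about SparkR-pkg's layout, not something from these notes):
ls lib/SparkR   # the built SparkR package should show up here after install-dev.sh finishes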
# Need to add the master's ssh key to the slave's authorized_keys
ssh-keygen # This will generate a key pair under ~/.ssh
cat ~/.ssh/id_rsa.pub # Copy this to ~/.ssh/authorized_keys on the slave machine
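One way to get the key over in a single step (a sketch; assumes you can already reach the slave as ec2-user, e.g. with the EC2 keypair):
cat ~/.ssh/id_rsa.pub | ssh ec2-user@<slave-hostname> 'mkdir -p ~/.ssh && cat >> ~/.ssh/authorized_keys'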
cd
rsync -az SparkR-pkg ec2-user@<slave-hostname>:~/
# This lets Spark (which runs as a different user) read SparkR-pkg from ec2-user's home directory
chmod a+rx /home/ec2-user
On slave node
wget http://archive.cloudera.com/cdh5/one-click-install/redhat/6/x86_64/cloudera-cdh-5-0.x86_64.rpm
sudo yum --nogpgcheck localinstall cloudera-cdh-5-0.x86_64.rpm
sudo yum -y clean all
sudo yum -y install hadoop-hdfs-datanode
sudo yum -y install R git
sudo yum -y install spark-core spark-worker spark-python
cd
wget http://cran.cnr.berkeley.edu/src/contrib/rJava_0.9-6.tar.gz
sudo R CMD INSTALL rJava_0.9-6.tar.gz
On master node
sudo cp -r /etc/hadoop/conf.empty /etc/hadoop/conf.my_cluster
# Copy core-site.xml and hdfs-site.xml from my gist
# (grab the Raw link for each file so you end up with the XML, not the gist's HTML page)
wget https://gist.github.com/shivaram/9240335#file-core-site-xml
sudo mv 9240335 /etc/hadoop/conf.my_cluster/core-site.xml
wget https://gist.github.com/shivaram/9240335#file-hdfs-site-xml
sudo mv 9240335 /etc/hadoop/conf.my_cluster/hdfs-site.xml
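The gist has the actual files; roughly, they point the cluster at the master and at /mnt/ephemeral-hdfs. A sketch, if you'd rather write them by hand (property names and port 8020 are stock CDH5 defaults, and the name-dir path is my assumption, not taken from the gist):
cat <<'EOF' | sudo tee /etc/hadoop/conf.my_cluster/core-site.xml
<?xml version="1.0"?>
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://<master-hostname>:8020</value>   <!-- placeholder: the master's EC2 hostname -->
  </property>
</configuration>
EOF
cat <<'EOF' | sudo tee /etc/hadoop/conf.my_cluster/hdfs-site.xml
<?xml version="1.0"?>
<configuration>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/mnt/ephemeral-hdfs/name</value>   <!-- assumption: keep namenode metadata next to the data dir -->
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/mnt/ephemeral-hdfs/data</value>   <!-- matches the directory created below -->
  </property>
</configuration>
EOF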
sudo mkdir -p /mnt/ephemeral-hdfs/data
sudo chown -R hdfs:hdfs /mnt/ephemeral-hdfs
rsync -az /etc/hadoop/conf.my_cluster ec2-user@<slave-hostname>:~/
sudo alternatives --verbose --install /etc/hadoop/conf hadoop-conf /etc/hadoop/conf.my_cluster 50
sudo alternatives --set hadoop-conf /etc/hadoop/conf.my_cluster
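To double-check that the new conf directory is the active one:
alternatives --display hadoop-conf   # the current link should point at /etc/hadoop/conf.my_cluster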
sudo -s
sudo -u hdfs hadoop namenode -format
sudo service hadoop-hdfs-namenode start
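Once the namenode is up, listing the (still empty) filesystem root should come back cleanly:
sudo -u hdfs hadoop fs -ls /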
On slave node
sudo mkdir -p /mnt/ephemeral-hdfs/data
sudo chown -R hdfs:hdfs /mnt/ephemeral-hdfs
sudo mv conf.my_cluster /etc/hadoop/
sudo alternatives --verbose --install /etc/hadoop/conf hadoop-conf /etc/hadoop/conf.my_cluster 50
sudo alternatives --set hadoop-conf /etc/hadoop/conf.my_cluster
sudo service hadoop-hdfs-datanode start
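To confirm the datanode registered with the namenode, a report from either node should show one live datanode:
sudo -u hdfs hdfs dfsadmin -report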
On master node
Edit /etc/spark/conf/spark-env.sh: set STANDALONE_MASTER_HOST to the master's EC2 hostname.
Also add a line: export SPARK_LOCAL_IP=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname` (this picks up the instance's public hostname from EC2 instance metadata).
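The two lines end up looking something like this (the hostname is a placeholder; STANDALONE_MASTER_HOST is the variable already present in CDH's spark-env.sh):
export STANDALONE_MASTER_HOST=<master-hostname>   # the master's EC2 hostname
export SPARK_LOCAL_IP=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname`   # this instance's public hostname, from EC2 metadata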
rsync -az /etc/spark/conf/spark-env.sh ec2-user@<slave-hostname>:~/
sudo service spark-master start
On slave node
sudo mv spark-env.sh /etc/spark/conf/   # the file rsync'd over from the master
sudo service spark-worker start
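A quick way to confirm the worker registered: the standalone master's web UI (port 8080 by default) should list the slave as an ALIVE worker.
curl -s http://<master-hostname>:8080 | grep -i worker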
Launch spark-shell and run
val a = sc.parallelize(1 to 100, 2)
a.count
On master, from the SparkR-pkg directory, run
source /etc/spark/conf/spark-env.sh
SPARK_HOME=/usr/lib/spark ./sparkR
Inside the R console run
a <- parallelize(sc, 1:100, 2L)
count(a)
q()
Test pi.R
SPARK_HOME=/usr/lib/spark ./sparkR examples/pi.R <spark_master_url>