# Download the Anaconda installer
wget https://repo.anaconda.com/archive/Anaconda3-2020.02-Linux-x86_64.sh
# Run the installer
bash Anaconda3-2020.02-Linux-x86_64.sh
Whenever the installer prompts you, type "yes" and press ENTER to accept the defaults.
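After the installer finishes, a quick sanity check (assuming you let the installer initialize conda in ~/.bashrc) is to reload the shell configuration and print the conda version.
# Reload the shell configuration and confirm conda is on the PATH
source ~/.bashrc
conda --version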
# Add PPA to the sources list
sudo apt-add-repository ppa:webupd8team/java
# Refresh sources list
sudo apt-get update
# Install JDK
sudo apt-get install oracle-java8-installer
# Set up JAVA_HOME
echo 'JAVA_HOME="/usr/lib/jvm/java-8-oracle"' | sudo tee -a /etc/environment
source /etc/environment
To verify the installation, run the following.
java -version
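It can also help to confirm that JAVA_HOME was picked up in the current session (if the echo prints nothing, log out and back in, or re-run source /etc/environment).
# Confirm JAVA_HOME points to the JDK
echo $JAVA_HOME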
# Download Hadoop package
wget http://mirrors.whoishostingthis.com/apache/hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz
# Extract the archive
sudo tar -xvf hadoop-3.2.1.tar.gz -C /opt/
sudo mv /opt/hadoop-3.2.1 /opt/hadoop
rm hadoop-3.2.1.tar.gz
# Give the current user (here, ubuntu) ownership of the Hadoop directory
sudo chown ubuntu:ubuntu -R /opt/hadoop
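At this point you can already do a quick check that the archive extracted correctly by calling the hadoop binary through its full path (the PATH entries are only added in the next step).
# Print the Hadoop version using the full path
/opt/hadoop/bin/hadoop version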
# Set up environment variables (quote the heredoc delimiter so the variables are expanded when ~/.bashrc is sourced, not now)
cat >> ~/.bashrc << 'EOF'
export HADOOP_HOME=/opt/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
EOF
source ~/.bashrc
# Overwrite core-site.xml (the stock file already contains an empty <configuration> element, so appending would produce invalid XML)
cat > /opt/hadoop/etc/hadoop/core-site.xml << EOF
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
</configuration>
EOF
# Overwrite hdfs-site.xml with the data directories and replication factor
cat > /opt/hadoop/etc/hadoop/hdfs-site.xml << EOF
<configuration>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///opt/hadoop_tmp/hdfs/datanode</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///opt/hadoop_tmp/hdfs/namenode</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
</configuration>
EOF
# Overwrite mapred-site.xml to run MapReduce on YARN
cat > /opt/hadoop/etc/hadoop/mapred-site.xml << EOF
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
EOF
# Overwrite yarn-site.xml to enable the MapReduce shuffle service
cat > /opt/hadoop/etc/hadoop/yarn-site.xml << EOF
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
</configuration>
EOF
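As a quick sanity check of the configuration files (assuming ~/.bashrc has been sourced so that hdfs is on the PATH), you can ask Hadoop to echo back a couple of the values just set.
# Print configured values straight from the XML files
hdfs getconf -confKey fs.defaultFS
hdfs getconf -confKey dfs.replication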
# Create data directories and set file system permissions
sudo mkdir -p /opt/hadoop_tmp/hdfs/datanode /opt/hadoop_tmp/hdfs/namenode
sudo chown ubuntu:ubuntu -R /opt/hadoop_tmp
# Generate an SSH key pair (accept the defaults and leave the passphrase empty) and add it to the authorized keys
ssh-keygen
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
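Before starting the daemons, it is worth confirming that passwordless SSH to localhost actually works, since the start scripts depend on it. A minimal check looks like the following (accept-new needs a reasonably recent OpenSSH; -o StrictHostKeyChecking=no works on older versions).
# Should print the hostname without asking for a password
ssh -o StrictHostKeyChecking=accept-new localhost hostname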
# Format the name node
hdfs namenode -format -force
# Start the DFS and YARN processes
start-dfs.sh && start-yarn.sh
To verify, run the following commands; jps should list NameNode, DataNode, SecondaryNameNode, ResourceManager, and NodeManager (plus Jps itself).
# Check running processes
jps
# Create and list directories
hdfs dfs -mkdir /test
hdfs dfs -ls /
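A slightly fuller round trip copies a local file into HDFS and reads it back (the file name here is only an example).
# Write a local file, push it to HDFS, and read it back
echo "hello hadoop" > sample.txt
hdfs dfs -put sample.txt /test/
hdfs dfs -cat /test/sample.txt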
Access the cluster monitor (YARN ResourceManager) and the DFS explorer (NameNode web UI) at the following URLs, which are the Hadoop 3.x defaults.
http://localhost:8088 (YARN ResourceManager)
http://localhost:9870 (HDFS NameNode)
# Download the Hive package
wget http://apachemirror.wuchna.com/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz
# Extract the archive file
tar -xvf apache-hive-3.1.2-bin.tar.gz
# Move it to the /opt directory (sudo is needed because /opt is owned by root)
sudo mv ./apache-hive-3.1.2-bin /opt/hive
# Set up Hive environment variables (quoted delimiter, expanded when ~/.bashrc is sourced)
cat >> ~/.bashrc << 'EOF'
export HIVE_HOME=/opt/hive
export PATH=$PATH:$HIVE_HOME/bin
EOF
source ~/.bashrc
# Apply patch to make Hive 3.1.2 work on Hadoop 3.2.1
sudo rm -f /opt/hive/lib/guava-19.0.jar
sudo cp /opt/hadoop/share/hadoop/hdfs/lib/guava-27.0-jre.jar /opt/hive/lib/
# Give the current user ownership of the Hive directory
sudo chown ubuntu:ubuntu -R /opt/hive/
# Initialize the Hive Schema
$HIVE_HOME/bin/schematool -initSchema -dbType derby
To verify, run the following.
hive
> show databases;
> show tables;
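For a more end-to-end check, the following HiveQL (the table name demo_table is just an example) creates a table, inserts a row via a YARN job, reads it back, and cleans up.
> CREATE TABLE demo_table (id INT, name STRING);
> INSERT INTO TABLE demo_table VALUES (1, 'alice');
> SELECT * FROM demo_table;
> DROP TABLE demo_table;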
# Download the Spark package
wget https://archive.apache.org/dist/spark/spark-2.4.6/spark-2.4.6-bin-without-hadoop.tgz
# Extract the archive and place it under the /opt directory
sudo tar -xvf spark-2.4.6-bin-without-hadoop.tgz -C /opt/
sudo mv /opt/spark-2.4.6-bin-without-hadoop /opt/spark
sudo chown ubuntu:ubuntu -R /opt/spark
# Set up Spark environment variables (quoted delimiter, so hadoop classpath is evaluated each time ~/.bashrc is sourced)
cat >> ~/.bashrc << 'EOF'
export SPARK_HOME=/opt/spark
export PATH=$PATH:$SPARK_HOME/bin
export SPARK_DIST_CLASSPATH=$(hadoop classpath)
export TERM=xterm-color
EOF
source ~/.bashrc
To verify, run the following.
# Check Spark version
spark-shell --version
# Copy a spark example to HDFS
hdfs dfs -put /opt/spark/examples/src/main/resources/users.parquet /users.parquet
# Open Spark-shell
spark-shell
> val df = spark.read.parquet("hdfs://localhost:9000/users.parquet")
> df.collect.foreach(println)
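To poke at the data a little further, the following assumes users.parquet has a name column (as the stock Spark example file does); it prints the schema and writes a JSON copy back to HDFS.
> df.printSchema()
> df.select("name").show()
> df.write.mode("overwrite").json("hdfs://localhost:9000/users_json")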
- Whenever the machine is restarted, make sure the Hadoop services are running. To start the Hadoop services, run the following.
start-dfs.sh && start-yarn.sh
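- To stop the Hadoop services cleanly (for example before shutting the machine down), the matching stop scripts can be used.
stop-yarn.sh && stop-dfs.sh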
- The Anaconda, Spyder, and Jupyter tools can be accessed by running the following commands on the command line.
conda --help
jupyter --help
spyder
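- If the machine is a remote server, a common pattern is to start Jupyter without a browser and tunnel its port over SSH from your local machine; the flags below are standard Jupyter and OpenSSH options, but treat the port number and <server-ip> as placeholders.
# On the server: start Jupyter without opening a browser
jupyter notebook --no-browser --port 8888
# On your local machine: forward the port over SSH (replace <server-ip> with the server address)
ssh -N -L 8888:localhost:8888 ubuntu@<server-ip>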