# Download the Anaconda installer
wget https://repo.anaconda.com/archive/Anaconda3-2020.02-Linux-x86_64.sh
# Run the installer
bash Anaconda3-2020.02-Linux-x86_64.sh
Whenever the installer prompts you, type "yes" and press ENTER to accept the defaults.
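After the installer finishes, a quick sanity check (assuming you let the installer initialize conda in ~/.bashrc) is to reload the shell configuration and print the conda version.
# Reload the shell configuration and confirm conda is on the PATH
source ~/.bashrc
conda --version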
# Add PPA to the sources list
sudo apt-add-repository ppa:webupd8team/java
# Refresh sources list
sudo apt-get update
# Install JDK
sudo apt-get install oracle-java8-installer
# Set up JAVA_HOME
echo 'JAVA_HOME="/usr/lib/jvm/java-8-oracle"' | sudo tee -a /etc/environment
source /etc/environment
To verify the installation, run the following.
java -version
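It can also help to confirm that JAVA_HOME was picked up in the current session (if the echo prints nothing, log out and back in, or re-run source /etc/environment).
# Confirm JAVA_HOME points to the JDK
echo $JAVA_HOME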
# Download Hadoop package
wget http://mirrors.whoishostingthis.com/apache/hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz
# Extract the archive
sudo tar -xvf hadoop-3.2.1.tar.gz -C /opt/
sudo mv /opt/hadoop-3.2.1 /opt/hadoop
rm hadoop-3.2.1.tar.gz
# Give the current user (here, ubuntu) ownership of the Hadoop directory
sudo chown ubuntu:ubuntu -R /opt/hadoop
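At this point you can already do a quick check that the archive extracted correctly by calling the hadoop binary through its full path (the PATH entries are only added in the next step).
# Print the Hadoop version using the full path
/opt/hadoop/bin/hadoop version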
# Set up environment variables (quote the heredoc delimiter so the variables are expanded when ~/.bashrc is sourced, not now)
cat >> ~/.bashrc << 'EOF'
export HADOOP_HOME=/opt/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
EOF
source ~/.bashrc
# Overwrite core-site.xml (the stock file already contains an empty <configuration> element, so appending would produce invalid XML)
cat > /opt/hadoop/etc/hadoop/core-site.xml << EOF
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
</configuration>
EOF
# Overwrite hdfs-site.xml with the data directories and replication factor
cat > /opt/hadoop/etc/hadoop/hdfs-site.xml << EOF
<configuration>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///opt/hadoop_tmp/hdfs/datanode</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///opt/hadoop_tmp/hdfs/namenode</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
</configuration>
EOF
# Overwrite mapred-site.xml to run MapReduce on YARN
cat > /opt/hadoop/etc/hadoop/mapred-site.xml << EOF
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
EOF
# Overwrite yarn-site.xml to enable the MapReduce shuffle service
cat > /opt/hadoop/etc/hadoop/yarn-site.xml << EOF
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
</configuration>
EOF
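As a quick sanity check of the configuration files (assuming ~/.bashrc has been sourced so that hdfs is on the PATH), you can ask Hadoop to echo back a couple of the values just set.
# Print configured values straight from the XML files
hdfs getconf -confKey fs.defaultFS
hdfs getconf -confKey dfs.replication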
# Create data directories and set file system permissions
sudo mkdir -p /opt/hadoop_tmp/hdfs/datanode /opt/hadoop_tmp/hdfs/namenode
sudo chown ubuntu:ubuntu -R /opt/hadoop_tmp
# Generate an SSH key pair (accept the defaults and leave the passphrase empty) and add it to the authorized keys
ssh-keygen
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
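Before starting the daemons, it is worth confirming that passwordless SSH to localhost actually works, since the start scripts depend on it. A minimal check looks like the following (accept-new needs a reasonably recent OpenSSH; -o StrictHostKeyChecking=no works on older versions).
# Should print the hostname without asking for a password
ssh -o StrictHostKeyChecking=accept-new localhost hostname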
# Format the name node
hdfs namenode -format -force
# Start the DFS and YARN processes
start-dfs.sh && start-yarn.sh
To verify, run the following commands; jps should list NameNode, DataNode, SecondaryNameNode, ResourceManager, and NodeManager (plus Jps itself).
# Check running processes
jps
# Create and list directories
hdfs dfs -mkdir /test
hdfs dfs -ls /
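A slightly fuller round trip copies a local file into HDFS and reads it back (the file name here is only an example).
# Write a local file, push it to HDFS, and read it back
echo "hello hadoop" > sample.txt
hdfs dfs -put sample.txt /test/
hdfs dfs -cat /test/sample.txt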
Access the cluster monitor (YARN ResourceManager) and the DFS explorer (NameNode web UI) at the following URLs, which are the Hadoop 3.x defaults.
http://localhost:8088 (YARN ResourceManager)
http://localhost:9870 (HDFS NameNode)
# Download the Hive package
wget http://apachemirror.wuchna.com/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz
# Extract the archive file
tar -xvf apache-hive-3.1.2-bin.tar.gz
# Move it to the /opt directory (sudo is needed because /opt is owned by root)
sudo mv ./apache-hive-3.1.2-bin /opt/hive
# Set up Hive environment variables (quoted delimiter, expanded when ~/.bashrc is sourced)
cat >> ~/.bashrc << 'EOF'
export HIVE_HOME=/opt/hive
export PATH=$PATH:$HIVE_HOME/bin
EOF
source ~/.bashrc
# Apply patch to make Hive 3.1.2 work on Hadoop 3.2.1
sudo rm -f /opt/hive/lib/guava-19.0.jar
sudo cp /opt/hadoop/share/hadoop/hdfs/lib/guava-27.0-jre.jar /opt/hive/lib/
# Give the current user ownership of the Hive directory
sudo chown ubuntu:ubuntu -R /opt/hive/
# Initialize the Hive Schema
$HIVE_HOME/bin/schematool -initSchema -dbType derby
To verify, run the following.
hive
> show databases;
> show tables;
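For a more end-to-end check, the following HiveQL (the table name demo_table is just an example) creates a table, inserts a row via a YARN job, reads it back, and cleans up.
> CREATE TABLE demo_table (id INT, name STRING);
> INSERT INTO TABLE demo_table VALUES (1, 'alice');
> SELECT * FROM demo_table;
> DROP TABLE demo_table;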
# Download the Spark package
wget https://archive.apache.org/dist/spark/spark-2.4.6/spark-2.4.6-bin-without-hadoop.tgz
# Extract the archive and place it under the /opt directory
sudo tar -xvf spark-2.4.6-bin-without-hadoop.tgz -C /opt/
sudo mv /opt/spark-2.4.6-bin-without-hadoop /opt/spark
sudo chown ubuntu:ubuntu -R /opt/spark
# Set up Spark environment variables (quoted delimiter, so hadoop classpath is evaluated each time ~/.bashrc is sourced)
cat >> ~/.bashrc << 'EOF'
export SPARK_HOME=/opt/spark
export PATH=$PATH:$SPARK_HOME/bin
export SPARK_DIST_CLASSPATH=$(hadoop classpath)
export TERM=xterm-color
EOF
source ~/.bashrc
To verify, run the following.
# Check Spark version
spark-shell --version
# Copy a spark example to HDFS
hdfs dfs -put /opt/spark/examples/src/main/resources/users.parquet /users.parquet
# Open Spark-shell
spark-shell
> val df = spark.read.parquet("hdfs://localhost:9000/users.parquet")
> df.collect.foreach(println)
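To poke at the data a little further, the following assumes users.parquet has a name column (as the stock Spark example file does); it prints the schema and writes a JSON copy back to HDFS.
> df.printSchema()
> df.select("name").show()
> df.write.mode("overwrite").json("hdfs://localhost:9000/users_json")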
- Whenever the machine is restarted, make sure the Hadoop services are running. To start the Hadoop services, run the following.
start-dfs.sh && start-yarn.sh
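- To stop the Hadoop services cleanly (for example before shutting the machine down), the matching stop scripts can be used.
stop-yarn.sh && stop-dfs.sh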
- The Anaconda, Spyder, and Jupyter tools can be accessed by running the following commands on the command line.
conda --help
jupyter --help
spyder
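- If the machine is a remote server, a common pattern is to start Jupyter without a browser and tunnel its port over SSH from your local machine; the flags below are standard Jupyter and OpenSSH options, but treat the port number and <server-ip> as placeholders.
# On the server: start Jupyter without opening a browser
jupyter notebook --no-browser --port 8888
# On your local machine: forward the port over SSH (replace <server-ip> with the server address)
ssh -N -L 8888:localhost:8888 ubuntu@<server-ip>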