Hadoop on Raspberry Pi
#!/bin/bash
#######################################################
# Install Java and other packages
#######################################################
sudo apt-get update
sudo apt-get --assume-yes install oracle-java8-jdk \
emacs-nox autoconf automake libtool cmake pkg-config \
software-properties-common build-essential \
zlib1g-dev libssl-dev libsasl2-dev \
snappy libsnappy-dev bzip2 libbz2-dev \
libjansson-dev fuse libfuse-dev zstd
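# Optional sanity check (not in the original recipe): confirm the JDK is
# installed and sits where hadoop.bashrc later expects JAVA_HOME to point.
java -version
[ -d /usr/lib/jvm/jdk-8-oracle-arm32-vfp-hflt/jre ] && echo "Oracle JDK found"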
# -- HADOOP ENVIRONMENT VARIABLES START -- #
export HADOOP_HOME=/opt/hadoop
export HIVE_HOME=/opt/hive
export SPARK_HOME=/opt/spark
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HIVE_HOME/bin:$SPARK_HOME/bin:$SPARK_HOME/sbin
export HADOOP_CONF_DIR=/etc/hadoop
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib"
# Remove messages
# WARN util.NativeCodeLoader: Unable to load native-hadoop
# library for your platform
export HADOOP_HOME_WARN_SUPPRESS=1
export HADOOP_ROOT_LOGGER="WARN,DRFA"
# export HDFS_NAMENODE_USER=root
# export HDFS_DATANODE_USER=root
# export HDFS_SECONDARYNAMENODE_USER=root
export YARN_HOME=$HADOOP_HOME
export SPARK_CONF_DIR=/etc/spark
export SPARK_MASTER_HOST=localhost
# -- HADOOP ENVIRONMENT VARIABLES END -- #
export JAVA_HOME=/usr/lib/jvm/jdk-8-oracle-arm32-vfp-hflt/jre
#!/bin/bash
# Protocol Buffers is an open source project supporting Google's
# platform-neutral and language-neutral interprocess-communication (IPC)
# and serialization framework. Its Interface Definition Language (IDL)
# describes the wire and file formats and is pre-compiled into source code
# for the target languages (Python, Java and C++ included), which is then
# used in the applications. Hadoop's native build requires protoc 2.5.0.
cd /tmp
wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.bz2
tar xf protobuf-2.5.0.tar.bz2
cd protobuf-2.5.0/
sh autogen.sh
./configure
make
sudo make install
sudo ldconfig
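# Optional check (assumes the build above succeeded): Hadoop's native build
# expects exactly this protoc version. Note that the gtest download URL in
# autogen.sh no longer exists and may need adjusting before the build.
protoc --version   # should print: libprotoc 2.5.0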
#!/bin/bash
#
# Source:
# https://developer.ibm.com/recipes/tutorials/building-a-hadoop-cluster-with-raspberry-pi/
# OS installation and filesystem expansion must be done manually on each node.
# To enable SSH on first boot, create an empty file named "ssh" in the /boot
# partition of each node.
# 1. Add environment variables to .bashrc
# 2. Create the installation and HDFS directories
# On the master, configure manually: /etc/hosts, hdfs-site.xml,
# yarn-site.xml and core-site.xml
EMAIL="[email protected]"
get_latest_release_number() {
curl --silent "https://github.com/$1/releases/latest" | sed 's#.*tag/\(.*\)\".*#\1#';
}
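# Example usage of the helper above (not called anywhere below; shown for
# reference only):
#   get_latest_release_number "protocolbuffers/protobuf"   # prints a tag such as v3.7.0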
#######################################################
# 1. SW version and user
#######################################################
# Name of the latest Hive 3.x release directory, e.g. "hive-3.1.1"
HIVE=$(curl --silent https://www-eu.apache.org/dist/hive/ | perl -ne 'm/(?:hive-)(3[\.\d]*)/ && print $&')
declare -A VERSION
VERSION=(["HADOOP"]="3.2.0" \
["SPARK"]="2.4.0" \
["HIVE"]=${HIVE##*-})
HADOOP_VERSION=${VERSION["HADOOP"]}
HIVE_VERSION=${VERSION["HIVE"]}
SPARK_VERSION=${VERSION["SPARK"]}
HADOOP_USER=$USER
#######################################################
# 2. Add hadoop specific environment variables to .bashrc
#######################################################
sudo cp hadoop.bashrc /etc/profile.d/hadoop.sh
sudo chmod 644 /etc/profile.d/hadoop.sh
. /etc/profile.d/hadoop.sh
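# Quick check that the profile script took effect in this shell
echo "HADOOP_HOME=${HADOOP_HOME}  HADOOP_CONF_DIR=${HADOOP_CONF_DIR}"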
#######################################################
# 3. Create the installation and HDFS directories
#######################################################
sudo mkdir -p /opt/{hadoop,hdfs/{datanode,namenode},hive,presto/{etc/catalog,data},spark}
# Create config directory
sudo mkdir -p /etc/{hadoop,hive,impala,presto,spark}
sudo chown -R ${HADOOP_USER}:${HADOOP_USER} /opt/{hadoop,hdfs,hive,presto,spark} /etc/{hadoop,hive,impala,presto,spark}
#######################################################
# 3. Setup connectivity.
#######################################################
# Bash 4 support associative arrays
# for host in "${!IPs[@]}";
# do echo "$host - ${IPs[$host]}";
# done
declare -A IPs
MASTER="hadoopmaster"
IPs=(["hadoopmaster"]="192.168.178.51" \
["hadoopworker01"]="192.168.178.52")
CURRENT_IP=$(ip route get 1 | awk '{print $NF;exit}')
MASTER_IP=$(host ${MASTER} | awk '/has address/ { print $4 }')
# Set IS_MASTER if currently on master hadoop node
[[ $CURRENT_IP = $MASTER_IP ]] && IS_MASTER=true || IS_MASTER=false
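# Optional: report which role this node detected for itself
echo "This node (${CURRENT_IP}) is master: ${IS_MASTER} (master ${MASTER} = ${MASTER_IP})"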
#######################################################
# Generate and replicate SSH keys.
#######################################################
# Generate a key only if one does not exist yet
[ -f $HOME/.ssh/id_rsa ] || ssh-keygen -f $HOME/.ssh/id_rsa -N '' -t rsa -b 4096 -C ${EMAIL}
for host in "${!IPs[@]}";
do echo $host;
# Make the hostname resolvable before copying the key to it
sudo bash -c "echo -e \"${IPs[$host]}\t${host}\" >> /etc/hosts"
ssh-copy-id -i $HOME/.ssh/id_rsa ${HADOOP_USER}@${host};
done
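# Optional check (sketch): every node should now accept key-based logins.
# BatchMode makes ssh fail instead of prompting if the key was not copied.
for host in "${!IPs[@]}";
do ssh -o BatchMode=yes ${HADOOP_USER}@${host} hostname;
done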
#######################################################
# Install Hadoop in the namenode
#######################################################
if ${IS_MASTER};
then
DIST=https://www-eu.apache.org/dist
curl -o /tmp/hadoop.tar.gz $DIST/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz
curl -o /tmp/hive.tar.gz https://archive.apache.org/dist/hive/hive-${HIVE_VERSION}/apache-hive-${HIVE_VERSION}-bin.tar.gz
curl -o /tmp/spark.tgz $DIST/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop2.7.tgz
sudo tar xvf /tmp/hadoop.tar.gz \
--directory=${HADOOP_HOME} \
--exclude=hadoop-${HADOOP_VERSION}/share/doc \
--strip 1
sudo tar xvf /tmp/hive.tar.gz \
--directory=${HIVE_HOME} \
--exclude=apache-hive-${HIVE_VERSION}-bin/ql/src/test \
--strip 1
sudo tar xzvf /tmp/spark.tgz \
--directory=${SPARK_HOME} \
--strip 1
fi
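# Optional check on the master (assumes the PATH from hadoop.bashrc is
# active in this shell): the unpacked distributions should report their
# versions.
if ${IS_MASTER};
then
hadoop version | head -n 1
${SPARK_HOME}/bin/spark-submit --version 2>&1 | head -n 5
fi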
# Copy the configuration files to the worker nodes (only from the master;
# ${HADOOP_CONF_DIR} is owned by ${HADOOP_USER}, so sudo is not needed).
# Alternative: scp -r $HADOOP_CONF_DIR ${HADOOP_USER}@${host}:${HADOOP_CONF_DIR}
if ${IS_MASTER};
then
for host in "${!IPs[@]}";
do [ $host != $MASTER ] && \
scp $HADOOP_CONF_DIR/{core-site.xml,hadoop-env.sh,hdfs-site.xml,mapred-site.xml,yarn-site.xml,workers} \
${HADOOP_USER}@${host}:${HADOOP_CONF_DIR}
done
fi
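# Note (assumption): the Hadoop tarball ships its default configuration
# under ${HADOOP_HOME}/etc/hadoop, while this setup keeps it in
# ${HADOOP_CONF_DIR}. A minimal sketch to relocate it before the cleanup
# below, if not already done:
# mv ${HADOOP_HOME}/etc/hadoop/* ${HADOOP_CONF_DIR}/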
rm ${HADOOP_HOME}/{sbin,bin}/*.cmd
rm ${HADOOP_CONF_DIR}/*.cmd
rmdir ${HADOOP_HOME}/etc
#######################################################
# Add the master and workers files
#######################################################
# Only in the Master node.
if ${IS_MASTER};
then
sudo bash -c "echo \"${MASTER}\" >> ${HADOOP_CONF_DIR}/master"
for host in "${!IPs[@]}";
do [ $host != $MASTER ] && \
sudo bash -c "echo \"${host}\" >> ${HADOOP_CONF_DIR}/workers"
done
fi
if ${IS_MASTER};
then sudo mkdir -p /opt/hdfs/namenode
else sudo mkdir -p /opt/hdfs/datanode
fi
#######################################################
# Copy the basic configuration to the slave nodes
#######################################################
if ${IS_MASTER};
then
for host in "${!IPs[@]}";
do [ $host != $MASTER ] && \
rsync -avxP ${HADOOP_HOME}/ ${HADOOP_USER}@${host}:${HADOOP_HOME}/
done
fi
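#######################################################
# Typical next steps (sketch, not part of this script):
# once core-site.xml, hdfs-site.xml and yarn-site.xml
# are in place on all nodes
#######################################################
# hdfs namenode -format          # run once, on the master only
# start-dfs.sh && start-yarn.sh  # scripts live in ${HADOOP_HOME}/sbin (on PATH)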