Spark installation on YARN
#!/bin/bash -ex
SPARK_VERSION="2.1.0-bin-hadoop2.7"
HADOOP_VERSION="2.7.3"
# Strip a Windows-style DOMAIN\user prefix, keeping only the bare username
SHORT_USER=$(echo $USER | cut -d \\ -f2)
# Append a variable definition to /etc/environment on a remote host
function add_to_env {
  host=$1
  value=$2
  ssh $host "echo '$value' | sudo tee -a /etc/environment > /dev/null"
}
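# Example (hypothetical host): add_to_env node1 'FOO="bar"'
# appends the line FOO="bar" to /etc/environment on node1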
# Function to add properties to a configuration file on a remote host
function add_property {
  host=$1
  name=$2
  value=$3
  file=$4
  ssh $host "sudo xmlstarlet ed -L \
    -s '/configuration' -t elem -n property --var new-field '\$prev' \
    -s '\$new-field' -t elem -n name -v $name \
    -s '\$new-field' -t elem -n value -v $value \
    $file"
}
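# Example (hypothetical host): add_property node1 fs.defaultFS hdfs://node1 core-site.xml
# appends the following element under <configuration> in core-site.xml:
#   <property><name>fs.defaultFS</name><value>hdfs://node1</value></property>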
# Check for correct arguments (at least one slave is required)
if [ "$#" -lt 3 ]; then
  echo "Usage: $0 NameNode ResourceManager slave1 [slave2 ...]" > /dev/stderr
  exit 1
fi
echo "Downloading tarballs" > /dev/stderr | |
wget -P ~ -c http://apache.mirrors.spacedump.net/hadoop/common/stable/hadoop-$HADOOP_VERSION.tar.gz | |
wget -P ~ -c http://d3kbcqa49mib13.cloudfront.net/spark-$SPARK_VERSION.tgz | |
# Get hostnames
namenode="$1"
resourcemanager="$2"
for host in "$@"; do
  echo "Installing on $host..." > /dev/stderr
  # Enable /etc/environment with sudo
  ssh $host "echo 'session required pam_env.so readenv=1' | sudo tee -a /etc/pam.d/sudo > /dev/null"
  # Properly configure the environment (JAVA_HOME assumes the Oracle Java 8 package)
  add_to_env $host "JAVA_HOME=\"/usr/lib/jvm/java-8-oracle/\""
  add_to_env $host "HADOOP_HOME=\"/opt/hadoop-$HADOOP_VERSION\""
  add_to_env $host "HADOOP_PREFIX=\"/opt/hadoop-$HADOOP_VERSION\""
  add_to_env $host "HADOOP_COMMON_HOME=\"/opt/hadoop-$HADOOP_VERSION\""
  add_to_env $host "HADOOP_CONF_DIR=\"/opt/hadoop-$HADOOP_VERSION/etc/hadoop\""
  add_to_env $host "HADOOP_HDFS_HOME=\"/opt/hadoop-$HADOOP_VERSION\""
  add_to_env $host "HADOOP_YARN_HOME=\"/opt/hadoop-$HADOOP_VERSION\""
  add_to_env $host "SPARK_HOME=\"/opt/spark-$SPARK_VERSION\""
  # Remove the host's own /etc/hosts entry; this step is required so
  # daemons listen on the correct interface
  ssh $host sudo sed -i "/$host/d" /etc/hosts
  # Extract the tarballs from the (shared) home directory into /opt
  ssh $host sudo tar zxf spark-$SPARK_VERSION.tgz -C /opt
  ssh $host sudo tar zxf hadoop-$HADOOP_VERSION.tar.gz -C /opt
  # Install xmlstarlet to make manipulating configs easier
  ssh $host sudo apt-get update -qq && ssh $host sudo apt-get install -qq xmlstarlet
  # Create fresh HDFS directories
  ssh $host sudo rm -rf /ssd1/$SHORT_USER/hdfs
  ssh $host sudo mkdir -p /ssd1/$SHORT_USER/hdfs/datanode
  ssh $host sudo mkdir -p /ssd1/$SHORT_USER/hdfs/namenode
  ssh $host sudo mkdir -p /ssd1/$SHORT_USER/tmp
  add_property $host \
    dfs.datanode.data.dir \
    file:///ssd1/$SHORT_USER/hdfs/datanode \
    /opt/hadoop-$HADOOP_VERSION/etc/hadoop/hdfs-site.xml
  add_property $host \
    dfs.namenode.name.dir \
    file:///ssd1/$SHORT_USER/hdfs/namenode \
    /opt/hadoop-$HADOOP_VERSION/etc/hadoop/hdfs-site.xml
  # Allow DataNodes to register even if their IPs do not resolve to hostnames
  add_property $host \
    dfs.namenode.datanode.registration.ip-hostname-check \
    false \
    /opt/hadoop-$HADOOP_VERSION/etc/hadoop/hdfs-site.xml
  add_property $host \
    hadoop.tmp.dir \
    /ssd1/$SHORT_USER/tmp \
    /opt/hadoop-$HADOOP_VERSION/etc/hadoop/core-site.xml
  # Set the NameNode and ResourceManager
  add_property $host \
    fs.defaultFS \
    hdfs://$namenode \
    /opt/hadoop-$HADOOP_VERSION/etc/hadoop/core-site.xml
  add_property $host \
    yarn.resourcemanager.hostname \
    $resourcemanager \
    /opt/hadoop-$HADOOP_VERSION/etc/hadoop/yarn-site.xml
  # Configure YARN resource limits (48 GiB and 10 vcores per NodeManager)
  add_property $host \
    yarn.nodemanager.resource.memory-mb \
    49152 \
    /opt/hadoop-$HADOOP_VERSION/etc/hadoop/yarn-site.xml
  add_property $host \
    yarn.nodemanager.resource.cpu-vcores \
    10 \
    /opt/hadoop-$HADOOP_VERSION/etc/hadoop/yarn-site.xml
  # Disable the virtual memory check, which can spuriously kill containers
  add_property $host \
    yarn.nodemanager.vmem-check-enabled \
    false \
    /opt/hadoop-$HADOOP_VERSION/etc/hadoop/yarn-site.xml
done
# Remove non-slaves from arguments
shift 2
# Format HDFS and start the NameNode
echo "Starting NameNode on $namenode" > /dev/stderr
ssh $namenode sudo /opt/hadoop-$HADOOP_VERSION/bin/hdfs namenode -format
ssh $namenode sudo /opt/hadoop-$HADOOP_VERSION/sbin/hadoop-daemon.sh start namenode
# Start DataNodes
for slave in "$@"; do
  echo "Starting DataNode on $slave" > /dev/stderr
  ssh $slave sudo /opt/hadoop-$HADOOP_VERSION/sbin/hadoop-daemon.sh start datanode
done
# Create a directory for Spark event logs
ssh $namenode sudo /opt/hadoop-$HADOOP_VERSION/bin/hdfs dfs -mkdir /spark-logs
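# Note: Spark must still be pointed at this directory; a minimal sketch for
# $SPARK_HOME/conf/spark-defaults.conf (assuming event logging is wanted):
#   spark.eventLog.enabled true
#   spark.eventLog.dir     hdfs://$namenode/spark-logs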
# Start ResourceManager
echo "Starting ResourceManager on $resourcemanager" > /dev/stderr
ssh $resourcemanager sudo /opt/hadoop-$HADOOP_VERSION/sbin/yarn-daemon.sh start resourcemanager
# Start NodeManagers
for slave in "$@"; do
  echo "Starting NodeManager on $slave" > /dev/stderr
  ssh $slave sudo /opt/hadoop-$HADOOP_VERSION/sbin/yarn-daemon.sh start nodemanager
done
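A quick smoke test once the script finishes (hypothetical hostnames; assumes
the script is saved as install-spark-yarn.sh and that passwordless SSH and
sudo work on every host):

  ./install-spark-yarn.sh node1 node1 node2 node3
  ssh node1 sudo /opt/hadoop-2.7.3/bin/hdfs dfsadmin -report
  ssh node1 sudo /opt/hadoop-2.7.3/bin/yarn node -list
  ssh node1 sudo /opt/spark-2.1.0-bin-hadoop2.7/bin/spark-submit --master yarn \
    --class org.apache.spark.examples.SparkPi \
    /opt/spark-2.1.0-bin-hadoop2.7/examples/jars/spark-examples_2.11-2.1.0.jar 100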