CDH4 MRv1 install in pseudo-distributed mode, plus optional ZooKeeper, HBase, and HttpFS. Assumes you have a Sun JDK installed, with JAVA_HOME set for root if it is installed somewhere other than /usr/java/. This is essentially a summary of various parts of https://ccp.cloudera.com/display/CDH4DOC/CDH4+Documentation
# Add Cloudera RPM-GPG-KEY and repo
rpm --import http://archive.cloudera.com/cdh4/redhat/6/x86_64/cdh/RPM-GPG-KEY-cloudera
rpm -ivh http://archive.cloudera.com/cdh4/one-click-install/redhat/6/x86_64/cloudera-cdh-4-0.x86_64.rpm
# note: if you want to install a specific version,
# modify /etc/yum.repos.d/cloudera-cdh4.repo accordingly.
# For example, if you want to install 4.2.1, use the following baseurl:
# baseurl=http://archive.cloudera.com/cdh4/redhat/6/x86_64/cdh/4.2.1/
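# For instance, a minimal sketch of pinning the repo to 4.2.1 with sed
# (this assumes the stock repo file's baseurl ends in .../cdh/4/ - check yours first):
sed -i 's#cdh/4/#cdh/4.2.1/#' /etc/yum.repos.d/cloudera-cdh4.repo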
# Install CDH4 Base
yum -y install hadoop-0.20-conf-pseudo
# if you can't use the system-wide Java runtime, create a file /etc/hadoop/conf/hadoop-env.sh
# and define JAVA_HOME in there
# ensure DNS is working, or that you have an entry in /etc/hosts for <hostname.fqdn>
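# For example, a quick sanity check that forward lookup of the FQDN works
# (assumes bind-utils is installed for the 'host' command):
host $(hostname -f)
# or add a static entry yourself - the address and names below are placeholders:
echo '192.0.2.10 node1.example.com node1' >> /etc/hosts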
# Update fs.default.name value to actual <hostname.fqdn> in /etc/hadoop/conf/core-site.xml
sed -i 's/localhost/<hostname.fqdn>/' /etc/hadoop/conf/core-site.xml
# Update mapred.job.tracker value to actual <hostname.fqdn> in /etc/hadoop/conf/mapred-site.xml
sed -i 's/localhost/<hostname.fqdn>/' /etc/hadoop/conf/mapred-site.xml
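# If 'hostname -f' already returns the FQDN you want, a non-interactive
# variant of the two sed commands above would be:
sed -i "s/localhost/$(hostname -f)/" /etc/hadoop/conf/core-site.xml
sed -i "s/localhost/$(hostname -f)/" /etc/hadoop/conf/mapred-site.xml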
# make directories for hadoop, including the data directory's parent, and
# grant ownership - run the chown last so /opt/hadoop/dfs ends up owned by
# hdfs, which the DataNode needs in order to create /opt/hadoop/dfs/data
mkdir -p /opt/hadoop/dfs
chown -R hdfs:hadoop /opt/hadoop
# update /etc/hadoop/conf/hdfs-site.xml
# Insert the following XML properties between the <configuration> and </configuration> tags
<property>
<name>dfs.namenode.name.dir</name>
<value>file:///opt/hadoop/nn</value>
</property>
<property>
<name>dfs.namenode.checkpoint.dir</name>
<value>file:///opt/hadoop/snn</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:///opt/hadoop/dfs/data</value>
</property>
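# optionally confirm the values were picked up - 'hdfs getconf' reads the
# on-disk config, so this should work even before any daemons are running:
hdfs getconf -confKey dfs.namenode.name.dir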
# move hadoop.tmp directory
mv /var/lib/hadoop-hdfs/cache /opt/hadoop/
ln -s /opt/hadoop/cache /var/lib/hadoop-hdfs/cache
# move logs
mkdir -p /opt/hadoop/log
mv /var/log/hadoop-0.20-mapreduce /opt/hadoop/log/
mv /var/log/hadoop-hdfs /opt/hadoop/log/
mv /var/log/zookeeper /opt/hadoop/log/
ln -s /opt/hadoop/log/hadoop-0.20-mapreduce /var/log/hadoop-0.20-mapreduce
ln -s /opt/hadoop/log/hadoop-hdfs /var/log/hadoop-hdfs
ln -s /opt/hadoop/log/zookeeper /var/log/zookeeper
# Format the NameNode
sudo -u hdfs hdfs namenode -format
# Start HDFS
for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done
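# quick liveness check - this should return cleanly (it may print nothing
# yet, since the filesystem is still empty):
sudo -u hdfs hadoop fs -ls /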
# do all of the following as the HDFS user
su - hdfs
# Create the HDFS /tmp directory before someone else does it and gets the perms wrong
hadoop fs -mkdir /tmp
hadoop fs -chmod -R 1777 /tmp
# Create and permission the hadoop.tmp directories for mapred user
hadoop fs -mkdir -p /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
hadoop fs -chmod 1777 /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
hadoop fs -chown -R mapred /var/lib/hadoop-hdfs/cache/mapred
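# eyeball the permissions before moving on (expect drwxrwxrwt on /tmp and
# the staging directory, with the latter owned by mapred):
hadoop fs -ls /
hadoop fs -ls /var/lib/hadoop-hdfs/cache/mapred/mapred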
# change back to root
exit
# Start MapReduce
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done
# Check everything worked, run '$JAVA_HOME/bin/jps' and look for the following processes
DataNode
NameNode
SecondaryNameNode
JobTracker
TaskTracker
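# optionally run a quick smoke-test job - the examples jar path below is
# CDH4's usual MRv1 location, adjust if your layout differs:
sudo -u hdfs hadoop jar /usr/lib/hadoop-0.20-mapreduce/hadoop-examples.jar pi 2 100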
#
# Zookeeper
#
# Install zookeeper server package
# Base package was already installed by hadoop-0.20-conf-pseudo
yum -y install zookeeper-server
# set JAVA_HOME in /usr/lib/zookeeper/bin/zkEnv.sh if you can't use the system-wide Java runtime
# Initialise Zookeeper
service zookeeper-server init --myid=1
# update zookeeper snapshot directory
mv /var/lib/zookeeper /opt/hadoop/
ln -s /opt/hadoop/zookeeper /var/lib/zookeeper
# Start zookeeper
service zookeeper-server start
# Check everything worked, run '$JAVA_HOME/bin/jps' and look for the following process
QuorumPeerMain
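# a quick liveness check using ZooKeeper's standard 'ruok' four-letter
# command on the default client port (expects the reply 'imok'; assumes
# nc/netcat is installed):
echo ruok | nc localhost 2181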
#
# HBase
#
# set dfs.datanode.max.xcievers in /etc/hadoop/conf/hdfs-site.xml (yes, it's actually misspelled)
# Insert the following XML property between the <configuration> and </configuration> tags
<property>
<name>dfs.datanode.max.xcievers</name>
<value>4096</value>
</property>
# restart mapred & hdfs
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service stop; done
for service in /etc/init.d/hadoop-hdfs-* ; do $service restart; done
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done
# Install HBase
yum -y install hbase-master hbase-regionserver
# tell hbase to use standalone zookeeper
# add the following to /etc/hbase/conf/hbase-env.sh
export HBASE_MANAGES_ZK=false
# Modify /etc/hbase/conf/hbase-site.xml
# Be sure to change <hostname.fqdn> to your actual fully qualified hostname
# Insert the following XML properties between the <configuration> and </configuration> tags
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<property>
<name>hbase.rootdir</name>
<value>hdfs://<hostname.fqdn>:8020/hbase</value>
</property>
# Create the /hbase directory in HDFS
sudo -u hdfs hadoop fs -mkdir /hbase
sudo -u hdfs hadoop fs -chown hbase /hbase
# set JAVA_HOME in /etc/hbase/conf/hbase-env.sh if you can't use the system-wide Java runtime
# move logs
mv /var/log/hbase /opt/hadoop/log/
ln -s /opt/hadoop/log/hbase /var/log/hbase
# Start HBase
for service in /etc/init.d/hbase-*; do $service start; done
# Check everything worked, run '$JAVA_HOME/bin/jps' and look for the following processes
HRegionServer
HMaster
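# optionally smoke-test via the HBase shell - this creates and drops a
# throwaway table (the table and column family names below are arbitrary):
echo -e "create 'smoketest', 'cf'\nlist\ndisable 'smoketest'\ndrop 'smoketest'\nexit" | hbase shell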
#
# Httpfs
#
# Install httpfs
yum -y install hadoop-httpfs
# move logs
mv /var/log/hadoop-httpfs /opt/hadoop/log/
ln -s /opt/hadoop/log/hadoop-httpfs /var/log/hadoop-httpfs
# export JAVA_HOME in /etc/hadoop-httpfs/conf/httpfs-env.sh if you can't use the system-wide Java runtime
# Start httpfs - you should see some fairly typical Tomcat startup messages
service hadoop-httpfs start
# Check everything worked, run '$JAVA_HOME/bin/jps' and look for the following process
Bootstrap
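# you can also hit the REST API directly - HttpFS listens on port 14000 by
# default and speaks the webhdfs v1 protocol:
curl "http://localhost:14000/webhdfs/v1/?op=LISTSTATUS&user.name=hdfs"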
#
# we're done, running '$JAVA_HOME/bin/jps' should show all of the following processes
# (order doesn't matter)
#
Bootstrap
DataNode
NameNode
SecondaryNameNode
JobTracker
TaskTracker
QuorumPeerMain
HMaster
HRegionServer
#
# you can also point a web browser at <hostname> ports 50030 (JobTracker), 50060 (TaskTracker),
# 50070 (NameNode), 50090 (SecondaryNameNode), 60010 (HBase Master) and 60030 (RegionServer)
#
#
# Additional notes
#
# - User setup
# Create Linux users and corresponding HDFS home directories as needed
useradd -m -U <user>
sudo -u hdfs hadoop fs -mkdir /user/<user>
sudo -u hdfs hadoop fs -chown <user> /user/<user>
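# e.g. for a hypothetical user 'alice':
useradd -m -U alice
sudo -u hdfs hadoop fs -mkdir /user/alice
sudo -u hdfs hadoop fs -chown alice /user/alice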
# - Shutting down / Starting up
#
# Order matters! To shutdown, do the following:
#
for service in /etc/init.d/hbase-*; do $service stop; done
service zookeeper-server stop
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service stop; done
for service in /etc/init.d/hadoop-hdfs-* ; do $service stop; done
#
# Then to start back up:
#
for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done
service zookeeper-server start
for service in /etc/init.d/hbase-*; do $service start; done
# - Hadoop service accounts
# If you want to be picky with the GIDs/UIDs of the Hadoop service accounts, do
# the following before installing the Cloudera RPMs... otherwise just let the RPMs do it for you.
# Common Hadoop group
groupadd --gid 6666 hadoop
# Map Reduce
groupadd --gid 6667 mapred
useradd --no-create-home --home-dir /usr/lib/hadoop-0.20-mapreduce \
--shell /bin/bash --uid 6667 --gid mapred --groups hadoop --comment "Hadoop MapReduce" mapred
# HDFS
groupadd --gid 6668 hdfs
useradd --no-create-home --home-dir /usr/lib/hadoop-hdfs \
--shell /bin/bash --uid 6668 --gid hdfs --groups hadoop --comment "Hadoop HDFS" hdfs
# Zookeeper
groupadd --gid 6669 zookeeper
useradd --no-create-home --home-dir /var/run/zookeeper \
--shell /sbin/nologin --uid 6669 --gid zookeeper --comment "Zookeeper" zookeeper
# HBase
groupadd --gid 6670 hbase
useradd --no-create-home --home-dir /var/run/hbase \
--shell /sbin/nologin --uid 6670 --gid hbase --comment "HBase" hbase