# Assumes you have Sun JDK installed already and JAVA_HOME set to that for root
# This is all basically a summary of various parts of https://ccp.cloudera.com/display/CDH4DOC/CDH4+Documentation
# Add Cloudera RPM-GPG-KEY and repo
rpm --import http://archive.cloudera.com/cdh4/redhat/6/x86_64/cdh/RPM-GPG-KEY-cloudera
rpm -ivh http://archive.cloudera.com/cdh4/one-click-install/redhat/6/x86_64/cloudera-cdh-4-0.x86_64.rpm
# Install CDH4 Base
yum install hadoop-0.20-conf-pseudo
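# (Optional) Quick sanity check that the pseudo-distributed config package and the daemon packages it pulls in actually installed
rpm -q hadoop-0.20-conf-pseudo
rpm -qa | grep ^hadoop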
# set JAVA_HOME in /etc/hadoop/conf/hadoop-env.sh if you can't use the system-wide Java runtime
# Update fs.default.name value to actual <hostname> in /etc/hadoop/conf/core-site.xml
sed -i 's/localhost/<hostname>/' /etc/hadoop/conf/core-site.xml
# Update mapred.job.tracker value to actual <hostname> in /etc/hadoop/conf/mapred-site.xml
sed -i 's/localhost/<hostname>/' /etc/hadoop/conf/mapred-site.xml
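# (Optional) Alternatively, something like the following picks up the machine's FQDN automatically
# instead of typing <hostname> by hand -- assumes `hostname -f` returns a name other hosts can resolve
sed -i "s/localhost/$(hostname -f)/" /etc/hadoop/conf/core-site.xml /etc/hadoop/conf/mapred-site.xml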
# Format the NameNode
sudo -u hdfs hdfs namenode -format
# Start HDFS
for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done
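# (Optional) Confirm HDFS actually came up -- dfsadmin should report one live datanode and non-zero configured capacity
sudo -u hdfs hdfs dfsadmin -report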
# do all of the following as the HDFS user
su - hdfs
# Create the HDFS /tmp directory before someone else does it and gets the perms wrong
hadoop fs -mkdir /tmp
hadoop fs -chmod -R 1777 /tmp
# Create and permission the MapReduce system directories
hadoop fs -mkdir /var
hadoop fs -mkdir /var/lib
hadoop fs -mkdir /var/lib/hadoop-hdfs
hadoop fs -mkdir /var/lib/hadoop-hdfs/cache
hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred
hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred/mapred
hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
hadoop fs -chmod 1777 /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
hadoop fs -chown -R mapred /var/lib/hadoop-hdfs/cache/mapred
# Verify the HDFS File Structure
hadoop fs -ls -R /
# Should look as follows:
# drwxrwxrwt - hdfs supergroup 0 2012-04-19 15:14 /tmp
# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var
# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var/lib
# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var/lib/hadoop-hdfs
# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var/lib/hadoop-hdfs/cache
# drwxr-xr-x - mapred supergroup 0 2012-04-19 15:19 /var/lib/hadoop-hdfs/cache/mapred
# drwxr-xr-x - mapred supergroup 0 2012-04-19 15:29 /var/lib/hadoop-hdfs/cache/mapred/mapred
# drwxrwxrwt - mapred supergroup 0 2012-04-19 15:33 /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
# change back to root
exit
# Start MapReduce
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done
# Check everything worked: run '$JAVA_HOME/bin/jps' and look for the following processes
DataNode
NameNode
SecondaryNameNode
JobTracker
TaskTracker
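# (Optional) Smoke-test MapReduce with the bundled examples jar -- the path below is where the
# CDH4 MRv1 package normally puts it, adjust if yours differs. Running as the hdfs superuser
# sidesteps the need for a personal HDFS home directory at this point.
sudo -u hdfs hadoop jar /usr/lib/hadoop-0.20-mapreduce/hadoop-examples.jar pi 2 100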
#
# Zookeeper
#
# Install zookeeper server package
# Base package was already installed by hadoop-0.20-conf-pseudo
yum install zookeeper-server
# set JAVA_HOME in /usr/lib/zookeeper/bin/zkEnv.sh if you can't use the system-wide Java runtime
# Initialise Zookeeper
service zookeeper-server init --myid=1
# Start zookeeper
service zookeeper-server start
# Check everything worked: run '$JAVA_HOME/bin/jps' and look for the following process
QuorumPeerMain
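# (Optional) Poke ZooKeeper directly with the 'ruok' four-letter command -- a healthy server
# answers 'imok'. Assumes nc (netcat) is installed and the default client port 2181.
echo ruok | nc localhost 2181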
#
# HBase
#
# set dfs.datanode.max.xcievers in /etc/hadoop/conf/hdfs-site.xml (yes, it's actually misspelled)
# Insert the following XML property between the <configuration> and </configuration> tags
<property>
  <name>dfs.datanode.max.xcievers</name>
  <value>4096</value>
</property>
# restart hdfs
for service in /etc/init.d/hadoop-hdfs-* ; do $service restart; done
# Install HBase
yum install hbase-master hbase-regionserver
# Modify /etc/hbase/conf/hbase-site.xml
# Be sure to change <hostname> to your actual hostname
# Insert the following XML properties between the <configuration> and </configuration> tags
<property>
  <name>hbase.cluster.distributed</name>
  <value>true</value>
</property>
<property>
  <name>hbase.rootdir</name>
  <value>hdfs://<hostname>:8020/hbase</value>
</property>
# Create the /hbase directory in HDFS
sudo -u hdfs hadoop fs -mkdir /hbase
sudo -u hdfs hadoop fs -chown hbase /hbase
# set JAVA_HOME in /etc/hbase/conf/hbase-env.sh if you can't use the system-wide Java runtime
# Start HBase master
service hbase-master start
# start region server
service hbase-regionserver start
# Check everything worked: run '$JAVA_HOME/bin/jps' and look for the following processes
HRegionServer
HMaster
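# (Optional) Smoke-test HBase from the shell -- creates a throwaway table, lists it, then drops it.
# The table and column family names here are just examples.
hbase shell <<'EOF'
create 'smoketest', 'cf'
list
disable 'smoketest'
drop 'smoketest'
EOF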
#
# We're done: running '$JAVA_HOME/bin/jps | sort' should show all of the following processes (order doesn't matter)
#
JobTracker
TaskTracker
QuorumPeerMain
DataNode
NameNode
SecondaryNameNode
HMaster
HRegionServer
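# (Optional) A small loop that does the same comparison automatically -- prints MISSING for any
# daemon that isn't in the jps output
for proc in JobTracker TaskTracker QuorumPeerMain DataNode NameNode SecondaryNameNode HMaster HRegionServer; do
  $JAVA_HOME/bin/jps | grep -q " $proc$" && echo "OK $proc" || echo "MISSING $proc"
done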
#
# Additional notes
#
# - User setup
# Create Linux users and corresponding HDFS home directories as needed
useradd -m -U <user>
sudo -u hdfs hadoop fs -mkdir /user/<user>
sudo -u hdfs hadoop fs -chown <user> /user/<user>
# - Shutting down / Starting up
# Order matters! To shutdown, do the following:
for service in /etc/init.d/hbase-*; do $service stop; done
service zookeeper-server stop
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service stop; done
for service in /etc/init.d/hadoop-hdfs-* ; do $service stop; done
#
# Then to start back up:
#
for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done
service zookeeper-server start
for service in /etc/init.d/hbase-*; do $service start; done
# - Disk Space
# Depending on your usage and your Linux environment, you may run out of disk space quickly.
# To fix this, moving /var/lib/hadoop-hdfs onto a dedicated file system is a good start.
# Doing the same for /var/log/hadoop-hdfs isn't a bad idea either.
# Make sure you retain (or duplicate) the exact same file system permissions in the new location if you do this.
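# A rough sketch of the data-directory move, assuming the new file system is already mounted at
# /data/hadoop-hdfs (that path is just an example). Stop the full stack first, in the order shown
# below under "Shutting down". The bind mount keeps the Hadoop config untouched; add a matching
# /etc/fstab entry so it survives a reboot. The same approach works for /var/log/hadoop-hdfs.
for service in /etc/init.d/hadoop-hdfs-* ; do $service stop; done
rsync -a /var/lib/hadoop-hdfs/ /data/hadoop-hdfs/     # -a preserves ownership and permissions
mv /var/lib/hadoop-hdfs /var/lib/hadoop-hdfs.old      # keep the original until you've verified
mkdir /var/lib/hadoop-hdfs
mount --bind /data/hadoop-hdfs /var/lib/hadoop-hdfs
for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done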
# - Hadoop service accounts
# If you want to be picky about the UIDs/GIDs of the Hadoop service accounts, then do the following before installing the Cloudera RPMs... otherwise just let the RPMs pick them for you.
# Common Hadoop group
groupadd --gid 6666 hadoop
# Map Reduce
groupadd --gid 6667 mapred
useradd --no-create-home --home-dir /usr/lib/hadoop-0.20-mapreduce --shell /bin/bash --uid 6667 --gid mapred --groups hadoop --comment "Hadoop MapReduce" mapred
# HDFS
groupadd --gid 6668 hdfs
useradd --no-create-home --home-dir /usr/lib/hadoop-hdfs --shell /bin/bash --uid 6668 --gid hdfs --groups hadoop --comment "Hadoop HDFS" hdfs
# Zookeeper
groupadd --gid 6669 zookeeper
useradd --no-create-home --home-dir /var/run/zookeeper --shell /sbin/nologin --uid 6669 --gid zookeeper --comment "Zookeeper" zookeeper
# HBase
groupadd --gid 6670 hbase
useradd --no-create-home --home-dir /var/run/hbase --shell /sbin/nologin --uid 6670 --gid hbase --comment "HBase" hbase
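# (Optional) Double-check the accounts and groups came out with the intended IDs
getent passwd hdfs mapred zookeeper hbase
getent group hadoop hdfs mapred zookeeper hbase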