CDH4 MRv1 Pseudo-distributed Everything Installation (CentOS / RHEL 6.x, 64-bit)
# Assumes you have Sun JDK installed already and JAVA_HOME set to that for root
# This is all basically a summary of various parts of https://ccp.cloudera.com/display/CDH4DOC/CDH4+Documentation
# Add Cloudera RPM-GPG-KEY and repo
rpm --import http://archive.cloudera.com/cdh4/redhat/6/x86_64/cdh/RPM-GPG-KEY-cloudera
rpm -ivh http://archive.cloudera.com/cdh4/one-click-install/redhat/6/x86_64/cloudera-cdh-4-0.x86_64.rpm
# Install CDH4 Base
yum install hadoop-0.20-conf-pseudo
# set JAVA_HOME in /etc/hadoop/conf/hadoop-env.sh if you can't use the system-wide Java runtime
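# e.g. (a sketch only; /usr/java/default is just an assumption, point it at wherever your JDK actually lives):
#   echo 'export JAVA_HOME=/usr/java/default' >> /etc/hadoop/conf/hadoop-env.sh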
# Update fs.default.name value to actual <hostname> in /etc/hadoop/conf/core-site.xml
sed -i 's/localhost/<hostname>/' /etc/hadoop/conf/core-site.xml
# Update mapred.job.tracker value to actual <hostname> in /etc/hadoop/conf/mapred-site.xml
sed -i 's/localhost/<hostname>/' /etc/hadoop/conf/mapred-site.xml
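# Alternatively, a sketch that fills the hostname in automatically, assuming `hostname -f` returns the FQDN you want in the configs:
#   sed -i "s/localhost/$(hostname -f)/" /etc/hadoop/conf/core-site.xml /etc/hadoop/conf/mapred-site.xml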
# Format the NameNode
sudo -u hdfs hdfs namenode -format
# Start HDFS
for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done
# do all of the following as the HDFS user
su - hdfs
# Create the HDFS /tmp directory before someone else does it and gets the perms wrong
hadoop fs -mkdir /tmp
hadoop fs -chmod -R 1777 /tmp
# Create and permission the MapReduce system directories
hadoop fs -mkdir /var
hadoop fs -mkdir /var/lib
hadoop fs -mkdir /var/lib/hadoop-hdfs
hadoop fs -mkdir /var/lib/hadoop-hdfs/cache
hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred
hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred/mapred
hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
hadoop fs -chmod 1777 /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
hadoop fs -chown -R mapred /var/lib/hadoop-hdfs/cache/mapred
# Verify the HDFS File Structure
hadoop fs -ls -R /
# Should look as follows:
# drwxrwxrwt - hdfs supergroup 0 2012-04-19 15:14 /tmp
# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var
# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var/lib
# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var/lib/hadoop-hdfs
# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var/lib/hadoop-hdfs/cache
# drwxr-xr-x - mapred supergroup 0 2012-04-19 15:19 /var/lib/hadoop-hdfs/cache/mapred
# drwxr-xr-x - mapred supergroup 0 2012-04-19 15:29 /var/lib/hadoop-hdfs/cache/mapred/mapred
# drwxrwxrwt - mapred supergroup 0 2012-04-19 15:33 /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
# change back to root
exit
# Start MapReduce
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done
# Check everything worked, run '$JAVA_HOME/bin/jps' and look for the following processes
DataNode
NameNode
SecondaryNameNode
JobTracker
TaskTracker
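# Optional one-liner version of that check (a sketch; assumes JAVA_HOME is set in the current shell):
$JAVA_HOME/bin/jps | egrep 'DataNode|NameNode|SecondaryNameNode|JobTracker|TaskTracker'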
#
# Zookeeper
#
# Install zookeeper server package
# Base package was already installed by hadoop-0.20-conf-pseudo
yum install zookeeper-server
# set JAVA_HOME in /usr/lib/zookeeper/bin/zkEnv.sh if you can't use the system-wide Java runtime
# Initialise Zookeeper
service zookeeper-server init --myid=1
# Start zookeeper
service zookeeper-server start
# Check everything worked, run '$JAVA_HOME/bin/jps' and look for the following process
QuorumPeerMain
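# Optional sanity check beyond jps: ZooKeeper's standard "ruok" four-letter command should answer "imok"
# (a sketch; assumes the nc/netcat package is installed and ZooKeeper is listening on the default port 2181)
echo ruok | nc localhost 2181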
#
# HBase
#
# set dfs.datanode.max.xcievers in /etc/hadoop/conf/hdfs-site.xml (yes, it's actually misspelled)
# Insert the following XML property between the <configuration> and </configuration> tags
<property>
  <name>dfs.datanode.max.xcievers</name>
  <value>4096</value>
</property>
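# One way to script that edit (a sketch only; assumes GNU sed and that hdfs-site.xml still contains its original </configuration> tag):
#   sed -i 's|</configuration>|  <property>\n    <name>dfs.datanode.max.xcievers</name>\n    <value>4096</value>\n  </property>\n</configuration>|' /etc/hadoop/conf/hdfs-site.xml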
# restart hdfs
for service in /etc/init.d/hadoop-hdfs-* ; do $service restart; done
# Install HBase
yum install hbase-master hbase-regionserver
# Modify /etc/hbase/conf/hbase-site.xml
# Be sure to change <hostname> to your actual hostname
# Insert the following XML properties between the <configuration> and </configuration> tags
<property>
  <name>hbase.cluster.distributed</name>
  <value>true</value>
</property>
<property>
  <name>hbase.rootdir</name>
  <value>hdfs://<hostname>:8020/hbase</value>
</property>
# Create the /hbase directory in HDFS
sudo -u hdfs hadoop fs -mkdir /hbase
sudo -u hdfs hadoop fs -chown hbase /hbase
# set JAVA_HOME in /etc/hbase/conf/hbase-env.sh if you can't use the system-wide Java runtime
# Start HBase master
service hbase-master start
# start region server
service hbase-regionserver start
# Check everything worked, run '$JAVA_HOME/bin/jps' and look for the following processes
HRegionServer
HMaster
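# Optional smoke test: the HBase shell's "status" command should report 1 live server
# (a sketch; assumes the master has finished initializing)
echo "status" | hbase shell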
#
# we're done, running '$JAVA_HOME/bin/jps | sort' should show all of the following processes (order doesn't matter)
#
JobTracker
TaskTracker
QuorumPeerMain
DataNode
NameNode
SecondaryNameNode
HMaster
HRegionServer
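# A rough scripted version of that check (a sketch; it just counts jps lines matching the eight expected daemon names):
test "$($JAVA_HOME/bin/jps | egrep -c 'JobTracker|TaskTracker|QuorumPeerMain|DataNode|NameNode|SecondaryNameNode|HMaster|HRegionServer')" -eq 8 && echo "all 8 daemons are up"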
#
# Additional notes
#
#
# - User setup
# Create Linux users and corresponding HDFS home directories as needed
useradd -m -U <user>
sudo -u hdfs hadoop fs -mkdir /user/<user>
sudo -u hdfs hadoop fs -chown <user> /user/<user>
# - Shutting down / Starting up
# Order matters! To shutdown, do the following:
for service in /etc/init.d/hbase-*; do $service stop; done
service zookeeper-server stop
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service stop; done
for service in /etc/init.d/hadoop-hdfs-* ; do $service stop; done
#
# Then to start back up:
#
for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done
for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done
service zookeeper-server start
for service in /etc/init.d/hbase-*; do $service start; done
# - Disk Space
# Depending on your usage and your Linux environment, you may run out of disk space quickly.
# To fix this, moving /var/lib/hadoop-hdfs onto a dedicated file system is a good start.
# Doing the same for /var/log/hadoop-hdfs isn't a bad idea either.
# Make sure you retain (or duplicate) the exact same file system permissions in the new location if you do this.
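# A rough sketch of such a move, assuming everything has been shut down first (in the shutdown order above)
# and the new file system is already mounted at /data/hadoop-hdfs (a path chosen here only for illustration):
#   cp -a /var/lib/hadoop-hdfs/. /data/hadoop-hdfs/
#   mv /var/lib/hadoop-hdfs /var/lib/hadoop-hdfs.orig
#   ln -s /data/hadoop-hdfs /var/lib/hadoop-hdfs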
# - Hadoop service accounts
# If you want to be picky about the UIDs/GIDs of the Hadoop service accounts, create them as follows before installing the Cloudera RPMs; otherwise just let the RPMs create them for you.
# Common Hadoop group
groupadd --gid 6666 hadoop
# Map Reduce
groupadd --gid 6667 mapred
useradd --no-create-home --home-dir /usr/lib/hadoop-0.20-mapreduce --shell /bin/bash --uid 6667 --gid mapred --groups hadoop --comment "Hadoop MapReduce" mapred
# HDFS
groupadd --gid 6668 hdfs
useradd --no-create-home --home-dir /usr/lib/hadoop-hdfs --shell /bin/bash --uid 6668 --gid hdfs --groups hadoop --comment "Hadoop HDFS" hdfs
# Zookeeper
groupadd --gid 6669 zookeeper
useradd --no-create-home --home-dir /var/run/zookeeper --shell /sbin/nologin --uid 6669 --gid zookeeper --comment "Zookeeper" zookeeper
# HBase
groupadd --gid 6670 hbase
useradd --no-create-home --home-dir /var/run/hbase --shell /sbin/nologin --uid 6670 --gid hbase --comment "HBase" hbase