javadba · March 3, 2013 22:48
diff --git a/gistfile1.txt b/gistfile1.txt
 # Assumes you have Sun JDK installed already and JAVA_HOME set to that for root
 # This is all basically a summary of various parts of https://ccp.cloudera.com/display/CDH4DOC/CDH4+Documentation

 # Add Cloudera RPM-GPG-KEY and repo
 rpm --import http://archive.cloudera.com/cdh4/redhat/6/x86_64/cdh/RPM-GPG-KEY-cloudera
 rpm -ivh http://archive.cloudera.com/cdh4/one-click-install/redhat/6/x86_64/cloudera-cdh-4-0.x86_64.rpm

 # Install CDH4 Base
 yum install hadoop-0.20-conf-pseudo

 # set JAVA_HOME in /etc/hadoop/conf/hadoop-env.sh if you can't use the system-wide Java runtime

 # Update fs.default.name value to actual <hostname> in /etc/hadoop/conf/core-site.xml
 sed -i 's/localhost/<hostname>/' /etc/hadoop/conf/core-site.xml 

 # Update mapred.job.tracker value to actual <hostname> in /etc/hadoop/conf/mapred-site.xml
 sed -i 's/localhost/<hostname>/' /etc/hadoop/conf/mapred-site.xml

 # Format the NameNode
 sudo -u hdfs hdfs namenode -format

 # Start HDFS
 for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done

 # do all of the following as the HDFS user
 su - hdfs

 # Create the HDFS /tmp directory before someone else does it and gets the perms wrong
 hadoop fs -mkdir /tmp
 hadoop fs -chmod -R 1777 /tmp

 # Create and permission the MapReduce system directories
 hadoop fs -mkdir /var
 hadoop fs -mkdir /var/lib
 hadoop fs -mkdir /var/lib/hadoop-hdfs
 hadoop fs -mkdir /var/lib/hadoop-hdfs/cache
 hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred
 hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred/mapred
 hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
 hadoop fs -chmod 1777 /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
 hadoop fs -chown -R mapred /var/lib/hadoop-hdfs/cache/mapred

 # Verify the HDFS File Structure
 hadoop fs -ls -R /

 # Should look as follows:
 # drwxrwxrwt   - hdfs     supergroup          0 2012-04-19 15:14 /tmp
 # drwxr-xr-x   - hdfs     supergroup          0 2012-04-19 15:16 /var
 # drwxr-xr-x   - hdfs     supergroup          0 2012-04-19 15:16 /var/lib
 # drwxr-xr-x   - hdfs     supergroup          0 2012-04-19 15:16 /var/lib/hadoop-hdfs
 # drwxr-xr-x   - hdfs     supergroup          0 2012-04-19 15:16 /var/lib/hadoop-hdfs/cache
 # drwxr-xr-x   - mapred   supergroup          0 2012-04-19 15:19 /var/lib/hadoop-hdfs/cache/mapred
 # drwxr-xr-x   - mapred   supergroup          0 2012-04-19 15:29 /var/lib/hadoop-hdfs/cache/mapred/mapred
 # drwxrwxrwt   - mapred   supergroup          0 2012-04-19 15:33 /var/lib/hadoop-hdfs/cache/mapred/mapred/staging

 # change back to root
 exit

 # Start MapReduce
 for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done

 # Check everything worked, run '$JAVA_HOME/bin/jps' and look for the following processes
 DataNode
 NameNode
 SecondaryNameNode
 JobTracker
 TaskTracker

 #
 # Zookeeper
 #

 # Install zookeeper server package
 # Base package was already installed by hadoop-0.20-conf-pseudo
 yum install zookeeper-server

 # set JAVA_HOME in /usr/lib/zookeeper/bin/zkEnv.sh if you can't use the system-wide Java runtime

 # Initialise Zookeeper
 service zookeeper-server init --myid=1

 # Start zookeeper
 service zookeeper-server start

 # Check everything worked, run '$JAVA_HOME/bin/jps' and look for the following process
 QuorumPeerMain

 #
 # HBase
 #

 # set dfs.datanode.max.xcievers in /etc/hadoop/conf/hdfs-site.xml (yes, it's actually misspelled)
 # Insert the following XML property between the <configuration> and </configuration> tags
 <property>
  <name>dfs.datanode.max.xcievers</name>
  <value>4096</value>
 </property>

 # restart hdfs
 for service in /etc/init.d/hadoop-hdfs-* ; do $service restart; done

 # Install HBase
 yum install hbase-master hbase-regionserver

 # Modify /etc/hbase/conf/hbase-site.xml
 # Be sure to change <hostname> to your actual hostname
 # Insert the following XML properties between the <configuration> and </configuration> tags
 <property>
  <name>hbase.cluster.distributed</name>
  <value>true</value>
 </property>
 <property>
  <name>hbase.rootdir</name>
  <value>hdfs://<hostname>:8020/hbase</value>
 </property>

 # Create the /hbase directory in HDFS
 sudo -u hdfs hadoop fs -mkdir /hbase
 sudo -u hdfs hadoop fs -chown hbase /hbase

 # set JAVA_HOME in /etc/hbase/conf/hbase-env.sh if you can't use the system-wide Java runtime

 # Start HBase master
 service hbase-master start

 # start region server
 service hbase-regionserver start

 # Check everything worked, run '$JAVA_HOME/bin/jps' and look for the following processes
 HRegionServer
 HMaster

 #
 # we're done, running '$JAVA_HOME/bin/jps | sort' should show all of the following processes (order doesn't matter)
 #
 JobTracker
 TaskTracker
 QuorumPeerMain
 DataNode
 NameNode
 SecondaryNameNode
 HMaster
 HRegionServer

 #
 # Additional notes
 #

 #
 # - User setup
 # Create Linux users and corresponding HDFS home directories as needed
 useradd -m -U <user>
 sudo -u hdfs hadoop fs -mkdir  /user/<user>
 sudo -u hdfs hadoop fs -chown <user> /user/<user>

 # - Shutting down / Starting up
 # Order matters! To shut down, do the following:
 for service in /etc/init.d/hbase-*; do $service stop; done
 service zookeeper-server stop
 for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service stop; done
 for service in /etc/init.d/hadoop-hdfs-* ; do $service stop; done
 #
 # Then to start back up:
 #
 for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done
 for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done
 service zookeeper-server start
 for service in /etc/init.d/hbase-*; do $service start; done

 # - Disk Space
 # Depending on your usage and your Linux environment, you may run out of disk space quickly.
 # To fix this, moving /var/lib/hadoop-hdfs onto a dedicated file system is a good start.
 # Doing the same for /var/log/hadoop-hdfs isn't a bad idea either.
 # Make sure you retain (or duplicate) the exact same file system permissions in the new location if you do this.

 # - Hadoop service accounts
 # If you want to be picky with the GIDs/UIDs of the Hadoop service accounts then do the following before running the Cloudera RPMs... otherwise just let the RPMs do it for you.
 # Common Hadoop group
 groupadd --gid 6666 hadoop
 # Map Reduce
 groupadd --gid 6667 mapred
 useradd --no-create-home --home-dir /usr/lib/hadoop-0.20-mapreduce --shell /bin/bash --uid 6667 --gid mapred --groups hadoop --comment "Hadoop MapReduce" mapred
 # HDFS
 groupadd --gid 6668 hdfs
 useradd --no-create-home --home-dir /usr/lib/hadoop-hdfs --shell /bin/bash --uid 6668 --gid hdfs --groups hadoop --comment "Hadoop HDFS" hdfs
 # Zookeeper
 groupadd --gid 6669 zookeeper
 useradd --no-create-home --home-dir /var/run/zookeeper --shell /sbin/nologin --uid 6669 --gid zookeeper --comment "Zookeeper" zookeeper
 # HBase
 groupadd --gid 6670 hbase
 useradd --no-create-home --home-dir /var/run/hbase --shell /sbin/nologin --uid 6670 --gid hbase --comment "HBase" hbase
	# Assumes you have Sun JDK installed already and JAVA_HOME set to that for root
	# This is all basically a summary of various parts of https://ccp.cloudera.com/display/CDH4DOC/CDH4+Documentation

	# Add Cloudera RPM-GPG-KEY and repo
	rpm --import http://archive.cloudera.com/cdh4/redhat/6/x86_64/cdh/RPM-GPG-KEY-cloudera
	rpm -ivh http://archive.cloudera.com/cdh4/one-click-install/redhat/6/x86_64/cloudera-cdh-4-0.x86_64.rpm

	# Install CDH4 Base
	yum install hadoop-0.20-conf-pseudo

	# set JAVA_HOME in /etc/hadoop/conf/hadoop-env.sh if you can't use the system-wide Java runtime

	# Update fs.default.name value to actual <hostname> in /etc/hadoop/conf/core-site.xml
	sed -i 's/localhost/<hostname>/' /etc/hadoop/conf/core-site.xml

	# Update mapred.job.tracker value to actual <hostname> in /etc/hadoop/conf/mapred-site.xml
	sed -i 's/localhost/<hostname>/' /etc/hadoop/conf/mapred-site.xml

	# Format the NameNode
	sudo -u hdfs hdfs namenode -format

	# Start HDFS
	for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done

	# do all of the following as the HDFS user
	su - hdfs

	# Create the HDFS /tmp directory before someone else does it and gets the perms wrong
	hadoop fs -mkdir /tmp
	hadoop fs -chmod -R 1777 /tmp

	# Create and permission the MapReduce system directories
	hadoop fs -mkdir /var
	hadoop fs -mkdir /var/lib
	hadoop fs -mkdir /var/lib/hadoop-hdfs
	hadoop fs -mkdir /var/lib/hadoop-hdfs/cache
	hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred
	hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred/mapred
	hadoop fs -mkdir /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
	hadoop fs -chmod 1777 /var/lib/hadoop-hdfs/cache/mapred/mapred/staging
	hadoop fs -chown -R mapred /var/lib/hadoop-hdfs/cache/mapred

	# Verify the HDFS File Structure
	hadoop fs -ls -R /

	# Should look as follows:
	# drwxrwxrwt - hdfs supergroup 0 2012-04-19 15:14 /tmp
	# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var
	# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var/lib
	# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var/lib/hadoop-hdfs
	# drwxr-xr-x - hdfs supergroup 0 2012-04-19 15:16 /var/lib/hadoop-hdfs/cache
	# drwxr-xr-x - mapred supergroup 0 2012-04-19 15:19 /var/lib/hadoop-hdfs/cache/mapred
	# drwxr-xr-x - mapred supergroup 0 2012-04-19 15:29 /var/lib/hadoop-hdfs/cache/mapred/mapred
	# drwxrwxrwt - mapred supergroup 0 2012-04-19 15:33 /var/lib/hadoop-hdfs/cache/mapred/mapred/staging

	# change back to root
	exit

	# Start MapReduce
	for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done

	# Check everything worked, run '$JAVA_HOME/bin/jps' and look for the following processes
	DataNode
	NameNode
	SecondaryNameNode
	JobTracker
	TaskTracker

	#
	# Zookeeper
	#

	# Install zookeeper server package
	# Base package was already installed by hadoop-0.20-conf-pseudo
	yum install zookeeper-server

	# set JAVA_HOME in /usr/lib/zookeeper/bin/zkEnv.sh if you can't use the system-wide Java runtime

	# Initialise Zookeeper
	service zookeeper-server init --myid=1

	# Start zookeeper
	service zookeeper-server start

	# Check everything worked, run '$JAVA_HOME/bin/jps' and look for the following process
	QuorumPeerMain

	#
	# HBase
	#

	# set dfs.datanode.max.xcievers in /etc/hadoop/conf/hdfs-site.xml (yes, it's actually misspelled)
	# Insert the following XML property between the <configuration> and </configuration> tags
	<property>
	<name>dfs.datanode.max.xcievers</name>
	<value>4096</value>
	</property>

	# restart hdfs
	for service in /etc/init.d/hadoop-hdfs-* ; do $service restart; done

	# Install HBase
	yum install hbase-master hbase-regionserver

	# Modify /etc/hbase/conf/hbase-site.xml
	# Be sure to change <hostname> to your actual hostname
	# Insert the following XML properties between the <configuration> and </configuration> tags
	<property>
	<name>hbase.cluster.distributed</name>
	<value>true</value>
	</property>
	<property>
	<name>hbase.rootdir</name>
	<value>hdfs://<hostname>:8020/hbase</value>
	</property>

	# Create the /hbase directory in HDFS
	sudo -u hdfs hadoop fs -mkdir /hbase
	sudo -u hdfs hadoop fs -chown hbase /hbase

	# set JAVA_HOME in /etc/hbase/conf/hbase-env.sh if you can't use the system-wide Java runtime

	# Start HBase master
	service hbase-master start

	# start region server
	service hbase-regionserver start

	# Check everything worked, run '$JAVA_HOME/bin/jps' and look for the following processes
	HRegionServer
	HMaster

	#
	# we're done, running '$JAVA_HOME/bin/jps | sort' should show all of the following processes (order doesn't matter)
	#
	JobTracker
	TaskTracker
	QuorumPeerMain
	DataNode
	NameNode
	SecondaryNameNode
	HMaster
	HRegionServer

	#
	# Additional notes
	#

	#
	# - User setup
	# Create Linux users and corresponding HDFS home directories as needed
	useradd -m -U <user>
	sudo -u hdfs hadoop fs -mkdir /user/<user>
	sudo -u hdfs hadoop fs -chown <user> /user/<user>

	# - Shutting down / Starting up
	# Order matters! To shut down, do the following:
	for service in /etc/init.d/hbase-*; do $service stop; done
	service zookeeper-server stop
	for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service stop; done
	for service in /etc/init.d/hadoop-hdfs-* ; do $service stop; done
	#
	# Then to start back up:
	#
	for service in /etc/init.d/hadoop-hdfs-* ; do $service start; done
	for service in /etc/init.d/hadoop-0.20-mapreduce-*; do $service start; done
	service zookeeper-server start
	for service in /etc/init.d/hbase-*; do $service start; done

	# - Disk Space
	# Depending on your usage and your Linux environment, you may run out of disk space quickly.
	# To fix this, moving /var/lib/hadoop-hdfs onto a dedicated file system is a good start.
	# Doing the same for /var/log/hadoop-hdfs isn't a bad idea either.
	# Make sure you retain (or duplicate) the exact same file system permissions in the new location if you do this.

	# - Hadoop service accounts
	# If you want to be picky with the GIDs/UIDs of the Hadoop service accounts then do the following before running the Cloudera RPMs... otherwise just let the RPMs do it for you.
	# Common Hadoop group
	groupadd --gid 6666 hadoop
	# Map Reduce
	groupadd --gid 6667 mapred
	useradd --no-create-home --home-dir /usr/lib/hadoop-0.20-mapreduce --shell /bin/bash --uid 6667 --gid mapred --groups hadoop --comment "Hadoop MapReduce" mapred
	# HDFS
	groupadd --gid 6668 hdfs
	useradd --no-create-home --home-dir /usr/lib/hadoop-hdfs --shell /bin/bash --uid 6668 --gid hdfs --groups hadoop --comment "Hadoop HDFS" hdfs
	# Zookeeper
	groupadd --gid 6669 zookeeper
	useradd --no-create-home --home-dir /var/run/zookeeper --shell /sbin/nologin --uid 6669 --gid zookeeper --comment "Zookeeper" zookeeper
	# HBase
	groupadd --gid 6670 hbase
	useradd --no-create-home --home-dir /var/run/hbase --shell /sbin/nologin --uid 6670 --gid hbase --comment "HBase" hbase