#Hadoop 2.6.0 Multinode cluster Setup
From Blog http://www.michael-noll.com/tutorials/running-hadoop-on-ubuntu-linux-multi-node-cluster/
###Machine 1 (master)
Prerequisite:
Java version
java -version
java version "1.7.0_72"
Java(TM) SE Runtime Environment (build 1.7.0_72-b14)
Java HotSpot(TM) 64-Bit Server VM (build 24.72-b04, mixed mode)
$JAVA_HOME=/usr/lib/jvm/java-7-oracle/jre
Hadoop stable version: 2.6.0
Setup steps (for a single node cluster)
mkdir -p /usr/local/hd
chmod 755 /usr/local/hd
Unzip the Hadoop 2.6.0 tarball into /usr/local/hd
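For reference, one way to unpack it (assuming the hadoop-2.6.0.tar.gz tarball was downloaded to the current directory; adjust the filename to your download):
$ tar -xzf hadoop-2.6.0.tar.gz -C /usr/local/hd --strip-components=1   # contents land directly in /usr/local/hd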
Add the following to .bashrc:
export HD_HOME=/usr/local/hd
export JAVA_HOME=/usr/lib/jvm/java-7-oracle/jre
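Optionally (not part of the original steps) also put the Hadoop bin and sbin directories on the PATH and reload the shell, so hadoop and the start-*.sh scripts resolve without full paths:
export PATH=$PATH:$HD_HOME/bin:$HD_HOME/sbin
$ source ~/.bashrc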
Create the directory used for hadoop.tmp.dir (the base directory for HDFS data):
mkdir -p /app/hd/tmp
chmod 755 /app/hd/tmp
Set up passwordless SSH to localhost:
-generate the public/private key pair
$ ssh-keygen -t dsa -P '' -f ~/.ssh/id_dsa
-authorize the key by adding it to the list of authorized keys
$ cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
-test that you can log in with no password
$ ssh localhost
Open hadoop-env.sh from $HD_HOME/etc/hadoop and update JAVA_HOME:
export JAVA_HOME=/usr/lib/jvm/java-7-oracle/jre
#### Open core-site.xml, mapred-site.xml, hdfs-site.xml
####-core-site.xml #####
Add both properties inside the <configuration> tags:
<property>
  <name>hadoop.tmp.dir</name>
  <value>/app/hd/tmp</value> <!-- folder created in step 5 -->
  <description>A base for other temporary directories.
  </description>
</property>
<property>
  <name>fs.default.name</name>
  <value>hdfs://localhost:54310</value>
  <description>
  The name of the default file system. A URI whose scheme and authority determine the FileSystem implementation. The uri's scheme determines the config property (fs.SCHEME.impl) naming the FileSystem implementation class. The uri's authority is used to determine the host, port, etc. for a filesystem.
  </description>
</property>
####-mapred-site.xml####
######-On Hadoop MR1 (Hadoop 1.x)
Add this property inside the <configuration> tags:
<property>
  <name>mapred.job.tracker</name>
  <value>localhost:54311</value>
  <description>The host and port that the MapReduce job tracker runs at. If "local", then jobs are run in-process as a single map and reduce task.
  </description>
</property>
######-On Hadoop 2.x MR2 (YARN)
Add the property below in mapred-site.xml:
<property>
  <name>mapreduce.framework.name</name>
  <value>yarn</value>
</property>
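Note: the Hadoop 2.6.0 binary distribution ships only a mapred-site.xml.template; if mapred-site.xml does not yet exist under etc/hadoop, copy the template first:
$ cp $HD_HOME/etc/hadoop/mapred-site.xml.template $HD_HOME/etc/hadoop/mapred-site.xml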
and add the properties below in yarn-site.xml:
<property>
  <name>yarn.nodemanager.aux-services</name>
  <value>mapreduce_shuffle</value>
</property>
<property>
  <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
  <value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
####-hdfs-site.xml#########
The default block size is 128MB; files pushed to the datanodes are split into 128MB blocks and replicated.
<property>
  <name>dfs.replication</name>
  <value>1</value> <!-- default is 3; since this is a single node we use 1 -->
  <description>Default block replication. The actual number of replications can be specified when the file is created. The default is used if replication is not specified in create time.
  </description>
</property>
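Once HDFS is running, hdfs fsck can be used to confirm how a file was actually split into blocks and replicated (the path below is only an example, taken from the WordCount step later on):
$ hdfs fsck /input/largetextfilewithwords.txt -files -blocks -locations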
9. Format the namenode
# hadoop namenode -format
10. Start the daemons
# start-all.sh
11. Verify with jps
# jps
2287 TaskTracker
2149 JobTracker
1938 DataNode
2085 SecondaryNameNode
2349 Jps
1788 NameNode
(The TaskTracker/JobTracker entries above are from a Hadoop 1.x run; on Hadoop 2.x with YARN you should see ResourceManager and NodeManager instead.)
12. Stop the daemons
# stop-all.sh
stopping jobtracker
localhost: stopping tasktracker
stopping namenode
localhost: stopping datanode
localhost: stopping secondarynamenode
hduser@ubuntu:/usr/local/hadoop$
13. Run helloworld (WordCount).
# hadoop fs -mkdir /input
hadoop fs -put largetextfilewithwords.txt /input
hadoop jar $HD_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar wordcount /input/largetextfilewithwords.txt /input/testoutput
On success the testoutput folder will contain:
_SUCCESS
part-r-00000
part-r-00000 holds the word counts.
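To inspect the word counts directly from HDFS (output path taken from the command above):
hadoop fs -cat /input/testoutput/part-r-00000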
###Machine 2 (slave)
1. Repeat steps 1 to 13 from machine 1.
2. On machine 1 edit /etc/hosts and map the IPs to the names master and slave (not localhost).
vi /etc/hosts
#127.0.0.1 localhost
#127.0.1.1 ubupc1
192.13.171.58 master
192.113.171.43 slave
# The following lines are desirable for IPv6 capable hosts
::1 ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
![alt tag](http://www.michael-noll.com/blog/uploads/Hadoop-multi-node-cluster-overview.png)
Do the same on machine 2 (slave):
vi /etc/hosts
#127.0.0.1 localhost
#127.0.1.1 bsdpc1
192.13.171.58 master
192.113.171.43 slave
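As an optional sanity check, confirm that the new hostnames resolve from each machine:
$ ping -c 2 master
$ ping -c 2 slave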
3. Set up passwordless SSH from machine 1 to machine 2 and vice versa (using the id_dsa key generated earlier):
On the master machine run
$ ssh-copy-id -i ~/.ssh/id_dsa.pub slave
On the slave
$ ssh-copy-id -i ~/.ssh/id_dsa.pub master
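Verify that passwordless login now works in both directions, e.g. from the master:
$ ssh slave
and from the slave:
$ ssh master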
4. On the master and slave machines, add the master and slave hostnames (and the hostnames of any additional slave nodes) to the slaves file:
vi $HD_HOME/etc/hadoop/slaves
master
slave
#####5. Open conf/*-site.xml (all machines)
####-core-site.xml
<property>
  <name>fs.default.name</name>
  <value>hdfs://master:54310</value>
  <description>The name of the default file system. A URI whose scheme and authority determine the FileSystem implementation. The uri's scheme determines the config property (fs.SCHEME.impl) naming the FileSystem implementation class. The uri's authority is used to determine the host, port, etc. for a filesystem.</description>
</property>
####-mapred-site.xml (for Hadoop 1.x)
<property>
  <name>mapred.job.tracker</name>
  <value>master:54311</value>
  <description>The host and port that the MapReduce job tracker runs at. If "local", then jobs are run in-process as a single map and reduce task.
  </description>
</property>
####-mapred-site.xml (for Hadoop 2.x)
<property>
  <name>mapreduce.framework.name</name>
  <value>yarn</value>
</property>
and update yarn-site.xml with the same shuffle properties as in the single-node setup:
####-yarn-site.xml
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
</configuration>
####-hdfs-site.xml
<property>
  <name>dfs.replication</name>
  <value>2</value>
  <description>Default block replication. The actual number of replications can be specified when the file is created. The default is used if replication is not specified in create time.
  </description>
</property>
6. Delete any files in /app/hd/tmp/ on all machines
# rm -r /app/hd/tmp/*
7. On the master run
hadoop namenode -format
8. On the master
start-dfs.sh  // should start the JVM processes below
Confirm using jps:
jps
799 NameNode
15314 Jps
14880 DataNode
14977 SecondaryNameNode
or
start-mapred.sh  // on Hadoop 1.x
9. On the master
start-yarn.sh  // Hadoop 2.x
This starts the ResourceManager on the master and one NodeManager on each node listed in the slaves file:
1. ResourceManager
2. NodeManager
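As a final check (not part of the original steps), jps on the slave should show DataNode and NodeManager alongside the master daemons above, and the web UIs should be reachable on the Hadoop 2.x default ports:
NameNode UI:        http://master:50070
ResourceManager UI: http://master:8088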