airawat · December 22, 2015 22:18 · ghost · Jul 31, 2014 · subhahere · Jul 27, 2015
diff --git a/00-CreatingMapFile b/00-CreatingMapFile
 This gist demonstrates how to create a Hadoop map file (MapFile) from a tab-delimited text file.

 Includes:
 ---------
 1. Input data and script download
 2. Input data review
 3. Data load commands                                
 4. Java program to create the map file out of a text file in HDFS
 5. Command to run Java program
 6. Results of the program run to create map file 
 7. Java program to lookup data in map file
 8. Command to run program to do a lookup



diff --git a/01-DataAndScriptDownload b/01-DataAndScriptDownload
 01. Data and script download
 -----------------------------
 Google: 
 <<To be added>>
 
 Email me at [email protected] if you encounter any issues
 
 gitHub:
 <<To be added>>
 
 
 Directory structure
 -------------------
 formatProject
    data
        departments_sorted
          part-m-00000


    formatConverterTextToMap
        src
            FormatConverterTextToMap.java  
            MapFileLookup.java
      
        jars
            formatConverterTextToMap.jar




diff --git a/02-InputData b/02-InputData
 **************************************************
 Input text file - departments_sorted/part-m-00000
 **************************************************

 $ more formatProject/data/departments_sorted/part-m-00000 
 d001	Marketing
 d002	Finance
 d003	Human Resources
 d004	Production
 d005	Development
 d006	Quality Management
 d007	Sales
 d008	Research
 d009	Customer Service


diff --git a/03-HdfsLoadCommands b/03-HdfsLoadCommands
 **********************************************
 hdfs load commands
 **********************************************

 # Load data
 $ hadoop fs -put formatProject/

 # Remove unnecessary files 
 $ hadoop fs -rm -R formatProject/formatConverterTextToMap/
diff --git a/04-MapFileCreator b/04-MapFileCreator
 /******************************************
 * FormatConverterTextToMap.java
 * ****************************************/

 import java.io.IOException;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.MapFile;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.fs.FSDataInputStream;

 public class FormatConverterTextToMap {

 	/**
 	 * Converts a tab-delimited text file into a Hadoop MapFile.
 	 *
 	 * <p>Each non-blank input line must have the form {@code <key><TAB><value>};
 	 * the first field becomes the key and the second field the value, both
 	 * stored as {@link Text}. Blank lines are skipped. The input must already
 	 * be sorted by key, because {@code MapFile.Writer.append} rejects keys that
 	 * arrive out of order.
 	 *
 	 * @param args {@code args[0]} is the path of the input text file;
 	 *             {@code args[1]} is the path of the MapFile directory to create
 	 * @throws IOException if the input cannot be read, a line has no tab
 	 *             separated value, or the map file cannot be written
 	 * @throws IllegalArgumentException if fewer than two arguments are supplied
 	 */
 	@SuppressWarnings("deprecation") // the MapFile.Writer constructor used below is deprecated
 	public static void main(String[] args) throws IOException {

 		if (args.length < 2) {
 			throw new IllegalArgumentException(
 					"Usage: FormatConverterTextToMap <input-text-file> <output-map-file-directory>");
 		}

 		Configuration conf = new Configuration();
 		FileSystem fs = FileSystem.get(conf);

 		Path inputFile = new Path(args[0]);
 		Path outputFile = new Path(args[1]);

 		Text txtKey = new Text();
 		Text txtValue = new Text();

 		FSDataInputStream inputStream = null;
 		MapFile.Writer writer = null;

 		try {
 			// Open the input first so a missing input file does not leave an
 			// empty map file directory behind.
 			inputStream = fs.open(inputFile);

 			// Decode explicitly as UTF-8 (the encoding Text uses). The deprecated
 			// DataInputStream.readLine() maps each byte to one char and would
 			// corrupt any non-ASCII data.
 			java.io.BufferedReader lineReader = new java.io.BufferedReader(
 					new java.io.InputStreamReader(inputStream, "UTF-8"));

 			writer = new MapFile.Writer(conf, fs, outputFile.toString(),
 					Text.class, Text.class);
 			// Index every key: the default interval is 128, which would index
 			// only the first record of a small data set.
 			writer.setIndexInterval(1);

 			String line;
 			int lineNumber = 0;
 			// readLine() returning null is the end-of-file signal; available()
 			// only reports bytes readable without blocking.
 			while ((line = lineReader.readLine()) != null) {
 				lineNumber++;
 				if (line.trim().length() == 0) {
 					continue;
 				}
 				String[] fields = line.split("\\t");
 				if (fields.length < 2) {
 					throw new IOException("Malformed record at line " + lineNumber
 							+ " of " + inputFile + ": expected <key><TAB><value>");
 				}
 				txtKey.set(fields[0]);
 				txtValue.set(fields[1]);
 				writer.append(txtKey, txtValue);
 			}

 			// Close explicitly on the success path so a failure while flushing
 			// is reported instead of being swallowed by the cleanup below.
 			writer.close();
 			writer = null;
 		} finally {
 			// Quiet cleanup for the failure path; both calls tolerate null.
 			IOUtils.closeStream(writer);
 			IOUtils.closeStream(inputStream);
 		}

 		// Reached only when every record was written and the writer closed cleanly.
 		System.out.println("Map file created successfully!!");
 	}
 }
diff --git a/05-RunProgram b/05-RunProgram
 ******************************************************************
 **Command to run program that creates a map file from text file 
 ******************************************************************

 $ hadoop jar formatProject/formatConverterTextToMap/jars/formatConverterTextToMap.jar FormatConverterTextToMap formatProject/data/departments_sorted/part-m-00000 formatProject/data/departments_map

 13/09/12 22:05:21 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
 13/09/12 22:05:21 INFO compress.CodecPool: Got brand-new compressor [.deflate]
 13/09/12 22:05:21 INFO compress.CodecPool: Got brand-new compressor [.deflate]
 Map file created successfully!!


diff --git a/06-Results b/06-Results
 ************************************************
 **Results 
 ************************************************

 $ hadoop fs -ls formatProject/data/departments_map | awk '{print $8}'

 formatProject/data/departments_map/data
 formatProject/data/departments_map/index

 $ hadoop fs -text formatProject/data/departments_map/data
 13/09/12 22:44:34 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
 13/09/12 22:44:34 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
 d001	Marketing
 d002	Finance
 d003	Human Resources
 d004	Production
 d005	Development
 d006	Quality Management
 d007	Sales
 d008	Research
 d009	Customer Service


 $ hadoop fs -text formatProject/data/departments_map/index
 13/09/12 22:44:56 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
 13/09/12 22:44:56 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
 13/09/12 22:44:56 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
 13/09/12 22:44:56 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
 13/09/12 22:44:56 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
 d001	121
 d002	152
 d003	181
 d004	218
 d005	250
 d006	283
 d007	323
 d008	350
 d009	380
diff --git a/07-ReadMapFile b/07-ReadMapFile
 /****************************************
 * MapFileLookup.java 
 * **************************************/

 import java.io.IOException;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.io.MapFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.conf.Configuration;

 public class MapFileLookup {

 	/**
 	 * Looks up a key in a map file and prints and returns the associated value.
 	 *
 	 * <p>Sample call:
 	 * {@code hadoop jar MapFileLookup.jar MapFileLookup <map-file-directory> <key>}
 	 *
 	 * <p>NOTE(review): the non-void return type is kept for existing callers.
 	 * The standard {@code java} launcher requires {@code main} to return
 	 * {@code void}, so this entry point presumably relies on {@code hadoop jar}
 	 * invoking it reflectively; confirm before launching it any other way.
 	 *
 	 * @param args {@code args[0]} is the path to the map file directory;
 	 *             {@code args[1]} is the key whose value is wanted
 	 * @return the value stored for the key, or an empty {@link Text} when the
 	 *         key is not present in the map file
 	 * @throws IOException if the map file cannot be opened or read
 	 * @throws IllegalArgumentException if fewer than two arguments are supplied
 	 */
 	@SuppressWarnings("deprecation") // the MapFile.Reader constructor used below is deprecated
 	public static Text main(String[] args) throws IOException {

 		if (args.length < 2) {
 			throw new IllegalArgumentException(
 					"Usage: MapFileLookup <map-file-directory> <key>");
 		}

 		Configuration conf = new Configuration();
 		FileSystem fs = FileSystem.get(conf);
 		Text txtKey = new Text(args[1]);
 		Text txtValue = new Text();
 		MapFile.Reader reader = null;
 		boolean found;

 		try {
 			reader = new MapFile.Reader(fs, args[0], conf);
 			// get() returns null when the key is absent and leaves txtValue
 			// untouched, so the return value is the only way to tell a missing
 			// key from a key whose value is empty.
 			found = reader.get(txtKey, txtValue) != null;
 		} finally {
 			if (reader != null) {
 				reader.close();
 			}
 		}

 		if (found) {
 			System.out.println("The key is " + txtKey.toString()
 					+ " and the value is " + txtValue.toString());
 		} else {
 			System.out.println("The key " + txtKey.toString()
 					+ " was not found in map file " + args[0]);
 		}
 		return txtValue;
 	}
 }
diff --git a/08-RunReadMapFile b/08-RunReadMapFile
 **************************************************************************
 **Command to run the program that looks up a key in the map file
 **************************************************************************

 $ hadoop jar formatProject/formatConverterTextToMap/jars/MapFileLookup.jar MapFileLookup formatProject/data/departments_map "d009"
 13/09/12 22:53:08 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
 13/09/12 22:53:08 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
 13/09/12 22:53:08 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
 13/09/12 22:53:08 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
 13/09/12 22:53:08 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
 13/09/12 22:53:08 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
 The key is d009 and the value is Customer Service
	This gist demonstrates how to create a Hadoop map file (MapFile) from a tab-delimited text file.

	Includes:
	---------
	1. Input data and script download
	2. Input data review
	3. Data load commands
	4. Java program to create the map file out of a text file in HDFS
	5. Command to run Java program
	6. Results of the program run to create map file
	7. Java program to lookup data in map file
	8. Command to run program to do a lookup
	01. Data and script download
	-----------------------------
	Google:
	<<To be added>>

	Email me at [email protected] if you encounter any issues

	gitHub:
	<<To be added>>


	Directory structure
	-------------------
	formatProject
	data
	departments_sorted
	part-m-00000


	formatConverterTextToMap
	src
	FormatConverterTextToMap.java
	MapFileLookup.java

	jars
	formatConverterTextToMap.jar
	**************************************************
	Input text file - departments_sorted/part-m-00000
	**************************************************

	$ more formatProject/data/departments_sorted/part-m-00000
	d001	Marketing
	d002	Finance
	d003	Human Resources
	d004	Production
	d005	Development
	d006	Quality Management
	d007	Sales
	d008	Research
	d009	Customer Service
	**********************************************
	hdfs load commands
	**********************************************

	# Load data
	$ hadoop fs -put formatProject/

	# Remove unnecessary files
	$ hadoop fs -rm -R formatProject/formatConverterTextToMap/
	/******************************************
	* FormatConverterTextToMap.java
	* ****************************************/

	import java.io.IOException;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.io.MapFile;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.FileSystem;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.io.IOUtils;
	import org.apache.hadoop.fs.FSDataInputStream;

	public class FormatConverterTextToMap {

		/**
		 * Converts a tab-delimited text file into a Hadoop MapFile.
		 *
		 * <p>Each non-blank input line must have the form {@code <key><TAB><value>};
		 * the first field becomes the key and the second field the value, both
		 * stored as {@link Text}. Blank lines are skipped. The input must already
		 * be sorted by key, because {@code MapFile.Writer.append} rejects keys that
		 * arrive out of order.
		 *
		 * @param args {@code args[0]} is the path of the input text file;
		 *             {@code args[1]} is the path of the MapFile directory to create
		 * @throws IOException if the input cannot be read, a line has no tab
		 *             separated value, or the map file cannot be written
		 * @throws IllegalArgumentException if fewer than two arguments are supplied
		 */
		@SuppressWarnings("deprecation") // the MapFile.Writer constructor used below is deprecated
		public static void main(String[] args) throws IOException {

			if (args.length < 2) {
				throw new IllegalArgumentException(
						"Usage: FormatConverterTextToMap <input-text-file> <output-map-file-directory>");
			}

			Configuration conf = new Configuration();
			FileSystem fs = FileSystem.get(conf);

			Path inputFile = new Path(args[0]);
			Path outputFile = new Path(args[1]);

			Text txtKey = new Text();
			Text txtValue = new Text();

			FSDataInputStream inputStream = null;
			MapFile.Writer writer = null;

			try {
				// Open the input first so a missing input file does not leave an
				// empty map file directory behind.
				inputStream = fs.open(inputFile);

				// Decode explicitly as UTF-8 (the encoding Text uses). The deprecated
				// DataInputStream.readLine() maps each byte to one char and would
				// corrupt any non-ASCII data.
				java.io.BufferedReader lineReader = new java.io.BufferedReader(
						new java.io.InputStreamReader(inputStream, "UTF-8"));

				writer = new MapFile.Writer(conf, fs, outputFile.toString(),
						Text.class, Text.class);
				// Index every key: the default interval is 128, which would index
				// only the first record of a small data set.
				writer.setIndexInterval(1);

				String line;
				int lineNumber = 0;
				// readLine() returning null is the end-of-file signal; available()
				// only reports bytes readable without blocking.
				while ((line = lineReader.readLine()) != null) {
					lineNumber++;
					if (line.trim().length() == 0) {
						continue;
					}
					String[] fields = line.split("\\t");
					if (fields.length < 2) {
						throw new IOException("Malformed record at line " + lineNumber
								+ " of " + inputFile + ": expected <key><TAB><value>");
					}
					txtKey.set(fields[0]);
					txtValue.set(fields[1]);
					writer.append(txtKey, txtValue);
				}

				// Close explicitly on the success path so a failure while flushing
				// is reported instead of being swallowed by the cleanup below.
				writer.close();
				writer = null;
			} finally {
				// Quiet cleanup for the failure path; both calls tolerate null.
				IOUtils.closeStream(writer);
				IOUtils.closeStream(inputStream);
			}

			// Reached only when every record was written and the writer closed cleanly.
			System.out.println("Map file created successfully!!");
		}
	}
	******************************************************************
	**Command to run program that creates a map file from text file
	******************************************************************

	$ hadoop jar formatProject/formatConverterTextToMap/jars/formatConverterTextToMap.jar FormatConverterTextToMap formatProject/data/departments_sorted/part-m-00000 formatProject/data/departments_map

	13/09/12 22:05:21 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
	13/09/12 22:05:21 INFO compress.CodecPool: Got brand-new compressor [.deflate]
	13/09/12 22:05:21 INFO compress.CodecPool: Got brand-new compressor [.deflate]
	Map file created successfully!!
	************************************************
	**Results
	************************************************

	$ hadoop fs -ls formatProject/data/departments_map | awk '{print $8}'

	formatProject/data/departments_map/data
	formatProject/data/departments_map/index

	$ hadoop fs -text formatProject/data/departments_map/data
	13/09/12 22:44:34 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
	13/09/12 22:44:34 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
	d001 Marketing
	d002 Finance
	d003 Human Resources
	d004 Production
	d005 Development
	d006 Quality Management
	d007 Sales
	d008 Research
	d009 Customer Service


	$ hadoop fs -text formatProject/data/departments_map/index
	13/09/12 22:44:56 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
	13/09/12 22:44:56 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
	13/09/12 22:44:56 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
	13/09/12 22:44:56 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
	13/09/12 22:44:56 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
	d001 121
	d002 152
	d003 181
	d004 218
	d005 250
	d006 283
	d007 323
	d008 350
	d009 380
	/****************************************
	* MapFileLookup.java
	* **************************************/

	import java.io.IOException;
	import org.apache.hadoop.fs.FileSystem;
	import org.apache.hadoop.io.MapFile;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.conf.Configuration;

	public class MapFileLookup {

		/**
		 * Looks up a key in a map file and prints and returns the associated value.
		 *
		 * <p>Sample call:
		 * {@code hadoop jar MapFileLookup.jar MapFileLookup <map-file-directory> <key>}
		 *
		 * <p>NOTE(review): the non-void return type is kept for existing callers.
		 * The standard {@code java} launcher requires {@code main} to return
		 * {@code void}, so this entry point presumably relies on {@code hadoop jar}
		 * invoking it reflectively; confirm before launching it any other way.
		 *
		 * @param args {@code args[0]} is the path to the map file directory;
		 *             {@code args[1]} is the key whose value is wanted
		 * @return the value stored for the key, or an empty {@link Text} when the
		 *         key is not present in the map file
		 * @throws IOException if the map file cannot be opened or read
		 * @throws IllegalArgumentException if fewer than two arguments are supplied
		 */
		@SuppressWarnings("deprecation") // the MapFile.Reader constructor used below is deprecated
		public static Text main(String[] args) throws IOException {

			if (args.length < 2) {
				throw new IllegalArgumentException(
						"Usage: MapFileLookup <map-file-directory> <key>");
			}

			Configuration conf = new Configuration();
			FileSystem fs = FileSystem.get(conf);
			Text txtKey = new Text(args[1]);
			Text txtValue = new Text();
			MapFile.Reader reader = null;
			boolean found;

			try {
				reader = new MapFile.Reader(fs, args[0], conf);
				// get() returns null when the key is absent and leaves txtValue
				// untouched, so the return value is the only way to tell a missing
				// key from a key whose value is empty.
				found = reader.get(txtKey, txtValue) != null;
			} finally {
				if (reader != null) {
					reader.close();
				}
			}

			if (found) {
				System.out.println("The key is " + txtKey.toString()
						+ " and the value is " + txtValue.toString());
			} else {
				System.out.println("The key " + txtKey.toString()
						+ " was not found in map file " + args[0]);
			}
			return txtValue;
		}
	}
	**************************************************************************
	**Command to run the program that looks up a key in the map file
	**************************************************************************

	$ hadoop jar formatProject/formatConverterTextToMap/jars/MapFileLookup.jar MapFileLookup formatProject/data/departments_map "d009"
	13/09/12 22:53:08 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
	13/09/12 22:53:08 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
	13/09/12 22:53:08 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
	13/09/12 22:53:08 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
	13/09/12 22:53:08 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
	13/09/12 22:53:08 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
	The key is d009 and the value is Customer Service