airawat · March 19, 2019 18:35 · egagan · Jan 9, 2015
diff --git a/00-CreatingSequenceFile b/00-CreatingSequenceFile
 This gist demonstrates how to create a sequence file (compressed and uncompressed), from a text file.
 
 Includes:
 ---------
 1. Input data and script download
 2. Input data-review
 3. Data load commands                                
 4. Mapper code
 5. Driver code to create the sequence file out of a text file in HDFS
 6. Command to run Java program
 7. Results of the program run to create sequence file 
 8. Java program to read a sequence file, and convert to text file
 9. Command to run program from #8, with results
 10. Note on creating compressed sequence files
 11. Driver code to create a compressed sequence file
 12. Command to run program in #11 with results
diff --git a/01-DataAndCodeDownload b/01-DataAndCodeDownload
 01. Data and code download
 -----------------------------
 Google: 
 <<To be added>>
 
 Email me at [email protected] if you encounter any issues
 
 gitHub:
 <<To be added>>
 
 
 Directory structure
 -------------------
 formatProject
    data
        departments_sorted
          part-m-00000
 
 
    formatConverterTextToSequence
        src
            FormatConverterMapper.java
            FormatConverterTextToSequenceDriver.java
            FormatConverterSequenceToTextDriver.java
        jars
            formatConverterTextToSequence.jar
            formatConverterSequenceToText.jar
diff --git a/02-SourceData b/02-SourceData
 **************************************************
 Input text file - departments_sorted/part-m-00000
 **************************************************
 
 $ more formatProject/data/departments_sorted/part-m-00000 
 d001  Marketing
 d002  Finance
 d003	Human Resources
 d004	Production
 d005	Development
 d006	Quality Management
 d007	Sales
 d008	Research
 d009	Customer Service
diff --git a/03-HdfsLoadCommands b/03-HdfsLoadCommands
 **********************************************
 hdfs load commands
 **********************************************
 
 # Load data
 $ hadoop fs -put formatProject/
 
 # Remove unnecessary files 
 $ hadoop fs -rm -R formatProject/formatConverterTextToSequence/
 $ hadoop fs -rm -R formatProject/formatConverterTextToMap/
diff --git a/04-TextFileMapper b/04-TextFileMapper
 /*********************************************************************************************************
 **           Mapper 
 **           formatProject/formatConverterTextToSequence/src/FormatConverterMapper.java
 **           Reads text file and emits the contents out as key-value pairs
 *********************************************************************************************************/

 import java.io.IOException;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;


 /**
  * Pass-through mapper: every record it receives is emitted unchanged.
  *
  * <p>Input and output pairs are both typed {@code <LongWritable, Text>}, so the
  * same class is reused by the drivers that convert in either direction.
  */
 public class FormatConverterMapper
     extends Mapper<LongWritable, Text, LongWritable, Text> {

  /**
   * Writes the incoming pair straight to the job context.
   *
   * @param recordKey   key of the current input record
   * @param recordValue value of the current input record
   * @param context     task context that receives the output pair
   * @throws IOException          if the context fails to write the pair
   * @throws InterruptedException if the write is interrupted
   */
  @Override
  public void map(LongWritable recordKey, Text recordValue, Context context)
      throws IOException, InterruptedException {
    // No transformation: the pair goes out exactly as it came in.
    context.write(recordKey, recordValue);
  }
 }
diff --git a/05-SequenceFileCreator b/05-SequenceFileCreator
 /*********************************************************************************************************
 **           Driver 
 **           formatProject/formatConverterTextToSequence/src/FormatConverterTextToSequenceDriver.java
 *********************************************************************************************************/

 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;

 /**
  * Driver for a map-only job that reads the given input path and writes its
  * records through {@link SequenceFileOutputFormat}.
  *
  * <p>Expected arguments: {@code <input dir> <output dir>}.
  */
 public class FormatConverterTextToSequenceDriver extends Configured implements Tool {

  /**
   * Configures the conversion job, submits it and blocks until it finishes.
   *
   * @param args exactly two entries: the input path and the output path
   * @return {@code -1} when the argument count is wrong, {@code 0} when the
   *         job succeeds, {@code 1} when it fails
   * @throws Exception propagated from job setup or execution
   */
  @Override
  public int run(String[] args) throws Exception {

    // Guard clause: both paths are mandatory.
    boolean argumentCountOk = (args.length == 2);
    if (!argumentCountOk) {
      System.out.printf("Two parameters are required for FormatConverterTextToSequenceDriver-<input dir> <output dir>\n");
      return -1;
    }

    Job conversionJob = new Job(getConf());
    conversionJob.setJobName("Create Sequence File, from text file");
    conversionJob.setJarByClass(FormatConverterTextToSequenceDriver.class);

    // Where to read from and where to write to.
    Path inputPath = new Path(args[0]);
    FileInputFormat.setInputPaths(conversionJob, inputPath);
    Path outputPath = new Path(args[1]);
    FileOutputFormat.setOutputPath(conversionJob, outputPath);

    // Map-only: zero reducers, so mapper output goes directly to the output format.
    conversionJob.setNumReduceTasks(0);
    conversionJob.setMapperClass(FormatConverterMapper.class);
    conversionJob.setOutputFormatClass(SequenceFileOutputFormat.class);

    if (conversionJob.waitForCompletion(true)) {
      return 0;
    }
    return 1;
  }

  /**
   * Command-line entry point; hands off to {@link ToolRunner} and exits the
   * JVM with the status returned by {@link #run(String[])}.
   *
   * @param args command-line arguments passed through to the tool
   * @throws Exception propagated from {@link ToolRunner#run}
   */
  public static void main(String[] args) throws Exception {
    Configuration configuration = new Configuration();
    Tool driver = new FormatConverterTextToSequenceDriver();
    int status = ToolRunner.run(configuration, driver, args);
    System.exit(status);
  }
 }
diff --git a/06-CommandToRunProgram b/06-CommandToRunProgram
 ************************************************
 **Command to create sequence file from text file 
 ************************************************

 $ hadoop jar formatProject/formatConverterTextToSequence/jars/formatConverterTextToSequence.jar FormatConverterTextToSequenceDriver formatProject/data/departments_sorted/part-m-00000 formatProject/data/departments_sequence
 .
 .
 .
 .

 $ hadoop fs -ls -R formatProject/data/departments_sequence | awk '{print $8}'
 formatProject/data/departments_sequence/_SUCCESS
 formatProject/data/departments_sequence/_logs
 formatProject/data/departments_sequence/_logs/history
 formatProject/data/departments_sequence/_logs/history/cdh-jt01_1376335706356_job_201308121428_0116_conf.xml
 formatProject/data/departments_sequence/_logs/history/job_201308121428_0116_1379087496898_akhanolk_Create+Sequence+File%2C+from+text+file
 formatProject/data/departments_sequence/part-m-00000

diff --git a/07-Results b/07-Results
 ************************************************
 **Results 
 ************************************************

 $ hadoop fs -text formatProject/data/departments_sequence/part-m-00000
 0  d001	Marketing
 15	d002	Finance
 28	d003	Human Resources
 49	d004	Production
 65	d005	Development
 82	d006	Quality Management
 106	d007	Sales
 117	d008	Research
 131	d009	Customer Service
diff --git a/08-SequenceFileReader b/08-SequenceFileReader
 /*********************************************************************************************************
 **           Driver 
 **           formatProject/formatConverterTextToSequence/src/FormatConverterSequenceToTextDriver.java
 *********************************************************************************************************/
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.Job;

 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;

 /**
  * Driver for a map-only job that reads its input through
  * {@link SequenceFileInputFormat} and re-emits each record via
  * {@code FormatConverterMapper}.
  *
  * <p>No output format is set here, so the job's default applies.
  * Expected arguments: {@code <input dir> <output dir>}.
  */
 public class FormatConverterSequenceToTextDriver extends Configured implements Tool {

  /**
   * Configures the conversion job, submits it and blocks until it finishes.
   *
   * @param args exactly two entries: the input path and the output path
   * @return {@code -1} when the argument count is wrong, {@code 0} when the
   *         job succeeds, {@code 1} when it fails
   * @throws Exception propagated from job setup or execution
   */
  @Override
  public int run(String[] args) throws Exception {

    // Guard clause: both paths are mandatory.
    boolean argumentCountOk = (args.length == 2);
    if (!argumentCountOk) {
      System.out.printf("Two parameters need to be supplied - <input dir> and <output dir>\n");
      return -1;
    }

    Job conversionJob = new Job(getConf());
    conversionJob.setJobName("Convert Sequence File and Output as Text");
    conversionJob.setJarByClass(FormatConverterSequenceToTextDriver.class);

    // Where to read from and where to write to.
    Path inputPath = new Path(args[0]);
    FileInputFormat.setInputPaths(conversionJob, inputPath);
    Path outputPath = new Path(args[1]);
    FileOutputFormat.setOutputPath(conversionJob, outputPath);

    // Map-only: zero reducers, input decoded as a sequence file.
    conversionJob.setNumReduceTasks(0);
    conversionJob.setMapperClass(FormatConverterMapper.class);
    conversionJob.setInputFormatClass(SequenceFileInputFormat.class);

    if (conversionJob.waitForCompletion(true)) {
      return 0;
    }
    return 1;
  }

  /**
   * Command-line entry point; hands off to {@link ToolRunner} and exits the
   * JVM with the status returned by {@link #run(String[])}.
   *
   * @param args command-line arguments passed through to the tool
   * @throws Exception propagated from {@link ToolRunner#run}
   */
  public static void main(String[] args) throws Exception {
    Configuration configuration = new Configuration();
    Tool driver = new FormatConverterSequenceToTextDriver();
    int status = ToolRunner.run(configuration, driver, args);
    System.exit(status);
  }
 }
diff --git a/09-CommandToRunProgramToReadSequenceFile b/09-CommandToRunProgramToReadSequenceFile
 **************************************************************
 **Command to create text file from sequence file & results 
 **************************************************************

 $ hadoop jar formatProject/formatConverterTextToSequence/jars/formatConverterSequenceToText.jar FormatConverterSequenceToTextDriver formatProject/data/departments_sequence/part-m-00000 formatProject/data/departments_text

 $ hadoop fs -ls -R formatProject/data/departments_text | awk '{print $8}'
 formatProject/data/departments_text/_SUCCESS
 formatProject/data/departments_text/_logs
 formatProject/data/departments_text/_logs/history
 formatProject/data/departments_text/_logs/history/cdh-jt01_1376335706356_job_201308121428_0118_conf.xml
 formatProject/data/departments_text/_logs/history/job_201308121428_0118_1379089420495_akhanolk_Convert+Sequence+File+and+Output+as+Text
 formatProject/data/departments_text/part-m-00000

 $ hadoop fs -cat formatProject/data/departments_text/part-m-00000
 0  d001	Marketing
 15	d002	Finance
 28	d003	Human Resources
 49	d004	Production
 65	d005	Development
 82	d006	Quality Management
 106	d007	Sales
 117	d008	Research
 131	d009	Customer Service

diff --git a/10-Compression b/10-Compression
 **************************************************************
 ** Compression and sequence files 

 **************************************************************
 To create a compressed sequence file (block compression is the recommended option), only a few minor additions are needed in the driver [formatProject/formatConverterTextToSequence/src/FormatConverterTextToSequenceDriver.java].
 The sample code here uses SnappyCodec, and block compression.

 FileOutputFormat.setCompressOutput(job, true);
 FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
 SequenceFileOutputFormat.setOutputCompressionType(job,CompressionType.BLOCK);

 The next section includes the code.

diff --git a/11-DriverToCreateBlockCompSeqFile b/11-DriverToCreateBlockCompSeqFile
 /*************************************************************************************************************
 **           Driver 
 **           formatProject/formatConverterTextToSequence/src/FormatConverterTextToBlckCompSequenceDriver.java
 *************************************************************************************************************/

 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile.CompressionType;
 import org.apache.hadoop.io.compress.SnappyCodec;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.hadoop.mapreduce.Job;

 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;

 /**
  * Driver for a map-only job that writes its records through
  * {@link SequenceFileOutputFormat} with output compression switched on,
  * using {@link SnappyCodec} and {@link CompressionType#BLOCK}.
  *
  * <p>Expected arguments: {@code <input dir> <output dir>}.
  */
 public class FormatConverterTextToBlckCompSequenceDriver extends Configured implements Tool {

  /**
   * Configures the compressed-output conversion job, submits it and blocks
   * until it finishes.
   *
   * @param args exactly two entries: the input path and the output path
   * @return {@code -1} when the argument count is wrong, {@code 0} when the
   *         job succeeds, {@code 1} when it fails
   * @throws Exception propagated from job setup or execution
   */
  @Override
  public int run(String[] args) throws Exception {

    // Guard clause: both paths are mandatory.
    boolean argumentCountOk = (args.length == 2);
    if (!argumentCountOk) {
      System.out.printf("Two parameters are required for FormatConverterTextToBlckCompSequenceDriver-<input dir> <output dir>\n");
      return -1;
    }

    Job conversionJob = new Job(getConf());
    conversionJob.setJobName("Create block compressed Sequence File, from text file");
    conversionJob.setJarByClass(FormatConverterTextToBlckCompSequenceDriver.class);

    // Where to read from and where to write to.
    Path inputPath = new Path(args[0]);
    FileInputFormat.setInputPaths(conversionJob, inputPath);
    Path outputPath = new Path(args[1]);
    FileOutputFormat.setOutputPath(conversionJob, outputPath);

    // Map-only: zero reducers, so mapper output goes directly to the output format.
    conversionJob.setNumReduceTasks(0);
    conversionJob.setMapperClass(FormatConverterMapper.class);
    conversionJob.setOutputFormatClass(SequenceFileOutputFormat.class);

    // Compression settings: enable it, pick the codec, and compress per block.
    FileOutputFormat.setCompressOutput(conversionJob, true);
    FileOutputFormat.setOutputCompressorClass(conversionJob, SnappyCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(conversionJob, CompressionType.BLOCK);

    if (conversionJob.waitForCompletion(true)) {
      return 0;
    }
    return 1;
  }

  /**
   * Command-line entry point; hands off to {@link ToolRunner} and exits the
   * JVM with the status returned by {@link #run(String[])}.
   *
   * @param args command-line arguments passed through to the tool
   * @throws Exception propagated from {@link ToolRunner#run}
   */
  public static void main(String[] args) throws Exception {
    Configuration configuration = new Configuration();
    Tool driver = new FormatConverterTextToBlckCompSequenceDriver();
    int status = ToolRunner.run(configuration, driver, args);
    System.exit(status);
  }
 }
diff --git a/12-CommandsToRunProgramFrom11 b/12-CommandsToRunProgramFrom11
 *************************************************************************************
 **Command to create block compressed(snappy) sequence file from text file + output
 *************************************************************************************

 $ hadoop jar formatProject/formatConverterTextToSequence/jars/formatConverterTextToBlkCompSequence.jar FormatConverterTextToBlckCompSequenceDriver formatProject/data/departments_sorted/part-m-00000 formatProject/data/departments_sequence_blckcmp
 .

 $ hadoop fs -ls -R formatProject/data/departments_sequence_blckcmp | awk '{print $8}'
 formatProject/data/departments_sequence_blckcmp/_SUCCESS
 formatProject/data/departments_sequence_blckcmp/_logs
 formatProject/data/departments_sequence_blckcmp/_logs/history
 formatProject/data/departments_sequence_blckcmp/_logs/history/cdh-jt01_1376335706356_job_201308121428_0120_conf.xml
 formatProject/data/departments_sequence_blckcmp/_logs/history/job_201308121428_0120_1379091181653_akhanolk_Create+block+compressed+Sequence+File%2C+from+text+f
 formatProject/data/departments_sequence_blckcmp/part-m-00000


 $ hadoop fs -text formatProject/data/departments_sequence_blckcmp/part-m-00000
 13/09/13 11:55:38 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
 13/09/13 11:55:38 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
 13/09/13 11:55:38 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
 13/09/13 11:55:38 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
 0  d001	Marketing
 15	d002	Finance
 28	d003	Human Resources
 49	d004	Production
 65	d005	Development
 82	d006	Quality Management
 106	d007	Sales
 117	d008	Research
 131	d009	Customer Service
	This gist demonstrates how to create a sequence file (compressed and uncompressed), from a text file.

	Includes:
	---------
	1. Input data and script download
	2. Input data-review
	3. Data load commands
	4. Mapper code
	5. Driver code to create the sequence file out of a text file in HDFS
	6. Command to run Java program
	7. Results of the program run to create sequence file
	8. Java program to read a sequence file, and convert to text file
	9. Command to run program from #8, with results
	10. Note on creating compressed sequence files
	11. Driver code to create a compressed sequence file
	12. Command to run program in #11 with results
	01. Data and code download
	-----------------------------
	Google:
	<<To be added>>

	Email me at [email protected] if you encounter any issues

	gitHub:
	<<To be added>>


	Directory structure
	-------------------
	formatProject
	data
	departments_sorted
	part-m-00000


	formatConverterTextToSequence
	src
	FormatConverterMapper.java
	FormatConverterTextToSequenceDriver.java
	FormatConverterSequenceToTextDriver.java
	jars
	formatConverterTextToSequence.jar
	formatConverterSequenceToText.jar
	**************************************************
	Input text file - departments_sorted/part-m-00000
	**************************************************

	$ more formatProject/data/departments_sorted/part-m-00000
	d001 Marketing
	d002 Finance
	d003 Human Resources
	d004 Production
	d005 Development
	d006 Quality Management
	d007 Sales
	d008 Research
	d009 Customer Service
	**********************************************
	hdfs load commands
	**********************************************

	# Load data
	$ hadoop fs -put formatProject/

	# Remove unnecessary files
	$ hadoop fs -rm -R formatProject/formatConverterTextToSequence/
	$ hadoop fs -rm -R formatProject/formatConverterTextToMap/
	/*********************************************************************************************************
	** Mapper
	** formatProject/formatConverterTextToSequence/src/FormatConverterMapper.java
	** Reads text file and emits the contents out as key-value pairs
	*********************************************************************************************************/

	import java.io.IOException;
	import org.apache.hadoop.io.LongWritable;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.mapreduce.Mapper;


	/**
	 * Pass-through mapper: every record it receives is emitted unchanged.
	 *
	 * <p>Input and output pairs are both typed {@code <LongWritable, Text>}.
	 */
	public class FormatConverterMapper
	    extends Mapper<LongWritable, Text, LongWritable, Text> {

	  /**
	   * Writes the incoming pair straight to the job context.
	   *
	   * @param recordKey   key of the current input record
	   * @param recordValue value of the current input record
	   * @param context     task context that receives the output pair
	   * @throws IOException          if the context fails to write the pair
	   * @throws InterruptedException if the write is interrupted
	   */
	  @Override
	  public void map(LongWritable recordKey, Text recordValue, Context context)
	      throws IOException, InterruptedException {
	    // No transformation: the pair goes out exactly as it came in.
	    context.write(recordKey, recordValue);
	  }
	}
	/*********************************************************************************************************
	** Driver
	** formatProject/formatConverterTextToSequence/src/FormatConverterTextToSequenceDriver.java
	*********************************************************************************************************/

	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
	import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
	import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
	import org.apache.hadoop.mapreduce.Job;
	import org.apache.hadoop.conf.Configured;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.util.Tool;
	import org.apache.hadoop.util.ToolRunner;

	/**
	 * Driver for a map-only job that reads the given input path and writes its
	 * records through {@link SequenceFileOutputFormat}.
	 *
	 * <p>Expected arguments: {@code <input dir> <output dir>}.
	 */
	public class FormatConverterTextToSequenceDriver extends Configured implements Tool {

	  /**
	   * Configures the conversion job, submits it and blocks until it finishes.
	   *
	   * @param args exactly two entries: the input path and the output path
	   * @return {@code -1} when the argument count is wrong, {@code 0} when the
	   *         job succeeds, {@code 1} when it fails
	   * @throws Exception propagated from job setup or execution
	   */
	  @Override
	  public int run(String[] args) throws Exception {

	    // Guard clause: both paths are mandatory.
	    boolean argumentCountOk = (args.length == 2);
	    if (!argumentCountOk) {
	      System.out.printf("Two parameters are required for FormatConverterTextToSequenceDriver-<input dir> <output dir>\n");
	      return -1;
	    }

	    Job conversionJob = new Job(getConf());
	    conversionJob.setJobName("Create Sequence File, from text file");
	    conversionJob.setJarByClass(FormatConverterTextToSequenceDriver.class);

	    // Where to read from and where to write to.
	    Path inputPath = new Path(args[0]);
	    FileInputFormat.setInputPaths(conversionJob, inputPath);
	    Path outputPath = new Path(args[1]);
	    FileOutputFormat.setOutputPath(conversionJob, outputPath);

	    // Map-only: zero reducers, so mapper output goes directly to the output format.
	    conversionJob.setNumReduceTasks(0);
	    conversionJob.setMapperClass(FormatConverterMapper.class);
	    conversionJob.setOutputFormatClass(SequenceFileOutputFormat.class);

	    if (conversionJob.waitForCompletion(true)) {
	      return 0;
	    }
	    return 1;
	  }

	  /**
	   * Command-line entry point; hands off to {@link ToolRunner} and exits the
	   * JVM with the status returned by {@link #run(String[])}.
	   *
	   * @param args command-line arguments passed through to the tool
	   * @throws Exception propagated from {@link ToolRunner#run}
	   */
	  public static void main(String[] args) throws Exception {
	    Configuration configuration = new Configuration();
	    Tool driver = new FormatConverterTextToSequenceDriver();
	    int status = ToolRunner.run(configuration, driver, args);
	    System.exit(status);
	  }
	}
	************************************************
	**Command to create sequence file from text file
	************************************************

	$ hadoop jar formatProject/formatConverterTextToSequence/jars/formatConverterTextToSequence.jar FormatConverterTextToSequenceDriver formatProject/data/departments_sorted/part-m-00000 formatProject/data/departments_sequence
	.
	.
	.
	.

	$ hadoop fs -ls -R formatProject/data/departments_sequence | awk '{print $8}'
	formatProject/data/departments_sequence/_SUCCESS
	formatProject/data/departments_sequence/_logs
	formatProject/data/departments_sequence/_logs/history
	formatProject/data/departments_sequence/_logs/history/cdh-jt01_1376335706356_job_201308121428_0116_conf.xml
	formatProject/data/departments_sequence/_logs/history/job_201308121428_0116_1379087496898_akhanolk_Create+Sequence+File%2C+from+text+file
	formatProject/data/departments_sequence/part-m-00000
	************************************************
	**Results
	************************************************

	$ hadoop fs -text formatProject/data/departments_sequence/part-m-00000
	0 d001 Marketing
	15 d002 Finance
	28 d003 Human Resources
	49 d004 Production
	65 d005 Development
	82 d006 Quality Management
	106 d007 Sales
	117 d008 Research
	131 d009 Customer Service
	**************************************************************
	**Command to create text file from sequence file & results
	**************************************************************

	$ hadoop jar formatProject/formatConverterTextToSequence/jars/formatConverterSequenceToText.jar FormatConverterSequenceToTextDriver formatProject/data/departments_sequence/part-m-00000 formatProject/data/departments_text

	$ hadoop fs -ls -R formatProject/data/departments_text | awk '{print $8}'
	formatProject/data/departments_text/_SUCCESS
	formatProject/data/departments_text/_logs
	formatProject/data/departments_text/_logs/history
	formatProject/data/departments_text/_logs/history/cdh-jt01_1376335706356_job_201308121428_0118_conf.xml
	formatProject/data/departments_text/_logs/history/job_201308121428_0118_1379089420495_akhanolk_Convert+Sequence+File+and+Output+as+Text
	formatProject/data/departments_text/part-m-00000

	$ hadoop fs -cat formatProject/data/departments_text/part-m-00000
	0 d001 Marketing
	15 d002 Finance
	28 d003 Human Resources
	49 d004 Production
	65 d005 Development
	82 d006 Quality Management
	106 d007 Sales
	117 d008 Research
	131 d009 Customer Service
	**************************************************************
	** Compression and sequence files

	**************************************************************
	To create a compressed sequence file (block compression is the recommended option), only a few minor additions are needed in the driver [formatProject/formatConverterTextToSequence/src/FormatConverterTextToSequenceDriver.java].
	The sample code here uses SnappyCodec, and block compression.

	FileOutputFormat.setCompressOutput(job, true);
	FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
	SequenceFileOutputFormat.setOutputCompressionType(job,CompressionType.BLOCK);

	The next section includes the code.