Created
July 6, 2012 21:40
-
-
Save cotdp/3062901 to your computer and use it in GitHub Desktop.
Mapper for processing ZipFile entries
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* This Mapper class checks the filename ends with the .txt extension, cleans | |
* the text and then applies the simple WordCount algorithm. | |
*/ | |
public static class MyMapper | |
extends Mapper<Text, BytesWritable, Text, IntWritable> | |
{ | |
private final static IntWritable one = new IntWritable( 1 ); | |
private Text word = new Text(); | |
public void map( Text key, BytesWritable value, Context context ) | |
throws IOException, InterruptedException | |
{ | |
// NOTE: the filename is the *full* path within the ZIP file | |
// e.g. "subdir1/subsubdir2/Ulysses-18.txt" | |
String filename = key.toString(); | |
LOG.info( "map: " + filename ); | |
// We only want to process .txt files | |
if ( filename.endsWith(".txt") == false ) | |
return; | |
// Prepare the content | |
String content = new String( value.getBytes(), "UTF-8" ); | |
content = content.replaceAll( "[^A-Za-z \n]", "" ).toLowerCase(); | |
// Tokenize the content | |
StringTokenizer tokenizer = new StringTokenizer( content ); | |
while ( tokenizer.hasMoreTokens() ) | |
{ | |
word.set( tokenizer.nextToken() ); | |
context.write( word, one ); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment