Skip to content

Instantly share code, notes, and snippets.

@ramarov
Created September 5, 2016 06:46
Show Gist options
  • Save ramarov/c24bb2cf15bf18dc56bd28248a382b89 to your computer and use it in GitHub Desktop.
Save ramarov/c24bb2cf15bf18dc56bd28248a382b89 to your computer and use it in GitHub Desktop.
/**
* The following sample is adapted from the original WordCount sample at
* http://wiki.apache.org/hadoop/WordCount.
*/
//package microbook.wordcount;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
* <p>
* The word count sample counts the number of word occurrences within a set of
* input documents using MapReduce. The code has three parts: mapper, reducer,
* and the main program.
* </p>
*
* @author Srinath Perera ([email protected])
*/
public class WordCount {
private final static IntWritable one = new IntWritable(1);
/**
* <p>
* Hadoop invokes map function once for each line in the input file,
* and it emits each word in the input line against one.
* </p>
*/
public static class WordcountMapper extends Mapper<Object, Text, Text, IntWritable> {
private Text word = new Text();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
System.out.println("itr = " + itr);
System.out.println("value = " + value);
System.out.println("Key = " + key);
while (itr.hasMoreTokens()) {
//System.out.println("itr_nextToken = " + itr.nextToken());
word.set(itr.nextToken().trim());
System.out.println("word = " + word);
//context.write(new Text(itr.nextToken()), one);
/*
if (word.toString().equals("hello"))
{
context.write(word,one);
}
*/
context.write(word,one);
}
}
}
/**
* <p>
* Reduce function receives all the values that has the same key as the
* input, and it output the key and the number of occurrences of the key as
* the output.
* </p>
*/
public static class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException,
InterruptedException {
int sum = 0;
System.out.println("reduce key = " + key);
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "word count");
job.setJar("WordCount.jar");
//job.setJarByClass(WordCount.class);
//job.setJarByClass(WordcountMapper.class);
//job.setJarByClass(WordcountReducer.class);
job.setMapperClass(WordcountMapper.class);
job.setReducerClass(WordcountReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment