Word count MapReduce
package wc;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCount extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new WordCount(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);

        Configuration conf = getConf();
        // Job.getInstance replaces the deprecated new Job(conf, name) constructor.
        Job job = Job.getInstance(conf, "WordCount");
        job.setJarByClass(WordCount.class);

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Map output and final output use the same key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(Map.class);
        // The reducer doubles as the combiner: summing partial counts is
        // commutative and associative, so pre-aggregating map output is safe.
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    // Emits (word, 1) for every whitespace-separated token in each input line.
    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    // Sums the counts for each word; also used as the combiner above.
    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
}
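To run the job on a cluster, package the class into a jar and submit it with the hadoop command. A rough sketch — the jar name is a placeholder, and the two arguments are whatever HDFS input and output paths you choose (the output path must not already exist):

hadoop jar wordcount.jar wc.WordCount <input-path> <output-path>

Because the driver goes through ToolRunner, standard Hadoop generic options such as -D property=value are parsed off the command line before the remaining arguments reach run().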
Combiner Functions
Many MapReduce jobs are limited by the bandwidth available on the cluster, so it pays to minimize the data transferred between map and reduce tasks. Hadoop allows the user to specify a combiner function to be run on the map output, and the combiner function’s output forms the input to the reduce function. Because the combiner function is an optimization, Hadoop does not provide a guarantee of how many times it will call it for a particular map output record, if at all. In other words, calling the combiner function zero, one, or many times should produce the same output from the reducer. This constrains the combiner to operations that are commutative and associative: summing counts, as the Reduce class above does, qualifies, which is why the same class can serve as both combiner and reducer; a function such as a mean could not be reused this way.
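As a minimal sketch of why reusing the reducer as a combiner is correct for word count (plain Java, no Hadoop dependencies; the class name and the count values are illustrative, not taken from the gist):

import java.util.Arrays;
import java.util.List;

public class CombinerEquivalence {
    // Stands in for the reduce function: sums a list of counts for one word.
    static int reduce(List<Integer> counts) {
        return counts.stream().mapToInt(Integer::intValue).sum();
    }

    public static void main(String[] args) {
        // Counts for one word as emitted by two separate map tasks.
        List<Integer> map1 = Arrays.asList(1, 1, 1);
        List<Integer> map2 = Arrays.asList(1, 1);

        // Without a combiner: the reducer sees all five records.
        int direct = reduce(Arrays.asList(1, 1, 1, 1, 1));

        // With a combiner: each map task pre-aggregates its own output,
        // and the reducer sums only the partial results.
        int combined = reduce(Arrays.asList(reduce(map1), reduce(map2)));

        System.out.println(direct == combined); // true: summation is associative
    }
}

Whether the combiner runs once per map task, several times per spill, or not at all, the reducer receives values that still sum to the same total, which is exactly the zero-one-or-many-times contract described above.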