Last active
November 12, 2019 18:50
-
-
Save soulmachine/773588fc2b91cb190f83 to your computer and use it in GitHub Desktop.
Calculate Word Count Percentage
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package me.soulmachine; | |
import org.apache.hadoop.conf.Configuration; | |
import org.apache.hadoop.conf.Configured; | |
import org.apache.hadoop.fs.Path; | |
import org.apache.hadoop.io.LongWritable; | |
import org.apache.hadoop.io.Text; | |
import org.apache.hadoop.mapreduce.Job; | |
import org.apache.hadoop.mapreduce.Mapper; | |
import org.apache.hadoop.mapreduce.Reducer; | |
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; | |
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; | |
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; | |
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; | |
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer; | |
import org.apache.hadoop.util.Tool; | |
import org.apache.hadoop.util.ToolRunner; | |
import java.io.IOException; | |
import java.util.StringTokenizer; | |
/** Caculate the percentage of every word. */ | |
@SuppressWarnings("PMD.SignatureDeclareThrowsException") | |
public class WordCountPercentage extends Configured implements Tool { | |
public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> { | |
private static final transient LongWritable one = new LongWritable(1); | |
private final transient Text word = new Text(); | |
/** Total number of lines. */ | |
private static final Text TOTAL_KEY = new Text("TOTAL_KEY"); | |
/** mapper(). */ | |
@Override | |
public void map(final LongWritable key, final Text value, final Context context) | |
throws IOException, InterruptedException { | |
String line = value.toString(); | |
StringTokenizer tokenizer = new StringTokenizer(line); | |
while (tokenizer.hasMoreTokens()) { | |
word.set(tokenizer.nextToken()); | |
context.write(word, one); | |
context.write(TOTAL_KEY, one); | |
} | |
} | |
} | |
/** Reducer. */ | |
public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> { | |
/** Implement reduce(). */ | |
@Override | |
public void reduce(final Text key, final Iterable<LongWritable> values, final Context context) | |
throws IOException, InterruptedException { | |
int sum = 0; | |
for (LongWritable val : values) { | |
sum += val.get(); | |
} | |
context.write(key, new LongWritable(sum)); | |
} | |
} | |
/** | |
* Implement run(). | |
*/ | |
@Override | |
public int run(final String[] args) throws Exception { | |
final Path input = new Path(args[0]); | |
final Path output = new Path(args[1]); | |
final Configuration conf = this.getConf(); | |
final Job job = Job.getInstance(conf, "WordCountPercentage: " + input + "->" + output); | |
job.setJarByClass(WordCountPercentage.class); | |
job.setMapperClass(MyMapper.class); | |
job.setCombinerClass(LongSumReducer.class); | |
job.setReducerClass(MyReducer.class); | |
job.setNumReduceTasks(1); | |
job.setOutputKeyClass(Text.class); | |
job.setOutputValueClass(LongWritable.class); | |
job.setInputFormatClass(TextInputFormat.class); | |
job.setOutputFormatClass(TextOutputFormat.class); | |
FileInputFormat.addInputPath(job, input); | |
FileInputFormat.setInputDirRecursive(job, true); | |
FileOutputFormat.setOutputPath(job, output); | |
return job.waitForCompletion(true) ? 0 : 1; | |
} | |
/** | |
* main. | |
*/ | |
public static void main(final String[] args) throws Exception { | |
final int returnCode = ToolRunner.run(new Configuration(), new WordCountPercentage(), args); | |
System.exit(returnCode); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment