Last active
December 12, 2018 10:55
-
-
Save m-manu/8cfc586c512a4d931f7730539c6cfee2 to your computer and use it in GitHub Desktop.
Word Count Hadoop example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Build descriptor for the word-count Hadoop example.
  Depends on the HDP (Hortonworks) builds of Hadoop; the repository below
  serves those artifacts. Test-jar dependencies pull in the Hadoop
  mini-cluster test harness classes.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>manu.sandbox</groupId>
    <artifactId>hadoop-sandbox</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- HDP 2.4.0.0-169 builds of Hadoop/HBase -->
        <hadoop.version>2.7.1.2.4.0.0-169</hadoop.version>
        <hbase.version>1.1.2.2.4.0.0-169</hbase.version>
    </properties>
    <repositories>
        <repository>
            <id>HDPReleases</id>
            <name>HDP Releases</name>
            <!-- Must be https: Maven 3.8.1+ blocks plain-http repositories,
                 and http downloads are open to artifact tampering. -->
            <url>https://repo.hortonworks.com/content/repositories/releases/</url>
        </repository>
    </repositories>
    <dependencies>
        <dependency>
            <groupId>manu.sandbox</groupId>
            <artifactId>java-sandbox</artifactId>
            <version>1.0-SNAPSHOT</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- test-jar artifacts provide Hadoop's testing utilities -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
            <type>test-jar</type>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
            <type>test-jar</type>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.3</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                    <compilerArgument>-Xlint:all</compilerArgument>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package manu.sandbox.demos.hadoop; | |
import org.apache.hadoop.conf.Configuration; | |
import org.apache.hadoop.fs.FileSystem; | |
import org.apache.hadoop.fs.Path; | |
import org.apache.hadoop.io.IntWritable; | |
import org.apache.hadoop.io.LongWritable; | |
import org.apache.hadoop.io.Text; | |
import org.apache.hadoop.mapreduce.Job; | |
import org.apache.hadoop.mapreduce.Mapper; | |
import org.apache.hadoop.mapreduce.Reducer; | |
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; | |
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; | |
import java.io.IOException; | |
import java.util.StringTokenizer; | |
public class WordCounter { | |
private static class WordMapper extends Mapper<LongWritable, Text, Text, IntWritable> { | |
private final static IntWritable one = new IntWritable(1); | |
private Text word = new Text(); | |
@Override | |
public void map(LongWritable n, Text t, Context context) throws java.io.IOException, java.lang.InterruptedException { | |
String line = t.toString(); | |
StringTokenizer tokenizer = new StringTokenizer(line, " "); | |
while (tokenizer.hasMoreTokens()) { | |
word.set(tokenizer.nextToken()); | |
context.write(word, one); | |
} | |
} | |
} | |
private static class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable> { | |
@Override | |
public void reduce(Text word, Iterable<IntWritable> counts, Context context) throws IOException, InterruptedException { | |
int sum = 0; | |
for (IntWritable count : counts) { | |
sum += count.get(); | |
} | |
context.write(word, new IntWritable(sum)); | |
} | |
} | |
public static void run(String input, String output, String jobName) { | |
try { | |
Configuration conf = new Configuration(); | |
Job job = Job.getInstance(conf, jobName); | |
Path inputPath = new Path(input); | |
Path outputPath = new Path(output); | |
FileInputFormat.setInputPaths(job, inputPath); | |
FileOutputFormat.setOutputPath(job, outputPath); | |
FileSystem fs = FileSystem.getLocal(conf); | |
fs.delete(outputPath, true); | |
job.setJarByClass(WordCounter.class); | |
job.setOutputKeyClass(Text.class); | |
job.setOutputValueClass(IntWritable.class); | |
job.setMapperClass(WordMapper.class); | |
job.setReducerClass(WordReducer.class); | |
job.waitForCompletion(false); | |
} catch (Exception e) { | |
System.err.println("Exception thrown"); | |
e.printStackTrace(); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package manu.sandbox.demos.hadoop; | |
public class WordCounterTest { | |
public static void main(String[] args) { | |
if (args.length == 3) { | |
WordCounter.run(args[0], args[1], args[2]); | |
} else { | |
System.err.println("Invalid number of arguments"); | |
} | |
} | |
} |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.