Created
July 30, 2012 14:11
-
-
Save harit-sunrun/3207166 to your computer and use it in GitHub Desktop.
Build the count of counts for cited patents from patent dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.hadoop.patent; | |
/**
 * This program builds a histogram of patent citation counts: for every distinct
 * citation count it reports how many patents were cited exactly that many times.
 * Source data - http://data.nber.org/patents/
 * (the citation data set cite75_99.txt and the patent description data set apat63_99.txt).
 */
import org.apache.hadoop.conf.Configuration; | |
import org.apache.hadoop.conf.Configured; | |
import org.apache.hadoop.fs.Path; | |
import org.apache.hadoop.io.IntWritable; | |
import org.apache.hadoop.io.Text; | |
import org.apache.hadoop.mapred.*; | |
import org.apache.hadoop.util.Tool; | |
import org.apache.hadoop.util.ToolRunner; | |
import java.io.IOException; | |
import java.util.Iterator; | |
public class CitationHistogram extends Configured implements Tool { | |
public static class MapClass extends MapReduceBase implements Mapper<Text, Text, IntWritable, IntWritable> { | |
private final static IntWritable uno = new IntWritable(1); | |
private IntWritable citationCount = new IntWritable(); | |
public void map(Text key, Text value, OutputCollector<IntWritable, IntWritable> output, Reporter reporter) throws IOException { | |
citationCount.set(Integer.parseInt(value.toString())); | |
output.collect(citationCount, uno); | |
} | |
} | |
public static class Reduce extends MapReduceBase implements Reducer<IntWritable, IntWritable, IntWritable, IntWritable> { | |
public void reduce(IntWritable key, Iterator<IntWritable> values, OutputCollector<IntWritable, IntWritable> output, Reporter reporter) throws IOException { | |
int count = 0; | |
while (values.hasNext()) { | |
count += values.next().get(); | |
} | |
output.collect(key, new IntWritable(count)); | |
} | |
} | |
public int run(String args[]) throws Exception { | |
Configuration conf = getConf(); | |
JobConf job = new JobConf(conf, CitationHistogram.class); | |
Path in = new Path(args[0]); | |
Path out = new Path(args[1]); | |
FileInputFormat.setInputPaths(job, in); | |
FileOutputFormat.setOutputPath(job, out); | |
job.setJobName("CitationHistogram"); | |
job.setMapperClass(MapClass.class); | |
job.setReducerClass(Reduce.class); | |
job.setInputFormat(KeyValueTextInputFormat.class); | |
job.setOutputFormat(TextOutputFormat.class); | |
job.setOutputKeyClass(IntWritable.class); | |
job.setOutputValueClass(IntWritable.class); | |
// job.set("key.value.separator.in.input.line", ","); | |
JobClient.runJob(job); | |
return 0; | |
} | |
public static void main(String args[]) throws Exception { | |
int res = ToolRunner.run(new Configuration(), new CitationHistogram(), args); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment