Hadoop Training (Hadoop 2) - Day 2
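Four MapReduce jobs over NCSA/Apache-style access logs, adapted from Srinath Perera's samples: a per-URL frequency-distribution sorter, a hits-per-URL counter, a busiest-client finder, and a message-size aggregator. The first, second, and fourth use the old org.apache.hadoop.mapred API; WebLogMaxIP uses the newer org.apache.hadoop.mapreduce API.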
package analysis;

import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.conf.*;

/**
 * Sorts the number of hits received by each URL.
 *
 * @author Srinath Perera ([email protected])
 */
public class WebLogFrequencyDistributionProcessor {

    public static final Pattern httplogPattern = Pattern.compile(
            "([^\\s]+) - - \\[(.+)\\] \"([^\\s]+) (/[^\\s]*) HTTP/[^\\s]+\" [^\\s]+ ([0-9]+)");

    public static class AMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {

        // Each input line is expected to be a "URL<whitespace>count" pair;
        // re-emitting it keyed by URL lets the framework sort the output.
        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            String[] tokens = value.toString().split("\\s");
            output.collect(new Text(tokens[0]), new IntWritable(Integer.parseInt(tokens[1])));
        }
    }

    /**
     * <p>The reduce function receives all the values that have the same key and
     * passes the first value through unchanged, so each URL appears once.</p>
     */
    public static class AReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {

        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            if (values.hasNext()) {
                output.collect(key, values.next());
            }
        }
    }

    /**
     * @param args input path and output path
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(WebLogFrequencyDistributionProcessor.class);
        job.setMapperClass(AMapper.class);
        job.setReducerClass(AReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        JobClient.runJob(job);
    }
}
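WebLogFrequencyDistributionProcessor is meant to consume the output of WebLogHitsByLinkProcessor below, one URL/count pair per line. A minimal standalone sketch of what AMapper does with one such line, assuming the upstream output is tab-separated (the sample line and class name are hypothetical):

public class TokenizeDemo {
    public static void main(String[] args) {
        // Assumed format of the upstream job's output: URL<TAB>count.
        String line = "/history/apollo/\t97";
        String[] tokens = line.split("\\s"); // \s also matches the tab separator
        // AMapper re-emits this as a (URL, count) pair.
        System.out.println(tokens[0] + " -> " + Integer.parseInt(tokens[1]));
    }
}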
package analysis;

import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.conf.*;

/**
 * Finds the number of hits received by each URL.
 *
 * @author Srinath Perera ([email protected])
 */
public class WebLogHitsByLinkProcessor {

    public static final Pattern httplogPattern = Pattern.compile(
            "([^\\s]+) - - \\[(.+)\\] \"([^\\s]+) (/[^\\s]*) HTTP/[^\\s]+\" [^\\s]+ ([0-9]+)");

    public static class AMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // Emit (requested URL, 1) for every log line the pattern recognizes.
        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            Matcher matcher = httplogPattern.matcher(value.toString());
            if (matcher.matches()) {
                String linkUrl = matcher.group(4);
                word.set(linkUrl);
                output.collect(word, one);
            }
        }
    }

    public static class AReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        // Sum the ones emitted for each URL to get its total hit count.
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            result.set(sum);
            output.collect(key, result);
        }
    }
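    // AReducer just sums its inputs, so the same class could also be registered
    // as a combiner (job.setCombinerClass(AReducer.class)) to cut shuffle
    // traffic; the original sample leaves this out.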
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(WebLogHitsByLinkProcessor.class);
        job.setMapperClass(AMapper.class);
        job.setReducerClass(AReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        JobClient.runJob(job);
    }
}
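All four jobs share the same httplogPattern. A quick standalone check of what it captures from a Common Log Format line; the sample line is representative of the NASA trace linked below, not taken from the gist:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LogPatternDemo {
    static final Pattern httplogPattern = Pattern.compile(
            "([^\\s]+) - - \\[(.+)\\] \"([^\\s]+) (/[^\\s]*) HTTP/[^\\s]+\" [^\\s]+ ([0-9]+)");

    public static void main(String[] args) {
        String line = "199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "
                + "\"GET /history/apollo/ HTTP/1.0\" 200 6245";
        Matcher m = httplogPattern.matcher(line);
        if (m.matches()) {
            System.out.println("host = " + m.group(1)); // key for WebLogMaxIP
            System.out.println("path = " + m.group(4)); // key for WebLogHitsByLinkProcessor
            System.out.println("size = " + m.group(5)); // value for WebLogMessageSizeAggregator
        }
    }
}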
package analysis;

import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Finds the client IP/host that made the most requests (new mapreduce API).
 */
public class WebLogMaxIP {

    public static final Pattern httplogPattern = Pattern.compile(
            "([^\\s]+) - - \\[(.+)\\] \"([^\\s]+) (/[^\\s]*) HTTP/[^\\s]+\" [^\\s]+ ([0-9]+)");

    private final static IntWritable one = new IntWritable(1);

    public static class AMapper extends Mapper<Object, Text, Text, IntWritable> {

        // Emit (client host, 1) for every log line the pattern recognizes.
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            Matcher matcher = httplogPattern.matcher(value.toString());
            if (matcher.matches()) {
                context.write(new Text(matcher.group(1)), one);
            }
        }
    }
    public static class AReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Running maximum across all keys seen by this reduce task.
        int max = 0;
        Text maxIp = new Text();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            if (sum > max) {
                max = sum;
                maxIp.set(key);
            }
        }

        // Emit the busiest host once all keys have been reduced.
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.write(maxIp, new IntWritable(max));
        }
    }
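    // Note: max and maxIp are per-reduce-task state, so each reduce task reports
    // its own local maximum from cleanup(); a true global maximum requires
    // running the job with a single reducer (job.setNumReduceTasks(1)).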
    /**
     * @param args input path and output path
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Max IP");
        job.setJarByClass(WebLogMaxIP.class);
        job.setMapperClass(AMapper.class);
        job.setReducerClass(AReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
package analysis;

import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.conf.*;

/**
 * Computes the mean, maximum, and minimum response (message) size across the log.
 */
public class WebLogMessageSizeAggregator {

    public static final Pattern httplogPattern = Pattern.compile(
            "([^\\s]+) - - \\[(.+)\\] \"([^\\s]+) (/[^\\s]*) HTTP/[^\\s]+\" [^\\s]+ ([0-9]+)");

    public static class AMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {

        // Emit every response size under the single key "msgSize" so that one
        // reduce call sees all of them.
        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            Matcher matcher = httplogPattern.matcher(value.toString());
            if (matcher.matches()) {
                int size = Integer.parseInt(matcher.group(5));
                output.collect(new Text("msgSize"), new IntWritable(size));
            }
        }
    }
    public static class AReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {

        // Single pass over all sizes: accumulate the total and track min/max.
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            double tot = 0;
            int count = 0;
            int min = Integer.MAX_VALUE;
            int max = 0;
            while (values.hasNext()) {
                int value = values.next().get();
                tot = tot + value;
                count++;
                if (value < min) {
                    min = value;
                }
                if (value > max) {
                    max = value;
                }
            }
            // Divide before truncating; "(int) tot / count" would truncate first.
            output.collect(new Text("Mean"), new IntWritable((int) (tot / count)));
            output.collect(new Text("Max"), new IntWritable(max));
            output.collect(new Text("Min"), new IntWritable(min));
        }
    }
    /**
     * @param args input path and output path
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(WebLogMessageSizeAggregator.class);
        job.setJarByClass(WebLogMessageSizeAggregator.class);
        job.setMapperClass(AMapper.class);
        job.setReducerClass(AReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        JobClient.runJob(job);
    }
}
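Every map output in WebLogMessageSizeAggregator shares the single key "msgSize", so one reduce call scans every size in a single pass. A standalone sketch of that pass over assumed sample values:

public class SizeStatsDemo {
    public static void main(String[] args) {
        int[] sizes = {6245, 3985, 4085, 0}; // assumed sample byte counts
        double tot = 0;
        int min = Integer.MAX_VALUE;
        int max = 0;
        for (int v : sizes) {
            tot += v;
            if (v < min) min = v;
            if (v > max) max = v;
        }
        // Divide before truncating, matching the reducer above.
        System.out.println("Mean=" + (int) (tot / sizes.length) + " Max=" + max + " Min=" + min);
    }
}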
first087 commented Sep 4, 2015:
Input from ftp://ita.ee.lbl.gov/traces/NASA_access_log_Jul95.gz