Skip to content

Instantly share code, notes, and snippets.

@jgmize
Last active August 29, 2015 14:06
Show Gist options
  • Save jgmize/a14140c203f4580724eb to your computer and use it in GitHub Desktop.
Save jgmize/a14140c203f4580724eb to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import mrjob.job
import mrjob.protocol
# Positions of the columns this job reads from each tab-split input line,
# grouped by contiguous runs of columns.
# NOTE(review): what each column holds is inferred from its name only --
# confirm against the weblog schema.
IP, DOMAIN = 1, 2
DATETIME, METHOD, URL, STATUS_CODE = 4, 5, 6, 7
UA = 10
class WeblogJob(mrjob.job.MRJob):
    """Count 404 responses for ``/firefox/buttons/`` URLs on www.mozilla.org.

    The mapper emits ``(url, 1)`` for every matching log line, the combiner
    and reducer sum those ones per URL, and the reducer formats each total
    as a ``url<TAB>count`` string.
    """

    # The reducer yields an empty key and puts the whole formatted line in
    # the value, which pairs with a value-only output protocol.
    OUTPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
    # Hadoop input format class name handed to mrjob.
    # NOTE(review): presumably the raw logs are stored as sequence files --
    # confirm against the input data.
    HADOOP_INPUT_FORMAT = (
        'org.apache.hadoop.mapred.SequenceFileAsTextInputFormat')

    def mapper(self, _, line):
        """Yield ``(url, 1)`` when ``line`` is a matching 404 request.

        A line matches when, after splitting on tabs, its DOMAIN column is
        ``www.mozilla.org``, its METHOD column is ``GET``, its STATUS_CODE
        column is ``404`` and its URL column contains ``/firefox/buttons/``.

        Any exception raised while splitting or indexing the line (for
        example a line with too few columns) is not propagated; it is
        recorded by incrementing the ``errors``/``all`` counter and the
        line is skipped.
        """
        try:
            fields = line.split('\t')
            # The checks short-circuit in this order, so a short line only
            # counts as an error if evaluation reaches a missing column.
            is_button_404 = (
                fields[DOMAIN] == 'www.mozilla.org' and
                fields[METHOD] == 'GET' and
                fields[STATUS_CODE] == '404' and
                '/firefox/buttons/' in fields[URL])
        except Exception:
            # Best effort: one malformed record must not fail the whole job.
            self.increment_counter("errors", "all")
            return
        if is_button_404:
            yield fields[URL], 1

    def combiner(self, key, vals):
        """Collapse the partial counts seen for ``key`` into a single sum."""
        yield key, sum(vals)

    def reducer(self, key, vals):
        """Yield the final ``url<TAB>count`` line for ``key`` under an empty key."""
        yield '', '%s\t%d' % (key, sum(vals))
if __name__ == '__main__':
    # Start the job only when this file is executed as a script, so that
    # importing the module has no side effects.
    WeblogJob.run()
#!/bin/bash
# Run button404s.py on Hadoop over one month of www.mozilla.org weblogs and
# write the resulting "url<TAB>count" lines, sorted by count with the highest
# first, to <LABEL>_button_404s.tsv in the current directory.
#
# Usage: <this script> [DATE_PREFIX] [LABEL]
#   DATE_PREFIX  prefix of the date directories to read   (default: 2014-09)
#   LABEL        prefix used for all output names         (default: sept)
#
# With no arguments this performs the September 2014 run; passing
# "2014-08 aug" performs the August 2014 run.

# Stop at the first failing step (including inside the pipeline) so that a
# failed job can never be followed by processing of stale or missing output.
set -euo pipefail

DATE_PREFIX="${1:-2014-09}"
LABEL="${2:-sept}"

export TMPDIR="$HOME/tmp"
export HADOOP_HOME=/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce/
export OUTPUT_DIR="${LABEL}_buttons_output"
# Quoted so the glob reaches Hadoop untouched instead of being expanded
# against the local filesystem by the shell.
export INPUT="hdfs:///data/weblogs/v2_raw/www.mozilla.org/${DATE_PREFIX}-*/*"

LOCAL_GZ="${LABEL}_buttons.gz"
REPORT="${LABEL}_button_404s.tsv"

# Clear the output directory left behind by a previous run, if any.
hadoop fs -rm -r -f "$OUTPUT_DIR"

# A single reduce task means all results land in one part file.
python button404s.py --runner hadoop --hadoop-bin /usr/bin/hadoop \
    --jobconf mapred.reduce.tasks=1 --no-output \
    --output-dir "$OUTPUT_DIR" "$INPUT"

# "hadoop fs -get" will not overwrite an existing local file, so remove any
# copy from an earlier run before fetching the fresh one.
rm -f "$LOCAL_GZ"
hadoop fs -get "$OUTPUT_DIR/part-00000.gz" "$LOCAL_GZ"

zcat "$LOCAL_GZ" | sort -n -k 2 -r > "$REPORT"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment