Skip to content

Instantly share code, notes, and snippets.

@jgmize
Created September 29, 2014 05:39
Show Gist options
  • Save jgmize/0229567331b6bd7cc2af to your computer and use it in GitHub Desktop.
Save jgmize/0229567331b6bd7cc2af to your computer and use it in GitHub Desktop.
correlated locale redirects
#!/usr/bin/env python
import re
import mrjob.job
import mrjob.protocol
# Zero-based column positions within a tab-separated weblog line; the mapper
# splits each line on '\t' and indexes the resulting list with these.
IP = 1
DOMAIN = 2
DATETIME = 4  # not referenced by the job code visible in this script
METHOD = 5
URL = 6
STATUS_CODE = 7
UA = 10  # not referenced by the job code visible in this script
# A path segment that looks like a locale code: '/', 2-3 lowercase letters,
# an optional '-' plus two letters, then '/' (e.g. '/de/' or '/en-US/').
LOCALE_REGEX = re.compile(r'/[a-z]{2,3}(-[a-zA-Z]{2})?/')
# Upper bound on the number of entries kept in non_locale_redirects; the
# mapper drops the oldest entry once the list grows past this size.
MAX_REDIRECTS = 500
# (url, ip) tuples for the most recent non-locale 301 responses, newest first.
# Module-level, so it persists across mapper() calls within one process.
non_locale_redirects = []
class WeblogJob(mrjob.job.MRJob):
    """Count www.mozilla.org 301 redirects and correlate them with later hits.

    Emits one output line per counter key, formatted as ``key<TAB>count``:

    * ``redirects`` -- GET requests answered with a 301.
    * ``locale-redirects`` -- 301s whose URL starts with a locale segment.
    * ``non-locale-redirects`` -- 301s whose URL does not.
    * ``redirected-url-ip-matches`` -- 200 responses for a locale-prefixed
      URL that, with the locale removed, equals a recently seen non-locale
      redirect from the same IP.

    The correlation relies on the module-level ``non_locale_redirects``
    list, so it only sees redirects processed earlier by the same Python
    process.
    NOTE(review): presumably each Hadoop map task runs in its own process,
    in which case matches across input splits are not counted -- confirm.
    """

    OUTPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
    HADOOP_INPUT_FORMAT = (
        'org.apache.hadoop.mapred.SequenceFileAsTextInputFormat')

    def mapper(self, _, line):
        """Classify one tab-separated log line and yield ``(key, 1)`` pairs.

        Lines that cannot be processed (for example, too few columns)
        increment the ``errors``/``all`` counter and yield nothing further.
        """
        try:
            fields = line.split('\t')
            if (fields[DOMAIN] == 'www.mozilla.org' and
                    fields[METHOD] == 'GET'):
                if fields[STATUS_CODE] == '301':
                    yield 'redirects', 1
                    if LOCALE_REGEX.match(fields[URL]):
                        yield 'locale-redirects', 1
                    else:
                        yield 'non-locale-redirects', 1
                        # Newest entry goes first; the oldest falls off the
                        # end once the window exceeds MAX_REDIRECTS.
                        non_locale_redirects.insert(
                            0, (fields[URL], fields[IP]))
                        if len(non_locale_redirects) > MAX_REDIRECTS:
                            non_locale_redirects.pop()
                elif fields[STATUS_CODE] == '200':
                    # Anchor at the start of the path, like the 301 branch
                    # does, so a short segment in the middle of the URL
                    # (e.g. '/firefox/os/') is not mistaken for a locale.
                    locale_prefix = LOCALE_REGEX.match(fields[URL])
                    if locale_prefix:
                        # The regex consumes the slash that follows the
                        # locale, so put a leading '/' back.
                        unlocalized_url = (
                            '/' + fields[URL][locale_prefix.end():])
                        # One count per matching remembered redirect.
                        for redirected_url, ip in non_locale_redirects:
                            if (ip == fields[IP] and
                                    redirected_url == unlocalized_url):
                                yield 'redirected-url-ip-matches', 1
        except Exception:
            # Deliberately best-effort: a bad line is counted, not fatal.
            # `except Exception` (rather than a bare `except:`) lets
            # GeneratorExit, KeyboardInterrupt and SystemExit propagate
            # instead of being recorded as data errors.
            self.increment_counter("errors", "all")

    def combiner(self, key, vals):
        """Pre-sum the counts for ``key`` before they reach the reducer."""
        yield key, sum(vals)

    def reducer(self, key, vals):
        """Yield the final total for ``key`` as a ``key<TAB>count`` value.

        The output key is an empty string; the formatted text is carried in
        the value.
        """
        yield '', '%s\t%d' % (key, sum(vals))
# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    WeblogJob.run()
#!/bin/bash
# Run the correlated_locale_redirects mrjob script on Hadoop for the August
# and September 2014 www.mozilla.org weblogs, then download each month's
# result as a local, uncompressed .tsv file.
set -u

export TMPDIR=$HOME/tmp
# Make sure the temp directory exists before anything tries to write to it.
mkdir -p "$TMPDIR"
export HADOOP_HOME=/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce/
export SCRIPT_NAME=correlated_locale_redirects

# run_month PREFIX YYYY-MM
# Runs the job over one month of logs and leaves the result in
# PREFIX_$SCRIPT_NAME.tsv. The steps are chained with && so a failed step
# skips the remaining steps for that month and the function returns non-zero.
run_month() {
    local prefix=$1
    local month=$2
    # Quoted so the local shell never tries to expand the HDFS glob itself.
    local input="hdfs:///data/weblogs/v2_raw/www.mozilla.org/${month}-*/*"
    local output_dir="${prefix}_${SCRIPT_NAME}"
    local output_file="${output_dir}.tsv"

    # Remove output left over from a previous run.
    hadoop fs -rm -r -f "$output_dir" &&
    # A single reducer is requested because only part-00000 is fetched below.
    # mapred.reduce.tasks is the property name Hadoop uses for the reducer
    # count.
    python "$SCRIPT_NAME.py" --runner hadoop --hadoop-bin /usr/bin/hadoop \
        --jobconf mapred.reduce.tasks=1 --no-output \
        --output-dir "$output_dir" "$input" &&
    hadoop fs -get "$output_dir/part-00000.gz" "$output_file.gz" &&
    # -f overwrites a .tsv left behind by an earlier run instead of failing.
    gunzip -f "$output_file.gz"
}

# Both months are always attempted; the exit status reports any failure.
status=0
run_month aug 2014-08 || status=1
run_month sept 2014-09 || status=1
exit "$status"
#!/usr/bin/env python
import re
import mrjob.job
import mrjob.protocol
# Zero-based column positions within a tab-separated weblog line; the mapper
# splits each line on '\t' and indexes the resulting list with these.
IP = 1  # not referenced by the job code visible in this script
DOMAIN = 2
DATETIME = 4  # not referenced by the job code visible in this script
METHOD = 5
URL = 6
STATUS_CODE = 7
UA = 10  # not referenced by the job code visible in this script
# A path segment that looks like a locale code: '/', 2-3 lowercase letters,
# an optional '-' plus two letters, then '/' (e.g. '/de/' or '/en-US/').
LOCALE_REGEX = re.compile(r'/[a-z]{2,3}(-[a-zA-Z]{2})?/')
class WeblogJob(mrjob.job.MRJob):
    """Count www.mozilla.org 301 redirects, split by locale prefix.

    Emits one output line per counter key, formatted as ``key<TAB>count``:

    * ``redirects`` -- GET requests answered with a 301.
    * ``locale-redirects`` -- 301s whose URL starts with a locale segment.
    * ``non-locale-redirects`` -- 301s whose URL does not.
    """

    OUTPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
    HADOOP_INPUT_FORMAT = (
        'org.apache.hadoop.mapred.SequenceFileAsTextInputFormat')

    def mapper(self, _, line):
        """Classify one tab-separated log line and yield ``(key, 1)`` pairs.

        Lines that cannot be processed (for example, too few columns)
        increment the ``errors``/``all`` counter and yield nothing further.
        """
        try:
            fields = line.split('\t')
            if (fields[DOMAIN] == 'www.mozilla.org' and
                    fields[METHOD] == 'GET' and
                    fields[STATUS_CODE] == '301'):
                yield 'redirects', 1
                if LOCALE_REGEX.match(fields[URL]):
                    yield 'locale-redirects', 1
                else:
                    yield 'non-locale-redirects', 1
        except Exception:
            # Deliberately best-effort: a bad line is counted, not fatal.
            # `except Exception` (rather than a bare `except:`) lets
            # GeneratorExit, KeyboardInterrupt and SystemExit propagate
            # instead of being recorded as data errors.
            self.increment_counter("errors", "all")

    def combiner(self, key, vals):
        """Pre-sum the counts for ``key`` before they reach the reducer."""
        yield key, sum(vals)

    def reducer(self, key, vals):
        """Yield the final total for ``key`` as a ``key<TAB>count`` value.

        The output key is an empty string; the formatted text is carried in
        the value.
        """
        yield '', '%s\t%d' % (key, sum(vals))
# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    WeblogJob.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment