Created
January 13, 2012 18:50
-
-
Save wearpants/1608040 to your computer and use it in GitHub Desktop.
Bluesmote Example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""input: blocks output: (domain_name, count)""" | |
from mrjob.job import MRJob | |
import re | |
# Deliberately loose IPv4 matcher: four dot-separated runs of one to three
# digits. Octets are not range-checked, so "999.999.999.999" matches too.
ip_re = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
class DomainCounter(MRJob):
    """Count requests per host across tab-separated log lines.

    Besides one counter per host, the job emits three summary keys:
    "<TOTAL>" (every line that had a host column), "<IP>" (hosts that look
    like an IPv4 address) and "<HOST>" (everything else).
    """

    def mapper(self, _, line):
        """Emit (key, 1) pairs for one input line.

        The host is read from the 18th tab-separated column (index 17);
        presumably this is the cs-host field of the proxy log -- confirm
        against the data schema. Lines with fewer columns are skipped.

        Yields:
            ("<TOTAL>", 1) for every line with a host column, then either
            ("<IP>", 1) and (host, 1) when the host matches ``ip_re``, or
            ("<HOST>", 1) and (base_domain, 1) otherwise.
        """
        columns = line.rstrip().split('\t')
        try:
            cs_host = columns[17]
        except IndexError:
            # Short/malformed line: no host column, so there is nothing to
            # count. Only IndexError is expected here; anything else is a
            # real error and should surface instead of being swallowed.
            return

        yield "<TOTAL>", 1

        if ip_re.match(cs_host):
            # IP addresses are counted verbatim.
            yield "<IP>", 1
            yield cs_host, 1
        else:
            # Collapse to the last two dot-separated labels, e.g.
            # "www.example.com" -> "example.com". Note this is a plain
            # two-label cut, so "a.example.co.uk" becomes "co.uk".
            yield "<HOST>", 1
            yield ".".join(cs_host.rsplit('.', 2)[-2:]), 1

    def combiner(self, domain, occurrences):
        """Pre-sum the counts for ``domain`` before they reach the reducer.

        Performs the same summation as ``reducer``.
        """
        yield domain, sum(occurrences)

    def reducer(self, domain, occurrences):
        """Sum all counts for ``domain`` and yield (domain, count)."""
        yield domain, sum(occurrences)
# Script entry point: launch the job through the ``run`` classmethod that
# DomainCounter inherits from MRJob.
if __name__ == "__main__":
    DomainCounter.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This code is based on mrjob, a Python wrapper around the Hadoop MapReduce framework.
Feel free to use it, or write your own code in whatever tool you like.