Created
February 13, 2016 06:40
-
-
Save tsuyukimakoto/1c80d396b59b7bb7a69b to your computer and use it in GitHub Desktop.
忘れないようにメモだけ
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
from datetime import datetime, timedelta | |
from optparse import OptionParser | |
import operator | |
import collections | |
import itertools | |
import multiprocessing | |
class SimpleMapReduce(object):
    """Run a map step and a reduce step on a ``multiprocessing.Pool``.

    ``map_func`` is called once per input and must return an iterable of
    ``(key, value)`` pairs.  All pairs are grouped by key, and
    ``reduce_func`` is then called once per ``(key, [value, ...])`` pair.

    The worker pool is created in ``__init__``; this class never closes it.
    """

    def __init__(self, map_func, reduce_func, num_workers=None):
        """Start the worker pool and remember the two callables.

        ``num_workers`` is handed unchanged to ``multiprocessing.Pool``.
        """
        self.map_func, self.reduce_func = map_func, reduce_func
        self.pool = multiprocessing.Pool(num_workers)

    def partition(self, mapped_values):
        """Group ``(key, value)`` pairs by key.

        Returns the ``items()`` of a dict mapping each key to the list of
        its values, in the order the values were seen.
        """
        grouped = collections.defaultdict(list)
        for pair in mapped_values:
            key, value = pair
            grouped[key].append(value)
        return grouped.items()

    def __call__(self, inputs, chunksize=1):
        """Map every input, group the emitted pairs, and reduce each group.

        ``chunksize`` only applies to the map step.  Returns the list of
        values produced by ``reduce_func``.
        """
        per_input_pairs = self.pool.map(
            self.map_func, inputs, chunksize=chunksize)
        # Flatten the per-input results into one stream of pairs.
        all_pairs = itertools.chain(*per_input_pairs)
        grouped = self.partition(all_pairs)
        return self.pool.map(self.reduce_func, grouped)
# English month abbreviation -> zero-padded month number.
MONTHS = {
    'Jan': '01',
    'Feb': '02',
    'Mar': '03',
    'Apr': '04',
    'May': '05',
    'Jun': '06',
    'Jul': '07',
    'Aug': '08',
    'Sep': '09',
    'Oct': '10',
    'Nov': '11',
    'Dec': '12',
}


def parse_date(date):
    """Convert an access-log timestamp to a ``YYYYMMDD`` string.

    ``date`` must start with ``DD/Mon/YYYY``, for example
    ``'13/Feb/2016:06:40:00 +0900'``; only the first 11 characters are read.

    Raises ``KeyError`` when characters 3-5 are not a key of ``MONTHS``.
    """
    day, month_name, year = date[0:2], date[3:6], date[7:11]
    # The lookup table is MONTHS; the previous code referenced an undefined
    # name ``MONTH`` and failed with NameError on every call.
    return ''.join((year, MONTHS[month_name], day))
# One regular-expression fragment per field of an access-log line, in the
# order the fields appear.  The directive each fragment corresponds to is
# noted above it.
PARTS = (
    # host %h
    r'(?P<host>\S+)',
    # ident %l (unused, so not captured)
    r'\S+',
    # user %u
    r'(?P<user>\S+)',
    # time %t
    r'\[(?P<time>.+)\]',
    # request "%r"
    r'"(?P<request>.+)"',
    # status %>s
    r'(?P<status>[0-9]+)',
    # size %b (careful, can be '-')
    r'(?P<size>\S+)',
    # referer "%{Referer}i"
    r'"(?P<referer>.*)"',
    # user agent "%{User-agent}i"
    r'"(?P<agent>.*)"',
)

# Fields are separated by whitespace; optional trailing whitespace is
# allowed before the end of the line.
LOG_PATTERN_RE = re.compile(
    r'\s+'.join(PARTS) + r'\s*\Z'
)

# Matches a request that contains any one of these three path fragments.
AIR_RE = re.compile(
    r'spam/egg1|ham/egg2|everes/egg3'
)
class NotMatchError(Exception):
    """Raised by ``parse`` for a log line that should be skipped."""
class DateLog(object):
    """The distinct clients recorded for one log date.

    A client is identified by the string ``'<host>-<useragent>'``; the
    identifiers are kept in the ``client`` set.
    """

    __slots__ = ('log_date', 'client')

    def __init__(self, log_date):
        """Create an empty record for ``log_date``."""
        self.client = set()
        self.log_date = log_date

    def add_log(self, host, useragent):
        """Record one request; a repeated host/agent pair is stored once."""
        client_id = '{0}-{1}'.format(host, useragent)
        self.client.add(client_id)

    def count_client(self):
        """Return how many distinct clients have been recorded."""
        distinct_clients = self.client
        return len(distinct_clients)
def parse(line):
    """Extract ``(log_date, host, agent)`` from one access-log line.

    ``log_date`` is whatever ``parse_date`` returns for the line's
    timestamp field.

    Raises ``NotMatchError`` when the line does not match
    ``LOG_PATTERN_RE`` or when its request does not match ``AIR_RE``.
    """
    matched = LOG_PATTERN_RE.match(line)
    if not matched:
        raise NotMatchError()

    fields = matched.groupdict()
    # The date is converted before the request filter runs, so a timestamp
    # that parse_date rejects fails here regardless of the request.
    log_date = parse_date(fields.get('time', '01/Jan/0001:00:00:00 +0900'))

    if AIR_RE.search(fields.get('request')) is None:
        raise NotMatchError()
    return (log_date, fields.get('host'), fields.get('agent'))
def parse_access_log(target_file):
    """Map step: collect the distinct clients per log date from one log file.

    ``target_file`` is the path of an access log; a path ending in ``.gz``
    is read through ``gzip``.  Lines for which ``parse`` raises
    ``NotMatchError`` are skipped.  A start and an end timestamp are printed
    for the file.

    Returns a list of ``(log_date, DateLog)`` pairs, one per date seen.
    """
    # %M is minutes; the previous format used %m, which prints the month.
    print('{}: start:{}'.format(target_file,
                                datetime.now().strftime('%H:%M:%S')))
    if os.path.splitext(target_file)[1] == '.gz':
        import gzip
        # gzip.open defaults to binary mode; the log regex is a str pattern,
        # so the file is opened in text mode to yield str lines.
        log_file = gzip.open(target_file, 'rt')
    else:
        log_file = open(target_file)

    result = {}
    # The with-statement closes the file even if parsing raises.
    with log_file:
        for line in log_file:
            try:
                log_date, host, useragent = parse(line)
            except NotMatchError:
                continue
            if log_date not in result:
                result[log_date] = DateLog(log_date)
            result[log_date].add_log(host, useragent)

    print('{}: end:{}'.format(target_file,
                              datetime.now().strftime('%H:%M:%S')))
    return list(result.items())
def count_client(date_log):
    """Reduce step: count the distinct clients recorded for one log date.

    ``date_log`` is a ``(log_date, logs)`` pair in which every element of
    ``logs`` has a ``client`` set.  Returns ``(log_date, count)`` where
    ``count`` is the size of the union of those sets; an empty ``logs``
    gives a count of 0.
    """
    log_date, day_logs = date_log[0], date_log[1]
    # set().union needs no import (the bare ``reduce`` builtin does not
    # exist on Python 3) and also copes with an empty list.
    distinct_clients = set().union(*(log.client for log in day_logs))
    return log_date, len(distinct_clients)
if __name__ == '__main__':
    # Count distinct clients per day over every matching .gz log and print
    # one "date<TAB>count" line per day followed by the grand total.
    import glob

    # NOTE(review): glob.glob is called without recursive=True, so '**'
    # presumably behaves like '*' (one directory level) - confirm intended.
    log_paths = glob.glob('./**/*.gz')
    mapper = SimpleMapReduce(parse_access_log, count_client)
    daily_counts = sorted(mapper(log_paths), key=operator.itemgetter(0))

    for log_date, client_count in daily_counts:
        print('{0}\t{1}'.format(log_date, client_count))
    total = sum(client_count for _, client_count in daily_counts)
    print('TOTAL:{0}'.format(total))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment