Skip to content

Instantly share code, notes, and snippets.

@tsuyukimakoto
Created February 13, 2016 06:40
Show Gist options
  • Save tsuyukimakoto/1c80d396b59b7bb7a69b to your computer and use it in GitHub Desktop.
Save tsuyukimakoto/1c80d396b59b7bb7a69b to your computer and use it in GitHub Desktop.
忘れないようにメモだけ
import os
import re
from datetime import datetime, timedelta
from optparse import OptionParser
import operator
import collections
import itertools
import multiprocessing
class SimpleMapReduce(object):
    """Run a map step and a reduce step on a multiprocessing worker pool.

    ``map_func`` is called once per input and must return an iterable of
    ``(key, value)`` pairs. ``reduce_func`` is called once per distinct key
    with a ``(key, [values...])`` tuple.
    """

    def __init__(self, map_func, reduce_func, num_workers=None):
        """Store both callables and create the pool.

        ``num_workers`` is handed to ``multiprocessing.Pool`` unchanged.
        """
        self.map_func = map_func
        self.reduce_func = reduce_func
        self.pool = multiprocessing.Pool(num_workers)

    def partition(self, mapped_values):
        """Group ``(key, value)`` pairs by key.

        Returns the ``items()`` of a mapping from each key to the list of
        values seen for it, in encounter order.
        """
        grouped = collections.defaultdict(list)
        for pair in mapped_values:
            key, value = pair
            grouped[key].append(value)
        return grouped.items()

    def __call__(self, inputs, chunksize=1):
        """Map every input, group the emitted pairs, then reduce each group.

        ``chunksize`` is forwarded to the map-phase ``pool.map`` call only.
        Returns the list of ``reduce_func`` results.
        """
        mapped = self.pool.map(self.map_func, inputs, chunksize=chunksize)
        # Each map call returned its own iterable of pairs; flatten them
        # into one stream before grouping.
        flattened = itertools.chain.from_iterable(mapped)
        groups = self.partition(flattened)
        return self.pool.map(self.reduce_func, groups)
# Three-letter month abbreviation, as it appears in an access-log timestamp,
# mapped to its zero-padded two-digit month number.
MONTHS = {
    'Jan': '01',
    'Feb': '02',
    'Mar': '03',
    'Apr': '04',
    'May': '05',
    'Jun': '06',
    'Jul': '07',
    'Aug': '08',
    'Sep': '09',
    'Oct': '10',
    'Nov': '11',
    'Dec': '12',
}


def parse_date(date):
    """Convert a ``DD/Mon/YYYY...`` timestamp into a ``YYYYMMDD`` string.

    Only the first eleven characters are read: day at ``[0:2]``, month
    abbreviation at ``[3:6]`` and year at ``[7:11]``. Anything after that
    (time of day, UTC offset) is ignored.

    Raises:
        KeyError: if the month abbreviation is not a key of ``MONTHS``.
    """
    # The lookup table is named MONTHS; the previous reference to ``MONTH``
    # raised NameError on every call.
    return ''.join((date[7:11], MONTHS[date[3:6]], date[0:2]))
# Regex fragments for one access-log line, one fragment per field, in the
# order the fields appear on the line. Each named group becomes a key of the
# match's groupdict(); the unnamed fragment is matched but not captured.
PARTS = (
r'(?P<host>\S+)', # host %h
r'\S+', # ident %l (unused, so not captured)
r'(?P<user>\S+)', # user %u
r'\[(?P<time>.+)\]', # time %t, wrapped in square brackets
r'"(?P<request>.+)"', # request "%r", wrapped in double quotes
r'(?P<status>[0-9]+)', # status %>s
r'(?P<size>\S+)', # size %b (careful, can be '-')
r'"(?P<referer>.*)"', # referer "%{Referer}i" (may be empty)
r'"(?P<agent>.*)"', # user agent "%{User-agent}i" (may be empty)
)
# Whole-line pattern: the fragments above joined by runs of whitespace,
# followed by optional trailing whitespace and anchored at end of string.
LOG_PATTERN_RE = re.compile(r'\s+'.join(PARTS) + r'\s*\Z')
# Alternatives searched for inside the request field; parse() only returns
# lines whose request contains one of them.
AIR_RE = re.compile(r'spam/egg1|ham/egg2|everes/egg3')
class NotMatchError(Exception):
    """Raised by parse() for a log line that should be skipped.

    parse_access_log() catches it and moves on to the next line.
    """
class DateLog(object):
    """Set of distinct clients recorded for a single log date.

    A client is identified by the string ``'<host>-<useragent>'``.
    """

    __slots__ = ('log_date', 'client')

    def __init__(self, log_date):
        """Start with an empty client set for ``log_date``."""
        self.log_date = log_date
        self.client = set()

    def add_log(self, host, useragent):
        """Record one request; repeated host/agent pairs collapse to one."""
        client_key = '{0}-{1}'.format(host, useragent)
        self.client.add(client_key)

    def count_client(self):
        """Return how many distinct clients have been recorded."""
        return len(self.client)
def parse(line):
    """Extract ``(log_date, host, agent)`` from one access-log line.

    The line must match ``LOG_PATTERN_RE`` and its request field must
    contain one of the ``AIR_RE`` alternatives.

    Raises:
        NotMatchError: if either of those two checks fails.
    """
    matched = LOG_PATTERN_RE.match(line)
    if not matched:
        raise NotMatchError()

    fields = matched.groupdict()
    # The date is converted before the request filter runs, so a malformed
    # timestamp surfaces as parse_date's own exception.
    log_date = parse_date(fields.get('time', '01/Jan/0001:00:00:00 +0900'))

    if AIR_RE.search(fields.get('request')) is None:
        raise NotMatchError()
    return (log_date, fields.get('host'), fields.get('agent'))
def parse_access_log(target_file):
    """Collect the distinct clients per day found in one access-log file.

    Every line is passed to ``parse()``; lines it rejects with
    ``NotMatchError`` are skipped. Accepted lines are added to a ``DateLog``
    keyed by their log date. Start and end wall-clock times are printed to
    stdout.

    Args:
        target_file: path of the log file. A ``.gz`` extension selects gzip
            decompression; anything else is opened as plain text.

    Returns:
        A list of ``(log_date, DateLog)`` tuples, one per date seen.
    """
    # %M is minutes; the previous '%H:%m:%S' printed the month instead.
    print('{}: start:{}'.format(target_file,
                                datetime.now().strftime('%H:%M:%S')))

    if os.path.splitext(target_file)[1] == '.gz':
        import gzip
        # Open in text mode so lines are str, like the plain open() branch;
        # gzip.open defaults to binary, which the str regex cannot match.
        log_file = gzip.open(target_file, 'rt')
    else:
        log_file = open(target_file)

    result = {}
    # The with-block closes the handle even if parsing raises.
    with log_file:
        for line in log_file:
            try:
                log_date, host, useragent = parse(line)
            except NotMatchError:
                # Not a line we are counting; move on.
                continue
            if log_date not in result:
                result[log_date] = DateLog(log_date)
            result[log_date].add_log(host, useragent)

    print('{}: end:{}'.format(target_file,
                              datetime.now().strftime('%H:%M:%S')))
    return list(result.items())
def count_client(date_log):
    """Count the distinct clients across all DateLogs of one date.

    Args:
        date_log: a ``(log_date, [DateLog, ...])`` tuple as produced by
            ``SimpleMapReduce.partition``.

    Returns:
        ``(log_date, n)`` where ``n`` is the size of the union of every
        ``client`` set in the list (0 for an empty list).
    """
    log_date, logs = date_log[0], date_log[1]
    # set().union(*sets) needs no ``reduce`` (not a builtin on Python 3)
    # and, unlike reduce without an initial value, accepts an empty list.
    distinct = set().union(*(log.client for log in logs))
    return log_date, len(distinct)
if __name__ == '__main__':
    import glob

    # Map: parse each .gz log matched by the pattern in a worker process.
    # Reduce: count the distinct clients per date.
    gz_logs = glob.glob('./**/*.gz')
    run_job = SimpleMapReduce(parse_access_log, count_client)
    per_day = sorted(run_job(gz_logs), key=operator.itemgetter(0))

    # One "date<TAB>count" line per day in date order, then the grand total.
    for day, clients in per_day:
        print('{0}\t{1}'.format(day, clients))
    print('TOTAL:{0}'.format(sum(clients for _, clients in per_day)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment