Created
December 10, 2012 08:35
-
-
Save ivanistheone/4249353 to your computer and use it in GitHub Desktop.
Parsing access log and collecting sessions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import re | |
from collections import defaultdict, namedtuple | |
import datetime | |
# Read the whole access log into memory up front.  The context manager closes
# the file handle as soon as the lines are loaded, even if reading fails.
# `logfiles_list` is a list of logs, each log being a list of raw lines.
with open("access.log") as f:
    logfiles_list = [f.readlines()]
# STEP 1: parse logs
########################################################################
# Pattern for one access-log line laid out as:
#   host identity user [time] "request" status bytes "referer" "user_agent"
# Each field is captured in a named group of the same name.
format_pat = re.compile(
    # The host is any run of non-blank characters, so IPv6 addresses and
    # resolved hostnames are accepted as well as dotted IPv4 addresses.
    r"(?P<host>\S+)\s"
    r"(?P<identity>\S*)\s"
    r"(?P<user>\S*)\s"
    r"\[(?P<time>.*?)\]\s"
    r'"(?P<request>.*?)"\s'
    r"(?P<status>\d+)\s"
    r"(?P<bytes>\S*)\s"
    r'"(?P<referer>.*?)"\s'  # [SIC]
    r'"(?P<user_agent>.*?)"\s*'
)
# Record type for one parsed log line.  The field names are exactly the
# named groups of `format_pat`, so a match's groupdict() can be splatted
# straight into the constructor.
Access = namedtuple(
    'Access',
    (
        'host',
        'identity',
        'user',
        'time',
        'request',
        'status',
        'bytes',
        'referer',
        'user_agent',
    ),
)
def access_iter(source_iter):
    """Yield an ``Access`` record for every line that ``format_pat`` matches.

    ``source_iter`` is an iterable of logs, each of which is itself an
    iterable of text lines.  Trailing whitespace is stripped from a line
    before it is matched; lines the pattern does not match are skipped
    without raising.
    """
    for log in source_iter:
        for raw_line in log:
            parsed = format_pat.match(raw_line.rstrip())
            if parsed is None:
                continue
            yield Access(**parsed.groupdict())
# `access_iter` is a generator function, so nothing is parsed here yet; the
# log lines are matched lazily as `parsed_entries` is consumed.
parsed_entries = access_iter(logfiles_list)
# STEP 2: normalize data
########################################################################
# Three-letter month abbreviations, as they appear in the log timestamps,
# mapped to their calendar month number (1-12).
month_map = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
        1,
    )
}
def cleanup_time(s):
    """Turn a log timestamp such as ``21/Nov/2012:08:07:15 -0800`` into a datetime.

    The fields are read from fixed character positions and the month
    abbreviation is looked up in ``month_map``.  Nothing past the seconds is
    read, so the trailing UTC offset is ignored and the result is a naive
    ``datetime.datetime``.
    """
    # Convert in the same order as the constructor arguments:
    # year, month, day, hour, minute, second.
    year = int(s[7:11])
    month = month_map[s[3:6]]
    day = int(s[0:2])
    hour = int(s[12:14])
    minute = int(s[15:17])
    second = int(s[18:20])
    return datetime.datetime(year, month, day, hour, minute, second)
def cleanup_entry(res):
    """Build a dict of normalized values from one parsed log record.

    ``res`` must expose the string attributes ``host``, ``identity``,
    ``user``, ``time``, ``request``, ``status``, ``bytes``, ``referer`` and
    ``user_agent`` (for example an ``Access`` namedtuple).

    Returns a dict with those same nine keys, where:

    * ``user`` and ``referer`` are ``None`` when the log holds ``"-"``,
      otherwise the original string;
    * ``status`` is converted with ``int``;
    * ``bytes`` is converted with ``int``, with ``"-"`` counted as ``0``;
    * ``time`` is the result of ``cleanup_time`` on the raw timestamp;
    * ``host``, ``identity``, ``request`` and ``user_agent`` are copied
      unchanged.

    Raises ``ValueError`` if ``status`` or a non-``"-"`` ``bytes`` value is
    not an integer string.
    """
    return {
        # Always emit the "user" key: previously it was only set for "-",
        # so an authenticated user name was dropped and the key was missing.
        "user": None if res.user == "-" else res.user,
        "status": int(res.status),
        "bytes": 0 if res.bytes == "-" else int(res.bytes),
        "host": res.host,
        "identity": res.identity,
        "time": cleanup_time(res.time),
        "request": res.request,
        "referer": None if res.referer == "-" else res.referer,
        "user_agent": res.user_agent,
    }
# Drain the parsed records and normalize each one into a plain dict.
entries = list(map(cleanup_entry, parsed_entries))
#### EXAMPLE INPUT
# In [29]: f.readlines()[10]
# Out[29]: '66.249.76.98 - - [21/Nov/2012:08:07:15 -0800] "GET /_media/indexmenu/math HTTP/1.1" 404 40 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"\n'
#### EXAMPLE OUTPUT
# In [22]: entries[10]
# Out[22]:
# {'bytes': 40,
#  'host': '66.249.76.98',
#  'identity': '-',
#  'referer': None,
#  'request': 'GET /_media/indexmenu/math HTTP/1.1',
#  'status': 404,
#  'time': datetime.datetime(2012, 11, 21, 8, 7, 15),
#  'user': None,
#  'user_agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment