Created
December 10, 2012 09:57
-
-
Save ivanistheone/4249678 to your computer and use it in GitHub Desktop.
Semi-finished version of visitor script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from collections import defaultdict, namedtuple | |
import datetime | |
from operator import itemgetter | |
import re | |
import requests | |
import json | |
import sys | |
# SERVER SIDE | |
######################################################################## | |
#mv /var/log/nginx/mr.access* | |
#to /home/ivan/logs/miniref/ ---> adjust filenames, skip dups & set perms to ivan-readable | |
# STEP 0 get logs | |
######################################################################## | |
# rsync the logs to a local directory
# and get the list of logs from the last N days (default N=5)
# Path of the access log to analyse: the first command-line argument when
# one is given, otherwise None (meaning "use the default local copy").
filename = None
if len(sys.argv) > 1:
    filename = sys.argv[1]

# Read the whole log up front inside a ``with`` block so the file handle is
# closed as soon as the lines are in memory instead of being leaked for the
# lifetime of the script.  A missing or empty argument falls back to the
# default path.
with open(filename or "logs/access.log") as f:
    # One list of lines per log file; the parsing step iterates over this
    # outer list, so more logs can be appended later.
    logfiles_list = [f.readlines()]
# STEP 1 parse logs | |
######################################################################## | |
# Fragments of the regular expression for one access-log line, listed in the
# order the fields appear on the line: host, identity, user, [time],
# "request", status, bytes, "referer", "user_agent".  Each fragment captures
# one named group and consumes the whitespace that follows it.
_LOG_LINE_FRAGMENTS = (
    r"(?P<host>[\d\.]+)\s",
    r"(?P<identity>\S*)\s",
    r"(?P<user>\S*)\s",
    r"\[(?P<time>.*?)\]\s",
    r'"(?P<request>.*?)"\s',
    r"(?P<status>\d+)\s",
    r"(?P<bytes>\S*)\s",
    r'"(?P<referer>.*?)"\s',  # [SIC]
    r'"(?P<user_agent>.*?)"\s*',
)

# Compiled once at import time; used with .match() so it anchors at the
# start of each line.
format_pat = re.compile("".join(_LOG_LINE_FRAGMENTS))
# Record type for one parsed log line.  The field names are the same as the
# named groups captured by the log-line regular expression, so a match's
# groupdict() can be unpacked straight into it.
Access = namedtuple(
    "Access",
    [
        "host",
        "identity",
        "user",
        "time",
        "request",
        "status",
        "bytes",
        "referer",
        "user_agent",
    ],
)
def access_iter(source_iter):
    """Yield an ``Access`` record for every parsable line of the given logs.

    ``source_iter`` is an iterable of logs, each log being an iterable of
    text lines.  Trailing whitespace is stripped from a line before it is
    matched against ``format_pat``; lines that do not match are skipped
    without any report.
    """
    for log_lines in source_iter:
        for raw_line in log_lines:
            found = format_pat.match(raw_line.rstrip())
            if not found:
                continue
            # The pattern's group names are exactly the Access field names.
            yield Access(**found.groupdict())
# access_iter is a generator function, so nothing is parsed here; the log
# lines are only matched once parsed_entries is iterated.
parsed_entries = access_iter(logfiles_list)
# STEP 2 normalize data | |
######################################################################## | |
# Three-letter month abbreviations, as they appear in log timestamps, mapped
# to their month numbers (1-12).
month_map = {
    abbrev: number
    for number, abbrev in enumerate(
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
        1,
    )
}


def cleanup_time(s):
    """Convert an apache log timestamp to a datetime object.

    The fields are read from fixed character positions of a string shaped
    like ``10/Dec/2012:09:57:01 +0000``.  Everything after the seconds is
    ignored, so the returned datetime is naive.

    Raises ``KeyError`` for an unknown month abbreviation and ``ValueError``
    for a non-numeric or out-of-range field.
    """
    # Converted in constructor-argument order (year, month, day, then the
    # time of day) so a malformed string fails on the same field as before.
    year = int(s[7:11])
    month = month_map[s[3:6]]
    day = int(s[0:2])
    hour = int(s[12:14])
    minute = int(s[15:17])
    second = int(s[18:20])
    return datetime.datetime(year, month, day, hour, minute, second)
def cleanup_entry(res):
    """Turn a parsed ``Access`` record into a dict with normalized values.

    Normalizations applied:

    * ``user`` and ``referer``: the log placeholder ``"-"`` becomes ``None``;
      any other value is kept as-is.
    * ``status``: converted to ``int``.
    * ``bytes``: converted to ``int``, with the placeholder ``"-"`` mapped
      to ``0``.
    * ``time``: converted to a ``datetime`` via ``cleanup_time``.
    * ``host``, ``identity``, ``request`` and ``user_agent`` are copied
      unchanged.

    The returned dict always contains all nine keys.  (Previously the
    ``"user"`` key was only written when the user was ``"-"``, so entries
    for authenticated requests had no ``"user"`` key at all.)

    Raises ``ValueError`` if ``status`` or ``bytes`` is not numeric, plus
    whatever ``cleanup_time`` raises for a malformed timestamp.
    """
    return {
        "user": None if res.user == "-" else res.user,
        "status": int(res.status),
        "bytes": 0 if res.bytes == "-" else int(res.bytes),
        "host": res.host,
        "identity": res.identity,
        "time": cleanup_time(res.time),
        "request": res.request,
        "referer": None if res.referer == "-" else res.referer,
        "user_agent": res.user_agent,
    }
# Normalize every parsed record (this is where parsed_entries is actually
# consumed), then order the resulting dicts by their "time" value.
entries = list(map(cleanup_entry, parsed_entries))
sorted_entries = sorted(entries, key=lambda entry: entry["time"])
# STEP 3 Organize into sessions | |
######################################################################## | |
# User-Agent substrings of the bots whose requests are left out of the report.
_BOT_MARKERS = ("bingbot", "Baiduspider", "WBSearchBot", "Googlebot")

# Group the remaining requests by client host.  sorted_entries is already in
# time order, so each visitor's list is in time order too.
visitors = defaultdict(list)
for req in sorted_entries:
    ua = req["user_agent"]
    if any(marker in ua for marker in _BOT_MARKERS):
        continue
    visitors[req["host"]].append(req)

# (host, requests) pairs ordered by the time of each visitor's first request.
sorted_visitors = sorted(
    visitors.items(),
    key=lambda pair: pair[1][0]["time"],
)
# STEP 4 Print each request in session
######################################################################## | |
def _geolocate(ip):
    """Return ``(city, country)`` strings for *ip* from hostip.info.

    Placeholder strings are returned when the request fails or times out,
    when the body is not valid JSON, or when a field is missing or empty, so
    that one failed lookup does not abort the whole report.
    """
    unknown_city, unknown_country = "unknown city", "unknown country"
    try:
        # The address must be sent as the ``ip`` query parameter; a bare
        # "?1.2.3.4" query string is not a parameter the API recognises.
        response = requests.get(
            "http://api.hostip.info/get_json.php",
            params={"ip": ip},
            timeout=10,
        )
        # requests' Response has no .read(); decode the text body instead.
        info = json.loads(response.text)
    except (requests.exceptions.RequestException, ValueError):
        return unknown_city, unknown_country
    if not isinstance(info, dict):
        return unknown_city, unknown_country
    return (
        info.get("city") or unknown_city,
        info.get("country_name") or unknown_country,
    )


# One section per visitor: a two-line header (address + first-seen time, then
# location + user agent), followed by each request with the time spent before
# the next one.  Every print() takes a single pre-built string so the output
# is the same under Python 2 and Python 3.
for vis_ip, vis_list in sorted_visitors:
    first = vis_list[0]

    # collect geolocation info
    city, country = _geolocate(vis_ip)

    print(vis_ip + " " + str(first["time"]))
    print("from " + city + ", " + country + " " + first["user_agent"])

    old_time = first["time"]
    current_req = first["request"]
    for req in vis_list[1:]:
        # A request's duration is only known once the next request arrives,
        # so each iteration prints the *previous* request.
        print(" ".join([" ", current_req, str(req["time"] - old_time)]))
        old_time = req["time"]
        current_req = req["request"]
    # The last request has no successor to measure against.
    print(" ".join([" ", current_req, " end", "\n"]))
# TODO, skip media links & blog links in general |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment