funnelcake log processing
Gist oremj/245647, created November 30, 2009
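The report script below reads a mirror access log on stdin (parse_mirror_log defaults to sys.stdin), so a run presumably looks something like: zcat access_log.gz | python funnelcake_report.py (both file names here are hypothetical; the gist does not name them). It writes three CSVs to /tmp, one per grouping.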
#!/usr/bin/python
import csv

import funnelcake_lib

# Parse the mirror log on stdin, keeping only 200/206 responses for
# en-US builds on the three desktop platforms.
data_dict = funnelcake_lib.parse_mirror_log(('200', '206'), ('linux-i686', 'win32', 'mac'), ('en-us',))

def run_report(data_key, extra_headers, filename):
    csv_writer = csv.writer(open(filename, "wb"))
    csv_writer.writerow(extra_headers + funnelcake_lib.stats_len_funcs + funnelcake_lib.stats_funcs)
    for k in data_dict[data_key]:
        # Skip groups with fewer than two requests or fewer than two
        # completed downloads; they yield no meaningful stats.
        if len(data_dict[data_key][k]) < 2:
            continue
        analyzer = funnelcake_lib.FunnelCakeReports(data_dict[data_key][k])
        if len(analyzer.total_complete()) < 2:
            continue
        row = []
        row += k
        row += [ "%s" % (len(f()),) for f in analyzer.len_reports() ]
        row += [ "%s" % (f(),) for f in analyzer.reports() ]
        csv_writer.writerow(row)

run_report('url', ['os', 'locale', 'product'], "/tmp/funnelcake_mirror.csv")
run_report('geo', ['geo', 'product'], "/tmp/funnelcake_mirror_geo.csv")
run_report('browser', ['browser', 'locale', 'product'], "/tmp/funnelcake_mirror_browser.csv")
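Each row of the resulting CSVs is the group key followed by the counts and stats named in the header row. For the geo report, for example, the header comes out as: geo,product,total_complete,total_incomplete,total_incomplete_uniq_cookie_200,total_uniq_cookie,total_uniq_ip,total_206_requests,total_uniq_cookie_206,total_ie,total_firefox,total_safari,total_opera,total_other_browser,total_possible_extra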
funnelcake_lib.py (the module imported above):
import sys
import re

import GeoIP

# Method names on FunnelCakeReports whose list results are reported as counts.
stats_len_funcs = ['total_complete', 'total_incomplete', 'total_incomplete_uniq_cookie_200', 'total_uniq_cookie',
                   'total_uniq_ip', 'total_206_requests', 'total_uniq_cookie_206',
                   'total_ie', 'total_firefox', 'total_safari', 'total_opera', 'total_other_browser']
# Method names whose scalar results are reported directly.
stats_funcs = ['total_possible_extra']

# The same split for FunnelCakeMozComReports.
mozcom_len_funcs = ['total_complete', 'total_uniq_ip_ua', 'total_ie',
                    'total_firefox', 'total_safari', 'total_opera',
                    'total_other_browser']
mozcom_funcs = []
def detect_browser(ua):
    # Patterns are checked in a fixed order so a UA matching more than one
    # of them is classified deterministically.
    browser_re = [('Opera', re.compile('^Opera')),
                  ('IE', re.compile('MSIE')),
                  ('Firefox', re.compile('Firefox')),
                  ('Safari', re.compile('Safari'))]
    for b, r in browser_re:
        if r.search(ua):
            return b
    return "Other"
class FunnelCakeReports:
    def __init__(self, url_list, BYTE_LIMIT = 100000):
        self.url_list = url_list
        # A 200 response counts as a complete download if bytes_sent came
        # within BYTE_LIMIT bytes of the full response size.
        self.BYTE_LIMIT = BYTE_LIMIT
    def len_reports(self):
        return [ getattr(self, f) for f in stats_len_funcs ]
    def reports(self):
        return [ getattr(self, f) for f in stats_funcs ]
    def total_complete(self):
        # 200s where (nearly) the whole file was sent.
        return [ i for i in self.url_list if i['status'] == '200' and int(i['bytes_sent']) >= int(i['resp_bytes']) - self.BYTE_LIMIT ]
    def total_incomplete(self):
        # Range requests, plus 200s that were cut off early.
        return [ i for i in self.url_list if i['status'] == '206' or (i['status'] == '200' and int(i['bytes_sent']) < int(i['resp_bytes']) - self.BYTE_LIMIT) ]
    def total_incomplete_200(self):
        return [ i for i in self.url_list if i['status'] == '200' and int(i['bytes_sent']) < int(i['resp_bytes']) - self.BYTE_LIMIT ]
    def total_incomplete_uniq_cookie_200(self):
        return self.total_uniq_cookie('total_incomplete_200')
    # The per-browser breakdowns below count completed downloads only.
    def total_ie(self):
        url_list = self.total_complete()
        re_browser = re.compile('MSIE')
        return [ i for i in url_list if re_browser.search(i['ua']) ]
    def total_firefox(self):
        url_list = self.total_complete()
        re_browser = re.compile('Firefox')
        return [ i for i in url_list if re_browser.search(i['ua']) ]
    def total_safari(self):
        url_list = self.total_complete()
        re_browser = re.compile('Safari')
        return [ i for i in url_list if re_browser.search(i['ua']) ]
    def total_opera(self):
        url_list = self.total_complete()
        re_browser = re.compile('^Opera')
        return [ i for i in url_list if re_browser.search(i['ua']) ]
    def total_other_browser(self):
        url_list = self.total_complete()
        re_browser = re.compile('(^Opera|Firefox|MSIE|Safari)')
        return [ i for i in url_list if not re_browser.search(i['ua']) ]
    def total_206_requests(self):
        return [ i for i in self.url_list if i['status'] == '206' ]
    def total_uniq_cookie_206(self):
        return self.total_uniq_cookie('total_206_requests')
    def total_uniq_ip_206(self):
        return self.total_uniq_ip('total_206_requests')
    def total_extra_bytes(self):
        # Bytes served on requests that never added up to a complete download.
        return sum([ int(i['bytes_sent']) for i in self.total_incomplete() ])
    def total_possible_extra(self):
        # Heuristic: count cookies whose best truncated 200 plus all of their
        # 206 ranges add up to at least an average complete download, i.e.
        # downloads that may have completed across multiple requests.
        total_complete = self.total_complete()
        try:
            avg_bytes = float(sum([ int(i['bytes_sent']) for i in total_complete ])) / len(total_complete)
        except ZeroDivisionError:
            return 0
        cookie_dict = {}
        for i in self.total_incomplete():
            cookie_dict.setdefault(i['cookie'], {}).setdefault(i['status'], []).append(int(i['bytes_sent']))
        possible_extra = 0
        for k in cookie_dict:
            try:
                if (max(cookie_dict[k]['200']) + sum(cookie_dict[k]['206'])) >= avg_bytes:
                    possible_extra += 1
            except KeyError:
                # Cookie lacks either a truncated 200 or any 206s; skip it.
                continue
        return possible_extra
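    # Worked example (numbers invented): if the average complete download is
    # 10,000,000 bytes, a cookie with a truncated 200 of 6,000,000 bytes plus
    # 206 ranges of 3,000,000 and 2,000,000 bytes totals 11,000,000, so it is
    # counted as one possible extra completed download.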
    def total_uniq_cookie(self, f = 'total_complete'):
        # First request per cookie from the named report's results.
        url_list = getattr(self, f)()
        seen = {}
        tmp_list = []
        for u in url_list:
            if u['cookie'] in seen:
                continue
            seen[u['cookie']] = True
            tmp_list.append(u)
        return tmp_list
    def total_uniq_ip(self, f = 'total_complete'):
        # First request per client IP from the named report's results.
        url_list = getattr(self, f)()
        seen = {}
        tmp_list = []
        for u in url_list:
            if u['ip'] in seen:
                continue
            seen[u['ip']] = True
            tmp_list.append(u)
        return tmp_list
class FunnelCakeMozComReports:
    def __init__(self, url_list):
        self.url_list = url_list
    def len_reports(self):
        return [ getattr(self, f) for f in mozcom_len_funcs ]
    def reports(self):
        return [ getattr(self, f) for f in mozcom_funcs ]
    def total_complete(self):
        # Every logged hit counts; page views have no notion of completeness.
        return list(self.url_list)
    def total_ie(self):
        return [ i for i in self.total_complete() if i['browser'] == 'IE' ]
    def total_firefox(self):
        return [ i for i in self.total_complete() if i['browser'] == 'Firefox' ]
    def total_safari(self):
        return [ i for i in self.total_complete() if i['browser'] == 'Safari' ]
    def total_opera(self):
        return [ i for i in self.total_complete() if i['browser'] == 'Opera' ]
    def total_other_browser(self):
        return [ i for i in self.total_complete() if i['browser'] == 'Other' ]
    def total_uniq_ip_ua(self, f = 'total_complete'):
        # First request per (IP, user agent) pair.
        url_list = getattr(self, f)()
        seen = {}
        tmp_list = []
        for u in url_list:
            if (u['ip'], u['ua']) in seen:
                continue
            seen[(u['ip'], u['ua'])] = True
            tmp_list.append(u)
        return tmp_list
def parse_mirror_log(valid_status, valid_os, valid_locale, log_file = sys.stdin):
    gi = GeoIP.new(GeoIP.GEOIP_MEMORY_CACHE)
    # Index each matching request three ways: by (os, locale, product),
    # by (country, product) and by (browser, locale, product).
    data_dict = { 'url': {}, 'geo': {}, 'browser': {} }
    log_re = re.compile(r'(?P<ip>\d{1,3}(?:\.\d{1,3}){3}) \S+ \S+ \[(?P<dt>\d\d/\w{3}/\d{4}:\d\d:\d\d:\d\d) -\d{4}\] "GET (?P<url>\S+) HTTP/..." (?P<status>\d+) (?P<resp_bytes>\S+) "(?P<referer>[^"]*)" "(?P<ua>[^"]+)" (?P<time_taken>\S+) (?P<bytes_sent>\S+) (?P<cookie>\S+)')
    for l in log_file:
        m = log_re.search(l)
        if m and m.group('status') in valid_status:
            try:
                # The download URL is expected to end in .../<os>/<locale>/<product>.
                os, locale, product = [ i.lower() for i in re.split('/+', m.group('url'))[6:] ]
            except ValueError:
                continue
            data = m.groupdict()
            data['browser'] = detect_browser(data['ua'])
            data['geo'] = gi.country_code_by_addr(data['ip'])
            if locale in valid_locale and os in valid_os:
                data_dict['url'].setdefault((os, locale, product), []).append(data)
                data_dict['geo'].setdefault((data['geo'], product), []).append(data)
                data_dict['browser'].setdefault((data['browser'], locale, product), []).append(data)
    return data_dict
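# A line the mirror-log pattern above expects looks roughly like this
# (all values invented for illustration):
#   192.0.2.10 - - [30/Nov/2009:12:34:56 -0800] "GET /pub/mozilla.org/firefox/releases/3.5.5/win32/en-us/Firefox%20Setup%203.5.5.exe HTTP/1.1" 200 7879538 "-" "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20091102 Firefox/3.5.5" 12 7879538 MOZCOOKIE123
# The last three path components supply os/locale/product; lines whose URL
# does not have that shape are dropped by the ValueError handler.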
def parse_mozcom_log(valid_status, valid_url_re, log_file = sys.stdin):
    data_dict = { 'url': {} }
    # Same format as the mirror log, minus time_taken/bytes_sent; the
    # trailing cookie may be absent.
    log_re = re.compile(r'(?P<ip>\d{1,3}(?:\.\d{1,3}){3}) \S+ \S+ \[(?P<dt>\d\d/\w{3}/\d{4}:\d\d:\d\d:\d\d) -\d{4}\] "GET (?P<url>\S+) HTTP/..." (?P<status>\d+) (?P<resp_bytes>\S+) "(?P<referer>[^"]*)" "(?P<ua>[^"]+)" (?P<cookie>\S+)?')
    for l in log_file:
        m = log_re.search(l)
        if not m:
            continue
        if not valid_url_re.search(m.group('url')):
            continue
        if m.group('status') in valid_status:
            data = m.groupdict()
            # Classify the UA here so the per-browser breakdowns in
            # FunnelCakeMozComReports find the 'browser' key they expect.
            data['browser'] = detect_browser(data['ua'])
            data_dict['url'].setdefault(m.group('url'), []).append(data)
    return data_dict
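# A minimal sketch of how the mozilla.com half would be driven (the URL
# filter and log source here are assumptions, not part of this gist):
#
#   import re, sys
#   import funnelcake_lib
#   data = funnelcake_lib.parse_mozcom_log(('200',), re.compile(r'/firefox/'), sys.stdin)
#   for url, hits in data['url'].iteritems():
#       analyzer = funnelcake_lib.FunnelCakeMozComReports(hits)
#       print url, [ len(f()) for f in analyzer.len_reports() ]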