Skip to content

Instantly share code, notes, and snippets.

@oremj
Created November 30, 2009 18:59
Show Gist options
  • Save oremj/245647 to your computer and use it in GitHub Desktop.
funnelcake log processing
#!/usr/bin/python
# Driver script: parse a Mozilla mirror download log (read from stdin by
# funnelcake_lib.parse_mirror_log) and emit per-URL / per-geo / per-browser
# CSV reports under /tmp.
import sys
import csv
import funnelcake_lib
# Parse the log once up front, keeping only 200/206 responses for the three
# desktop platforms and the en-US locale. Groupings: 'url', 'geo', 'browser'.
data_dict = funnelcake_lib.parse_mirror_log(('200', '206'), ('linux-i686', 'win32', 'mac'), ('en-us', ))
def run_report(data_key, extra_headers, filename):
    """Write one CSV report for a single grouping of the parsed log data.

    data_key: which grouping of data_dict to report on ('url', 'geo' or
        'browser').
    extra_headers: leading header columns naming the grouping-key parts.
    filename: path of the CSV file to (over)write.

    Reads the module-level data_dict; no 'global' statement is needed for
    read-only access (the original declared one anyway).
    """
    # "wb" is the Python 2 csv convention; under Python 3 this would need
    # open(filename, "w", newline="") instead -- NOTE(review): confirm runtime.
    # The with-statement fixes the original's unclosed file handle.
    with open(filename, "wb") as out:
        csv_writer = csv.writer(out)
        csv_writer.writerow(extra_headers + funnelcake_lib.stats_len_funcs + funnelcake_lib.stats_funcs)
        for k in data_dict[data_key]:
            entries = data_dict[data_key][k]
            # Skip groups too small to be statistically interesting.
            if len(entries) < 2:
                continue
            analyzer = funnelcake_lib.FunnelCakeReports(entries)
            if len(analyzer.total_complete()) < 2:
                continue
            row = []
            row += k  # grouping-key tuple, e.g. (os, locale, product)
            row += ["%s" % (len(f()),) for f in analyzer.len_reports()]
            row += ["%s" % (f(),) for f in analyzer.reports()]
            csv_writer.writerow(row)
# Emit one CSV per grouping dimension parsed from the mirror log.
for report_key, header_cols, out_path in (
    ('url', ['os', 'locale', 'product'], "/tmp/funnelcake_mirror.csv"),
    ('geo', ['geo', 'product'], "/tmp/funnelcake_mirror_geo.csv"),
    ('browser', ['browser', 'locale', 'product'], "/tmp/funnelcake_mirror_browser.csv"),
):
    run_report(report_key, header_cols, out_path)
import sys
import re
import math
import GeoIP
# FunnelCakeReports method names whose list results are reported by length.
# Order matters: it is the CSV column order produced by run_report().
stats_len_funcs = [
    'total_complete',
    'total_incomplete',
    'total_incomplete_uniq_cookie_200',
    'total_uniq_cookie',
    'total_uniq_ip',
    'total_206_requests',
    'total_uniq_cookie_206',
    'total_ie',
    'total_firefox',
    'total_safari',
    'total_opera',
    'total_other_browser',
]

# FunnelCakeReports method names whose results are reported as-is.
stats_funcs = ['total_possible_extra']

# Same two lists for FunnelCakeMozComReports.
mozcom_len_funcs = [
    'total_complete',
    'total_uniq_ip_ua',
    'total_ie',
    'total_firefox',
    'total_safari',
    'total_opera',
    'total_other_browser',
]

mozcom_funcs = []
# User-agent checks in a fixed order, compiled once at import time.
# The original built a dict and iterated it with iteritems() on every call:
# Python-2-only, recompiled four regexes per call, and (in py2) classified
# multi-matching UAs nondeterministically because dict order was arbitrary.
_BROWSER_PATTERNS = (
    ('IE', re.compile('MSIE')),
    ('Firefox', re.compile('Firefox')),
    ('Safari', re.compile('Safari')),
    ('Opera', re.compile('^Opera')),
)

def detect_browser(ua):
    """Classify a User-Agent string as 'IE', 'Firefox', 'Safari', 'Opera'
    or 'Other' (first pattern that matches wins)."""
    for name, pattern in _BROWSER_PATTERNS:
        if pattern.search(ua):
            return name
    return "Other"
class FunnelCakeReports:
    """Download-funnel statistics over a list of parsed mirror-log entries.

    Each entry is a dict with (at least) the keys 'status', 'bytes_sent',
    'resp_bytes', 'ua', 'cookie' and 'ip', as produced by parse_mirror_log().
    A download counts as "complete" when it returned HTTP 200 and the bytes
    actually sent come within BYTE_LIMIT of the full response size.

    Fixes over the original: user-agent regexes are compiled once instead of
    on every call, the five browser filters share one helper, and the dedup
    helpers use an explicit seen-set instead of a bare ``except:``.
    """

    # Pre-compiled UA patterns shared by the browser breakdown methods.
    _UA_RES = {
        'ie': re.compile('MSIE'),
        'firefox': re.compile('Firefox'),
        'safari': re.compile('Safari'),
        'opera': re.compile('^Opera'),
        'known': re.compile('(^Opera|Firefox|MSIE|Safari)'),
    }

    def __init__(self, url_list, BYTE_LIMIT = 100000):
        # BYTE_LIMIT: slack (in bytes) still counted as a complete download.
        self.url_list = url_list
        self.BYTE_LIMIT = BYTE_LIMIT

    def len_reports(self):
        """Bound methods whose result lists are reported by length
        (column order defined by the module-level stats_len_funcs)."""
        return [getattr(self, f) for f in stats_len_funcs]

    def reports(self):
        """Bound methods whose results are reported as-is."""
        return [getattr(self, f) for f in stats_funcs]

    def total_complete(self):
        """200 responses that sent (almost) the whole file."""
        return [i for i in self.url_list
                if i['status'] == '200'
                and int(i['bytes_sent']) >= int(i['resp_bytes']) - self.BYTE_LIMIT]

    def total_incomplete(self):
        """206 (partial) responses, plus 200 responses cut short."""
        return [i for i in self.url_list
                if i['status'] == '206'
                or (i['status'] == '200'
                    and int(i['bytes_sent']) < int(i['resp_bytes']) - self.BYTE_LIMIT)]

    def total_incomplete_200(self):
        """200 responses that did NOT send (almost) the whole file."""
        return [i for i in self.url_list
                if i['status'] == '200'
                and int(i['bytes_sent']) < int(i['resp_bytes']) - self.BYTE_LIMIT]

    def total_incomplete_uniq_cookie_200(self):
        return self.total_uniq_cookie('total_incomplete_200')

    def _complete_ua(self, key, negate=False):
        """Complete downloads whose UA does (or, with negate=True, does not)
        match the pre-compiled pattern named *key*."""
        pattern = self._UA_RES[key]
        if negate:
            return [i for i in self.total_complete() if not pattern.search(i['ua'])]
        return [i for i in self.total_complete() if pattern.search(i['ua'])]

    def total_ie(self):
        return self._complete_ua('ie')

    def total_firefox(self):
        return self._complete_ua('firefox')

    def total_safari(self):
        return self._complete_ua('safari')

    def total_opera(self):
        return self._complete_ua('opera')

    def total_other_browser(self):
        """Complete downloads matching none of the known browsers."""
        return self._complete_ua('known', negate=True)

    def total_206_requests(self):
        return [i for i in self.url_list if i['status'] == '206']

    def total_uniq_cookie_206(self):
        return self.total_uniq_cookie('total_206_requests')

    def total_uniq_ip_206(self):
        return self.total_uniq_ip('total_206_requests')

    def total_extra_bytes(self):
        """Bytes served on requests that never became complete downloads."""
        return sum(int(i['bytes_sent']) for i in self.total_incomplete())

    def total_possible_extra(self):
        """Count cookies whose combined incomplete traffic (largest truncated
        200 plus all 206 range requests) reaches at least one average complete
        download -- i.e. users who plausibly finished the download in pieces."""
        total_complete = self.total_complete()
        try:
            avg_bytes = float(sum(int(i['bytes_sent']) for i in total_complete)) / len(total_complete)
        except ZeroDivisionError:
            # No complete download to average against.
            return 0
        # cookie -> status ('200'/'206') -> list of bytes_sent values.
        by_cookie = {}
        for i in self.total_incomplete():
            by_cookie.setdefault(i['cookie'], {}).setdefault(i['status'], []).append(int(i['bytes_sent']))
        possible_extra = 0
        for sizes in by_cookie.values():
            try:
                if max(sizes['200']) + sum(sizes['206']) >= avg_bytes:
                    possible_extra += 1
            except KeyError:
                # Cookie lacks either a truncated 200 or any 206 request.
                continue
        return possible_extra

    def _first_seen(self, f, field):
        """First entry per distinct value of *field* among getattr(self, f)()."""
        seen = set()
        uniq = []
        for entry in getattr(self, f)():
            value = entry[field]
            if value not in seen:
                seen.add(value)
                uniq.append(entry)
        return uniq

    def total_uniq_cookie(self, f = 'total_complete'):
        """Entries from the named report, deduplicated by cookie (first wins)."""
        return self._first_seen(f, 'cookie')

    def total_uniq_ip(self, f = 'total_complete'):
        """Entries from the named report, deduplicated by IP (first wins)."""
        return self._first_seen(f, 'ip')
class FunnelCakeMozComReports:
    """Statistics over mozilla.com log entries: dicts already carrying
    'browser', 'ip' and 'ua' keys.

    Fixes over the original: the five browser filters share one helper, and
    total_uniq_ip_ua uses an explicit seen-set instead of a bare ``except:``.
    """

    def __init__(self, url_list):
        self.url_list = url_list

    def len_reports(self):
        """Bound methods reported by result length (column order defined by
        the module-level mozcom_len_funcs list)."""
        return [getattr(self, f) for f in mozcom_len_funcs]

    def reports(self):
        """Bound methods whose results are reported as-is."""
        return [getattr(self, f) for f in mozcom_funcs]

    def total_complete(self):
        # Every mozilla.com hit counts as complete; return a fresh list so
        # callers cannot mutate url_list through the result.
        return list(self.url_list)

    def _by_browser(self, browser):
        """Complete entries classified as the given browser name."""
        return [i for i in self.total_complete() if i['browser'] == browser]

    def total_ie(self):
        return self._by_browser('IE')

    def total_firefox(self):
        return self._by_browser('Firefox')

    def total_safari(self):
        return self._by_browser('Safari')

    def total_opera(self):
        return self._by_browser('Opera')

    def total_other_browser(self):
        return self._by_browser('Other')

    def total_uniq_ip_ua(self, f = 'total_complete'):
        """Entries from the named report, deduplicated by (ip, ua) pair
        (first occurrence wins)."""
        seen = set()
        uniq = []
        for entry in getattr(self, f)():
            key = (entry['ip'], entry['ua'])
            if key not in seen:
                seen.add(key)
                uniq.append(entry)
        return uniq
def parse_mirror_log(valid_status, valid_os, valid_locale, log_file = sys.stdin):
    """Parse an Apache-style mirror download log into report buckets.

    valid_status: iterable of HTTP status strings to keep (e.g. ('200','206')).
    valid_os / valid_locale: lowercase OS and locale names to keep.
    log_file: iterable of log lines (defaults to stdin).

    Returns a dict with three groupings of the same entry dicts:
      'url'     -> {(os, locale, product): [entry, ...]}
      'geo'     -> {(country_code, product): [entry, ...]}
      'browser' -> {(browser, locale, product): [entry, ...]}

    Improvement over the original: rows are filtered by locale/os BEFORE the
    GeoIP lookup and groupdict construction, so discarded rows cost nothing.
    """
    gi = GeoIP.new(GeoIP.GEOIP_MEMORY_CACHE)
    data_dict = {'url': {}, 'geo': {}, 'browser': {}}
    log_re = re.compile(r'(?P<ip>\d{1,3}(?:\.\d{1,3}){3}) \S+ \S+ \[(?P<dt>\d\d/\w{3}/\d{4}:\d\d:\d\d:\d\d) -\d{4}\] "GET (?P<url>\S+) HTTP/..." (?P<status>\d+) (?P<resp_bytes>\S+) "(?P<referer>[^"]*)" "(?P<ua>[^"]+)" (?P<time_taken>\S+) (?P<bytes_sent>\S+) (?P<cookie>\S+)')
    for line in log_file:
        m = log_re.search(line)
        if not m:
            continue
        if m.group('status') not in valid_status:
            continue
        # URL path components 6 onward are expected to be os/locale/product;
        # any other component count means a non-download URL -- skip it.
        try:
            os_name, locale, product = [i.lower() for i in re.split('/+', m.group('url'))[6:]]
        except ValueError:
            continue
        if locale not in valid_locale or os_name not in valid_os:
            continue
        data = m.groupdict()
        data['browser'] = detect_browser(data['ua'])
        data['geo'] = gi.country_code_by_addr(data['ip'])
        data_dict['url'].setdefault((os_name, locale, product), []).append(data)
        data_dict['geo'].setdefault((data['geo'], product), []).append(data)
        data_dict['browser'].setdefault((data['browser'], locale, product), []).append(data)
    return data_dict
def parse_mozcom_log(valid_status, valid_url_re, log_file = sys.stdin):
    """Parse an Apache-style mozilla.com access log.

    Keeps only GET lines whose URL matches valid_url_re and whose status is
    in valid_status; returns {'url': {url: [entry_dict, ...]}} where each
    entry dict holds the named groups from the log pattern.
    """
    log_re = re.compile(r'(?P<ip>\d{1,3}(?:\.\d{1,3}){3}) \S+ \S+ \[(?P<dt>\d\d/\w{3}/\d{4}:\d\d:\d\d:\d\d) -\d{4}\] "GET (?P<url>\S+) HTTP/..." (?P<status>\d+) (?P<resp_bytes>\S+) "(?P<referer>[^"]*)" "(?P<ua>[^"]+)" (?P<cookie>\S+)?')
    by_url = {}
    for raw_line in log_file:
        match = log_re.search(raw_line)
        if match is None:
            continue
        # Discard URLs outside the requested set and unwanted statuses.
        if not valid_url_re.search(match.group('url')):
            continue
        if match.group('status') not in valid_status:
            continue
        by_url.setdefault(match.group('url'), []).append(match.groupdict())
    return {'url': by_url}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment