A customized script (aka hack) to parse apache server logs and count stuff
## Parse gaggle apache server logs to compile usage stats
## ...with thanks to: https://github.com/lethain/apache-log-parser
## Track number of accesses by IP address, accesses to Java Web Start launch files (.jnlp)
## and subversion access.
import sys
import re
import subprocess
import argparse

class Counter:
    """A dictionary that keeps a counter for each key."""

    def __init__(self):
        self.counts = {}

    def incr(self, key):
        """Increment the counter"""
        if key in self.counts:
            self.counts[key] += 1
        else:
            self.counts[key] = 1
        return self.counts[key]

    def get(self, key):
        """Get the value of the counter for a key or 0 for a key we've never seen before"""
        if key in self.counts:
            return self.counts[key]
        else:
            return 0
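
# Illustrative usage of Counter (not part of the original gist):
#
#     c = Counter()
#     c.incr('1.2.3.4'); c.incr('1.2.3.4')
#     c.get('1.2.3.4')   # -> 2
#     c.get('unseen')    # -> 0
#
# On Python 2.7+, collections.Counter would cover the same bookkeeping.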

# a nasty regex to parse a line of apache server log
log_re = re.compile(r'(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) [-\w\d"\\]+ [-\w\d"\\]+\s+\[(?P<time>.*?)\] "(?P<cmd>.*?) (?P<uri>.*?) HTTP/\d.\d" (?P<status>\d+).*')
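# The pattern expects standard Apache access-log lines; for example, a made-up
# line like the following would match (captured groups: ip, time, cmd, uri, status):
#
#     1.2.3.4 - - [06/Oct/2011:23:27:00 -0700] "GET /svn/gaggle/trunk HTTP/1.1" 200 512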

def read_log(filename, whois_flag, show_ip_cutoff=-1):
    jnlp = 0
    svn = 0
    internal_jnlp = 0
    internal_svn = 0
    ip_counter = Counter()

    with open(filename, 'r') as f:
        for line in f:
            m = log_re.match(line)
            if not m:
                print "???:" + line
                continue
            ip = m.group('ip')
            uri = m.group('uri')
            time = m.group('time')
            status = m.group('status')

            # throw out search engine traffic
            if ip.startswith("66.249.68."): continue  # google
            if ip.startswith("66.249.67."): continue  # google
            if ip.startswith("66.249.72."): continue  # google
            if ip.startswith("67.195."): continue     # yahoo
            if ip.startswith("207.46."): continue     # msft
            if ip.startswith("65.52."): continue      # msft
            if ip.startswith("65.53."): continue      # msft
            if ip.startswith("65.54."): continue      # msft
            if ip.startswith("65.55."): continue      # msft
            if ip.startswith("157.54."): continue     # msft
            if ip.startswith("157.55."): continue     # msft
            if ip.startswith("157.56."): continue     # msft
            if ip.startswith("157.57."): continue     # msft
            if ip.startswith("157.58."): continue     # msft
            if ip.startswith("157.59."): continue     # msft
            if ip.startswith("157.60."): continue     # msft
            if ip.startswith("208.115.111."): continue  # dotnetdotcom.org
            # spider75.yandex.ru
            if ip.startswith("87.250.252.") or ip.startswith("95.108.158.") or ip.startswith("87.250.254."): continue

            # internal traffic
            if ip.startswith("10.10") or ip.startswith('10.0.'):
                if uri.endswith(".jnlp"):
                    internal_jnlp += 1
                if uri.startswith("/svn/gaggle"):
                    internal_svn += 1
                ip_counter.incr("10.x.x.x")
                continue

            # count visits from each unique IP
            ip_counter.incr(ip)

            # count access to .jnlp's (java webstart launch)
            if uri.endswith(".jnlp"):
                jnlp += 1

            # count accesses to SVN from outside ISB
            if uri.startswith("/svn/gaggle"):
                svn += 1

    # sort IPs with the most hits to the top
    sorted_ips = sorted(ip_counter.counts.keys(), key=lambda k: ip_counter.counts[k], reverse=True)

    if show_ip_cutoff > -1:
        for ip in sorted_ips:
            if ip_counter.counts[ip] > show_ip_cutoff:
                if whois_flag:
                    org = whois(ip)
                    print "%s: %d (%s)" % (ip, ip_counter.counts[ip], org)
                else:
                    print "%s: %d" % (ip, ip_counter.counts[ip])
        print "-" * 90

    print "unique IPs: %d" % (len(ip_counter.counts))
    print "jnlp accesses: %d" % (jnlp)
    print "svn accesses: %d" % (svn)
    print "internal accesses: %d" % (ip_counter.get('10.x.x.x'))
    print "internal jnlp accesses: %d" % (internal_jnlp)
    print "internal svn accesses: %d" % (internal_svn)

# this just wraps the command line whois utility and greps for "orgname:"
# this only works about 1/2 the time
# there's also a pywhois library: http://code.google.com/p/pywhois/
def whois(ip):
    p1 = subprocess.Popen(['whois', ip], stdout=subprocess.PIPE)
    p2 = subprocess.Popen(['grep', '-i', 'orgname:'], stdin=p1.stdout, stdout=subprocess.PIPE)
    p1.stdout.close()
    return p2.communicate()[0].strip()

def main():
    parser = argparse.ArgumentParser(description='Process an apache log file and generate some usage statistics.')
    parser.add_argument('filename', metavar='FILENAME', help='an apache log file')
    parser.add_argument('--whois', action='store_true', default=False, help='run whois on high-usage IP addresses')
    parser.add_argument('--show-ip-cutoff', '-c', metavar='CUTOFF', type=int, default=-1, help='cutoff for showing high-usage IP addresses')
    args = parser.parse_args()

    print "Reading log file(s) at: " + args.filename
    read_log(args.filename, args.whois, args.show_ip_cutoff)

if __name__ == "__main__":
    sys.exit(main())
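
# Example invocation (script and log file names here are illustrative, not from the gist):
#
#     python parse_gaggle_logs.py --whois --show-ip-cutoff 50 /var/log/apache2/access.log
#
# This lists every non-filtered IP with more than 50 hits (plus its whois orgname),
# followed by the summary counts.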