@adjam
Last active February 6, 2023 15:45
Pretty cheap Apache log parser in Python that outputs to SQLite3 or JSON. Splits HTTP method and path, and also logs referer and referer host separately. Not particularly efficient, but it's better than nothing and lets you do some quick analysis, e.g. after running, you can do something like: sqlite3 output-sq3.dat "select referer_host, count(1)…
#!/usr/bin/env python
# pip install --user apachelogs
# ./logparse.py [logfile] [json|db] [output file name]
# JSON output is assumed; if output filename is omitted
# output.[json|db] will be generated
import apachelogs
import os
import pytz
import sqlite3
import sys
from urllib.parse import urlparse
import re

# Parser for Apache's "combined" log format
p = apachelogs.LogParser(apachelogs.COMBINED)
# Splits a request line like "GET /some/path HTTP/1.1" into method, path and HTTP version
rpre = re.compile(r"([A-Z]+)\s+(.*) (?:HTTP/([\d.]+))")


def parse_request(req):
    """Split an Apache %r request line into (method, path, http_version)."""
    if req is None:
        return None, None, None
    m = rpre.search(req)
    if m:
        return m.group(1), m.group(2), m.group(3)
    return None, None, None


class DbWriter(object):
    """Writes parsed hits to an SQLite database, creating the schema up front."""

    def __init__(self, overwrite=False, dbfile="output-sq3.dat", timefmt="%d/%b/%Y:%H:%M:%S"):
        if os.path.exists(dbfile):
            if not overwrite:
                raise Exception("Sorry, not gonna overwrite %s" % dbfile)
            else:
                os.unlink(dbfile)
        self.conn = sqlite3.connect(dbfile)
        self.timefmt = timefmt  # currently unused
        cur = self.conn.cursor()
        cur.execute("""
            CREATE TABLE hit (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                client_addr VARCHAR(128),
                timestamp TIMESTAMP,
                status_code INT NOT NULL,
                method VARCHAR(12),
                path VARCHAR(2048),
                http_version VARCHAR(12),
                user_name VARCHAR(128),
                referer VARCHAR(2048),
                referer_host VARCHAR(2048),
                user_agent VARCHAR(1024),
                log_name VARCHAR(128)
            );""")

    def log_hit(self, hit):
        loctime = hit.request_time
        utc = loctime.astimezone(pytz.utc)
        method, path, http_version = parse_request(hit.directives['%r'])
        if method is None:
            # Unparseable request line; skip it
            return False
        ref = hit.headers_in['Referer']
        # apachelogs reports a missing Referer ("-") as None, so guard before urlparse
        ref_host = '' if not ref or ref == '-' else urlparse(ref).netloc
        cur = self.conn.cursor()
        cur.execute(
            """INSERT INTO hit (client_addr, timestamp, status_code, method, path,
                                http_version, user_name, referer, referer_host,
                                user_agent, log_name)
               VALUES (?,?,?,?,?,?,?,?,?,?,?)""",
            (hit.directives['%h'],
             utc,
             int(hit.directives['%>s']),
             method,
             path,
             http_version,
             hit.directives['%u'],
             ref,
             ref_host,
             hit.headers_in.get('User-Agent', ''),
             hit.directives['%l']))
        return True

    def __del__(self):
        if hasattr(self, 'conn'):
            self.conn.commit()
            self.conn.close()


def parse_file(filename, row_mapper=lambda x: x):
    hits = []
    with open(filename, "r") as f:
        for lno, line in enumerate(f):
            try:
                hits.append(row_mapper(p.parse(line)))
            except apachelogs.Error as e:
                # Report unparseable lines but keep going
                print(str(lno) + " : " + str(e), file=sys.stderr)
    return hits


def get_writer(output="json", filename=None):
    if output == 'json':
        import json
        if filename is None:
            f = sys.stdout
        else:
            f = open(filename, "w")

        def out(hits):
            # LogEntry objects aren't JSON-serializable as-is; dump their
            # directive dicts and let str() cover datetimes and the like.
            json.dump([h.directives for h in hits], f, default=str)
        return out
    else:
        if filename is None:
            writer = DbWriter()
        else:
            writer = DbWriter(dbfile=filename)

        def out(hits):
            for h in hits:
                writer.log_hit(h)
        return out


def main(files, output, outfile):
    writer = get_writer(output=output, filename=outfile)
    for f in files:
        d = parse_file(f)
        writer(d)


if __name__ == '__main__':
    filename = sys.argv[1]
    output = len(sys.argv) > 2 and sys.argv[2] or 'json'
    outfile = len(sys.argv) > 3 and sys.argv[3] or f"output.{output}"
    main([filename], output, outfile)
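
As a follow-up to the quick-analysis example in the description (which is truncated above), here is a minimal, hypothetical sketch of the same kind of per-referer count done from Python with the standard sqlite3 module. It assumes the parser was run in db mode with the default output-sq3.dat file, and only uses the hit table defined in DbWriter; the file name quick_referers.py and the LIMIT are made up for illustration.

#!/usr/bin/env python
# quick_referers.py -- hypothetical companion snippet, not part of the original gist.
# Counts hits per referring host in the SQLite file written by DbWriter above.
import sqlite3
import sys

dbfile = sys.argv[1] if len(sys.argv) > 1 else "output-sq3.dat"
conn = sqlite3.connect(dbfile)
rows = conn.execute(
    """SELECT referer_host, COUNT(1) AS hits
       FROM hit
       GROUP BY referer_host
       ORDER BY hits DESC
       LIMIT 20""")
for host, hits in rows:
    print(f"{hits:8d}  {host or '(no referer)'}")
conn.close()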