Last active
February 6, 2023 15:45
-
-
Save adjam/5768802 to your computer and use it in GitHub Desktop.
Pretty cheap Apache log parser in Python that outputs to SQLite3 or JSON. Splits HTTP method and path, and also logs referer and referer host separately. Not particularly efficient, but it's better than nothing and lets you do some quick analysis, e.g. after running, you can do something like: sqlite3 output-sq3.dat "select referer_host, count(1)…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# pip install --user apachelogs | |
# Usage: ./logparse.py [logfile] [json|db] [output file name]
# JSON output is assumed when no format is given; if the output file name
# is omitted, output.[json|db] will be generated.
import apachelogs | |
import os | |
import pytz | |
import sqlite3 | |
import sys | |
from urllib.parse import urlparse | |
from time import strptime,mktime | |
from datetime import datetime | |
import re | |
# Shared parser for Apache's "combined" log format; parse_file() runs every
# input line through it.
p = apachelogs.LogParser(apachelogs.COMBINED)
# Request-line pattern: an upper-case method, whitespace, the path, a single
# space, then "HTTP/<version>". Groups: 1 = method, 2 = path, 3 = version.
rpre = re.compile(r"([A-Z]+)\s+(.*) (?:HTTP/([\d.]+))")


def parse_request(req):
    """Split an HTTP request line into ``(method, path, http_version)``.

    Returns ``(None, None, None)`` when ``req`` is ``None`` or when no
    request line matching ``rpre`` can be found in it.
    """
    found = rpre.search(req) if req is not None else None
    if found is None:
        return None, None, None
    return found.group(1, 2, 3)
class DbWriter(object):
    """Store parsed access-log hits in a freshly created SQLite ``hit`` table.

    The writer can be used as a context manager; leaving the ``with`` block
    (or calling :meth:`close`) commits pending rows and closes the
    connection. ``__del__`` still does the same as a last resort, but
    finalizers are not guaranteed to run, so explicit closing is preferred.
    """

    def __init__(self, overwrite=False, dbfile="output-sq3.dat", timefmt="%d/%b/%Y:%H:%M:%S"):
        """Create ``dbfile`` and its ``hit`` table.

        Args:
            overwrite: When true, an existing ``dbfile`` is deleted first.
            dbfile: Path of the SQLite database file to create.
            timefmt: Stored on the instance as ``self.timefmt``; this class
                does not use it itself.

        Raises:
            FileExistsError: ``dbfile`` already exists and ``overwrite`` is
                false. (A subclass of ``Exception``, so existing
                ``except Exception`` handlers keep working.)
        """
        if os.path.exists(dbfile):
            if not overwrite:
                raise FileExistsError("Sorry, not gonna overwrite %s" % dbfile)
            os.unlink(dbfile)
        self.conn = sqlite3.connect(dbfile)
        self.timefmt = timefmt
        self.conn.execute("""
            CREATE TABLE hit (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                client_addr VARCHAR(128),
                timestamp TIMESTAMP,
                status_code INT NOT NULL,
                method VARCHAR(12),
                path VARCHAR(2048),
                http_version VARCHAR(12),
                user_name VARCHAR(128),
                referer VARCHAR(2048),
                referer_host VARCHAR(2048),
                user_agent VARCHAR(1024),
                log_name VARCHAR(128)
            );""")

    def __enter__(self):
        """Return the writer itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb):
        """Commit and close on leaving the ``with`` block; never suppresses errors."""
        self.close()
        return False

    def log_hit(self, hit):
        """Insert one parsed log entry into the ``hit`` table.

        ``hit`` must provide ``request_time`` (an aware datetime),
        ``directives`` (read for ``%r``, ``%h``, ``%>s``, ``%u``, ``%l``) and
        ``headers_in`` (read for ``Referer`` and ``User-Agent``).

        Returns:
            ``False`` when the request line cannot be parsed (the entry is
            skipped), ``True`` once the row has been inserted. The row is
            not committed here; see :meth:`close`.
        """
        method, path, http_version = parse_request(hit.directives['%r'])
        if method is None:
            return False

        utc = hit.request_time.astimezone(pytz.utc)

        # NOTE(review): apachelogs appears to report a "-" header as None
        # rather than the literal "-"; both spellings are treated as
        # "no referer" here.
        ref = hit.headers_in.get('Referer')
        if ref is None or ref == '-':
            ref_host = ''
        else:
            try:
                ref_host = urlparse(ref).netloc
            except ValueError:
                # The Referer header is client-controlled; a malformed value
                # must not abort the whole import.
                ref_host = ''

        self.conn.execute(
            """INSERT INTO hit (client_addr,timestamp,status_code,method, path, http_version, user_name, referer, referer_host,user_agent,log_name) VALUES(?,?,?,?,?,?,?,?,?,?,?)""",
            (
                hit.directives['%h'],
                # Same text sqlite3's default datetime adapter would store,
                # written explicitly because that adapter is deprecated
                # since Python 3.12.
                utc.isoformat(" "),
                int(hit.directives['%>s']),
                method,
                path,
                http_version,
                hit.directives['%u'],
                ref,
                ref_host,
                hit.headers_in.get('User-Agent', ''),
                hit.directives['%l'],
            ),
        )
        return True

    def close(self):
        """Commit pending rows and close the connection; safe to call repeatedly."""
        # getattr: __init__ may have raised before self.conn was assigned.
        conn = getattr(self, 'conn', None)
        if conn is None:
            return
        self.conn = None
        conn.commit()
        conn.close()

    def __del__(self):
        """Last-resort commit/close for callers that never call :meth:`close`."""
        self.close()
def parse_file(filename, row_mapper=lambda x: x):
    """Parse every line of an access log and return the mapped entries.

    Args:
        filename: Path of the log file to read.
        row_mapper: Callable applied to each successfully parsed entry; its
            return value is what ends up in the result list. Defaults to
            the identity function.

    Returns:
        A list with one mapped entry per parseable line, in file order.
        Lines the parser rejects are reported on stderr as
        ``"<line number> : <error>"`` (1-based line numbers) and skipped.
    """
    hits = []
    # errors="replace": logs can carry arbitrary client-supplied bytes, and a
    # single undecodable byte should not abort the whole file.
    with open(filename, "r", errors="replace") as f:
        for lno, line in enumerate(f, start=1):
            try:
                entry = p.parse(line)
            except apachelogs.Error as e:
                print(str(lno) + " : " + str(e), file=sys.stderr)
                continue
            hits.append(row_mapper(entry))
    return hits
def _json_default(obj):
    """Serialise values the ``json`` module cannot handle natively.

    Objects exposing a ``directives`` mapping (as parsed log entries do) are
    written as that mapping; objects with an ``isoformat()`` method (dates
    and datetimes) are written as ISO 8601 strings.

    Raises:
        TypeError: for any other unsupported type, as ``json`` expects.
    """
    directives = getattr(obj, "directives", None)
    if directives is not None:
        return dict(directives)
    if hasattr(obj, "isoformat"):
        return obj.isoformat()
    raise TypeError("Object of type %s is not JSON serializable" % type(obj).__name__)


def get_writer(output="json", filename=None):
    """Return a callable ``out(hits)`` that writes a batch of hits.

    Args:
        output: ``'json'`` selects JSON output; any other value selects the
            SQLite writer.
        filename: Destination file. ``None`` means stdout for JSON and
            ``DbWriter``'s default database file for SQLite.

    Returns:
        A function taking a list of hits. For JSON each call dumps one JSON
        document to the destination (successive calls are appended back to
        back). For SQLite each call inserts the hits and commits them.
    """
    if output == 'json':
        import json

        if filename is None:
            stream = sys.stdout

            def out(hits):
                json.dump(hits, stream, default=_json_default)
            return out

        # Create/truncate the file right away, then append per call so that
        # no file handle is left open (and unflushed) between calls.
        open(filename, "w").close()

        def out(hits):
            with open(filename, "a") as f:
                json.dump(hits, f, default=_json_default)
        return out

    writer = DbWriter() if filename is None else DbWriter(dbfile=filename)

    def out(hits):
        for h in hits:
            writer.log_hit(h)
        # Commit each batch explicitly rather than relying on the writer's
        # finalizer, which is not guaranteed to run.
        writer.conn.commit()
    return out
def main(files, output, outfile):
    """Parse each log file in ``files`` and pass its hits to the writer.

    The writer is obtained once from ``get_writer(output=output,
    filename=outfile)`` and then called with the parsed hits of every file
    in turn.
    """
    emit = get_writer(output=output, filename=outfile)
    for log_path in files:
        emit(parse_file(log_path))
if __name__ == '__main__':
    # Usage: ./logparse.py logfile [json|db] [output file name]
    cli_args = sys.argv[1:]
    if not cli_args:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit("usage: %s logfile [json|db] [output file name]" % sys.argv[0])
    filename = cli_args[0]
    # A missing or empty argument falls back to its default: JSON output,
    # written to "output.<format>".
    output = (cli_args[1] if len(cli_args) > 1 else '') or 'json'
    outfile = (cli_args[2] if len(cli_args) > 2 else '') or f"output.{output}"
    main([filename], output, outfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment