Skip to content

Instantly share code, notes, and snippets.

@DMRobertson
Created December 17, 2021 18:30
Show Gist options
  • Save DMRobertson/133e303f659cace07d7126e82b8b9b62 to your computer and use it in GitHub Desktop.
Save DMRobertson/133e303f659cace07d7126e82b8b9b62 to your computer and use it in GitHub Desktop.
dirty script to ingest synapse request logs into sqlite
#! /usr/bin/env python3
import argparse
import re
import sqlite3
import sys
# Sorry everyone.
# Use nongreedy matches to make this well fast.
from datetime import datetime
multiple_files_pattern = re.compile(r"""
^
([^:]*?):(.*?)\ -\
(.*?)\ -\
(.*?)\ -\
(.*?)\ -\
(.*?)\ -\
(.*?)\ -\
(.*?)\ -\
{(.*?)}\ .*?:\
(.*?)sec/(.*?)sec\
\((.*?)sec,\ (.*?)sec\)\
\((.*?)sec/(.*?)sec/(.*?)\)\
(.*?)B\ (.*?)\
\"(.*?)\ (.*?)\ .*?\"\
\"(.*?)\"\
\[(.*?)\ dbevts\]
$
""", re.VERBOSE)
single_file_pattern = re.compile(r"""
^
(.*?)\ -\
(.*?)\ -\
(.*?)\ -\
(.*?)\ -\
(.*?)\ ?-\
(.*?)\ -\
(.*?)\ -\
{(.*?)}\ .*?:\
(.*?)sec/(.*?)sec\
\((.*?)sec,\ (.*?)sec\)\
\((.*?)sec/(.*?)sec/(.*?)\)\
(.*?)B\ (.*?)\
\"(.*?)\ (.*?)\ .*?\"\
\"(.*?)\"\
\[(.*?)\ dbevts\]
""", re.VERBOSE)
def log_entries(args):
pattern = multiple_files_pattern if args.multiple_files else single_file_pattern
with open(args.input, "rt") as f:
for i, line in enumerate(f, start=1):
if i % 1000 == 0:
print(i)
match = pattern.match(line)
if match is None:
print(f"Couldn't parse {repr(line)}", file=sys.stderr)
continue
parameters = []
# If the input isn't from grepping multiple files, add the filename as source
if not args.multiple_files:
parameters.append(args.input)
parameters.extend(match.groups())
parameters[1] = datetime.strptime(parameters[1], "%Y-%m-%d %H:%M:%S,%f").timestamp()
yield parameters
parser = argparse.ArgumentParser()
parser.add_argument("input", help="Read this log file")
parser.add_argument("--output", default="logs.db", help="Write to this sqlite db")
parser.add_argument("--table_name", help="Write entries to this table. If omitted, uses "
"the filename up until the first fullstop")
parser.add_argument("--multiple-files", action="store_true",
help="Use this if your output comes from grepping multiple files,"
"i.e. if lines are prefixed with filename:'")
if __name__ == "__main__":
args = parser.parse_args()
if args.table_name is None:
args.table_name = args.input.split(".")[0]
con = sqlite3.connect(args.output)
cur = con.cursor()
# BEWARE: SQL injection in args.table_name here
# But not sure there's a better way? C.f. https://stackoverflow.com/a/3247553/5252017
cur.execute(rf"""
CREATE TABLE {args.table_name}
(
filename text,
timestamp double, -- seconds since epoch
logger text,
lineno int,
level text,
request_id text,
ip_addr text,
port int,
requester text,
processing_time double,
response_send_time double,
ru_utime double,
ru_stime double,
db_sched_duration_sec double,
db_txn_duration_sec double,
db_txn_count int,
bytes_sent int,
status int,
method text,
url text,
user_agent text,
db_events int
);
""")
sql = f"INSERT INTO {args.table_name} VALUES ({','.join('?' * 22)});"
cur.executemany(sql, log_entries(args))
con.commit()
print(f"Wrote {cur.rowcount} rows to table {args.table_name} in {args.output}")
@DMRobertson
Copy link
Author

Brief instructions:

grep "Process" my_log_file.log > my_filtered_logs.log
parse.py --help
parse.py my_filtered_logs.log
sqlite3 logs.db
.describe my_filtered_logs

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment