Skip to content

Instantly share code, notes, and snippets.

@lovasoa
Created May 18, 2018 14:29
Show Gist options
  • Save lovasoa/3fdb0a97a5a1d5b7fc4211aeae2a0c85 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import sys
import logging
def is_valid_line(line: bytes):
    """Return a truthy value for log lines worth parsing.

    A line is skipped when it is empty or contains one of the
    known noise markers (SQLProxy chatter, P2_COD records).
    """
    noise_markers = (b"SQLProxy", b"P2_COD")
    return line and all(marker not in line for marker in noise_markers)
record_types = [('id', np.uint64), ('time', np.uint64), ('is_reply', np.bool), ('type', np.uint8)]
def parse_line(line: bytes):
    """Takes a line and returns a tuple (id, time, is_reply, type)"""
    fields = line.split(b', ')
    assert len(fields) > 5, "Not enough information in log line"
    # Field 0 is a fixed-width timestamp, presumably HH:MM:SS.ffffff;
    # fold it into a single microsecond count since midnight.
    ts = fields[0]
    hours = int(ts[0:2])
    minutes = int(ts[3:5])
    seconds = int(ts[6:8])
    micros = int(ts[9:15])
    timestamp = ((hours * 60 + minutes) * 60 + seconds) * 1000000 + micros
    # Field 1 carries the message id: replies have a 'reply' prefix,
    # and the id itself is the trailing 16 hex digits.
    ident = fields[1]
    is_reply = ident.startswith(b'reply')
    ident_int = int(ident[-16:], 16)
    # Field 5 holds the message type after a 5-byte prefix
    # (presumably something like b'type=NN' — not visible here).
    type_int = int(fields[5][5:])
    return np.array((ident_int, timestamp, is_reply, type_int), dtype=record_types)
def read_logfile(filename):
    """Parse a log file into a pandas DataFrame indexed by message id.

    Only lines accepted by is_valid_line are parsed; each becomes one
    record with columns (time, is_reply, type).

    :param filename: path to the log file, opened in binary mode.
    :return: DataFrame indexed by 'id'.
    """
    # `with` guarantees the file handle is closed even if a parse fails;
    # the original left it dangling until garbage collection.
    with open(filename, 'rb') as f:
        records = np.array([parse_line(l) for l in f if is_valid_line(l)])
    df = pd.DataFrame(records, columns=("id", "time", "is_reply", "type"))
    df.set_index('id', inplace=True)
    return df
def join_req_rep(df):
    """Inner-join each request with the reply sharing its id (the index).

    Columns coming from the reply side get a '_reply' suffix; requests
    without a reply (and vice versa) are dropped by the inner join.
    """
    requests = df[~df.is_reply]
    replies = df[df.is_reply]
    return requests.join(replies, rsuffix='_reply', how='inner')
def format_second(t):
    """Take a Series of integer second counts and return a Series of
    'HH:MM:SS' strings.

    Hours wrap at 24, so counts beyond one day fold back onto a
    24-hour clock.

    :param t: Series of non-negative integer seconds.
    :return: Series of zero-padded 'HH:MM:SS' strings.
    """
    s = t % 60
    m = (t // 60) % 60
    h = (t // 3600) % 24
    fmt = "{:02d}".format  # zero-pad every component to two digits
    return h.map(fmt) + ':' + m.map(fmt) + ':' + s.map(fmt)
def write_csv(stats_results, outfile):
    """Write the stats to a csv file.

    :param stats_results: DataFrame produced by compute_stats; must have
        a 'second' column plus the statistic columns listed below.
    :param outfile: path (or buffer) handed to DataFrame.to_csv.
    """
    # assign() works on a copy, so the caller's DataFrame is no longer
    # mutated as a side effect (the original added 'time_str' in place).
    out = stats_results.assign(time_str=format_second(stats_results.second))
    cols = ['time_str', 'type_reply', 'count', 'mean', 'median', 'percentile_90', 'percentile_99', 'amax']
    out[cols].to_csv(outfile, index=False, float_format='%g')
def compute_stats(joined):
    """Compute per-(second, type) latency statistics.

    :param joined: DataFrame with columns 'time', 'time_reply' and
        'type_reply' (output of join_req_rep); times are in microseconds.
    :return: DataFrame with columns second, type_reply, count, mean,
        median, percentile_90, percentile_99, amax.
    """
    grouped = pd.DataFrame({
        "delay": joined.time_reply - joined.time,   # request->reply latency
        "second": joined.time // 1000000,           # request second bucket
        "type_reply": joined.type_reply
    }).groupby(["second", "type_reply"])
    # Named aggregation pins the output column names explicitly.
    # The original passed np.mean/np.median/np.max to agg(), which named
    # the last column after np.max.__name__ ('amax' on old NumPy, 'max'
    # on modern NumPy) — fragile, and write_csv depends on 'amax'.
    stats = grouped.delay.agg(
        count="count",
        mean="mean",
        median="median",
        percentile_90=lambda g: g.quantile(0.90),
        percentile_99=lambda g: g.quantile(0.99),
        amax="max",
    )
    stats.reset_index(inplace=True)
    return stats
if __name__ == "__main__":
logging.getLogger().setLevel(logging.INFO)
assert len(sys.argv) >= 3, "Not enough arguments"
infile, outfile = sys.argv[1], sys.argv[2]
logging.info("Reading input file...")
df = read_logfile(infile)
logging.info("Joining requests with the associated results...")
joined = join_req_rep(df)
logging.info("Computing statistics...")
stats_results = compute_stats(joined)
logging.info("Writing output file...")
write_csv(stats_results, outfile)
logging.info("Done.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment