Created
May 18, 2018 14:29
-
-
Save lovasoa/3fdb0a97a5a1d5b7fc4211aeae2a0c85 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import pandas as pd | |
import numpy as np | |
import sys | |
import logging | |
def is_valid_line(line: bytes):
    """Return True when a raw log line should be parsed.

    Empty lines and lines mentioning "SQLProxy" or "P2_COD" are skipped.
    """
    if not line:
        return False
    return not (b"SQLProxy" in line or b"P2_COD" in line)
# Structured dtype for one parsed log record.
# np.bool_ replaces the np.bool alias, which was removed in NumPy 1.24.
record_types = [('id', np.uint64), ('time', np.uint64),
                ('is_reply', np.bool_), ('type', np.uint8)]


def parse_line(line: bytes):
    """Parse one log line into a structured numpy record.

    Expected comma-separated layout (fields beyond the sixth are ignored):
        HH:MM:SS.ffffff, [reply ]<hex id>, ..., ..., ..., type:<n>

    Returns a 0-d structured array with fields (id, time, is_reply, type),
    where time is in microseconds since midnight.

    Raises ValueError if the line has fewer than six comma-separated fields.
    """
    parts = line.split(b', ')
    # Explicit check instead of assert: asserts are stripped under -O.
    if len(parts) <= 5:
        raise ValueError("Not enough information in log line")
    timestamp = parts[0]
    # Fixed-position slices of b"HH:MM:SS.ffffff", all in microseconds.
    hours = int(timestamp[0:2]) * 60 * 60 * 1000000
    minutes = int(timestamp[3:5]) * 60 * 1000000
    seconds = int(timestamp[6:8]) * 1000000
    micros = int(timestamp[9:15])
    tot_time = hours + minutes + seconds + micros
    id_str = parts[1]
    # Replies are tagged with a "reply" prefix; the id itself is the
    # last 16 hex characters of the field.
    is_reply = id_str.startswith(b'reply')
    id_int = int(id_str[-16:], 16)
    # Sixth field looks like b"type:<n>"; skip the 5-char "type:" prefix.
    type_int = int(parts[5][5:])
    return np.array((id_int, tot_time, is_reply, type_int), dtype=record_types)
def read_logfile(filename):
    """Parse a log file into a pandas DataFrame indexed by request id.

    Lines rejected by is_valid_line are skipped; each remaining line is
    parsed with parse_line into (id, time, is_reply, type).
    """
    # `with` closes the handle deterministically; the original leaked it.
    with open(filename, 'rb') as f:
        records = np.array([parse_line(l) for l in f if is_valid_line(l)])
    df = pd.DataFrame(records, columns=("id", "time", "is_reply", "type"))
    df.set_index('id', inplace=True)
    return df
def join_req_rep(df):
    """Pair each request row with its response row (inner join on the id index).

    Columns coming from the response side get a "_reply" suffix.
    """
    requests = df[~df.is_reply]
    replies = df[df.is_reply]
    return requests.join(replies, rsuffix='_reply', how='inner')
def format_second(t):
    """Format a Series of second counts as "HH:MM:SS" time-of-day strings.

    Only the time-of-day part is kept: hours wrap at 24.
    (The original docstring was truncated mid-sentence.)
    """
    secs = t % 60
    mins = (t // 60) % 60
    hours = (t // 60 // 60) % 24
    two_digits = "{:02d}".format
    return (hours.map(two_digits)
            .add(':').add(mins.map(two_digits))
            .add(':').add(secs.map(two_digits)))
def write_csv(stats_results, outfile):
    """Write the statistics to a CSV file.

    Works on a copy via .assign, so the caller's DataFrame is not mutated
    (the original added a "time_str" column to it in place).
    """
    out = stats_results.assign(time_str=format_second(stats_results.second))
    cols = ['time_str', 'type_reply', 'count', 'mean', 'median',
            'percentile_90', 'percentile_99', 'amax']
    out[cols].to_csv(outfile, index=False, float_format='%g')
def compute_stats(joined):
    """Computes the required statistics"""
    # Per-request latency (delay, in microseconds), bucketed by the second
    # the request was made (time is in microseconds) and by response type.
    grouped = pd.DataFrame({
        "delay": joined.time_reply - joined.time,
        "second": joined.time // 1000000,
        "type_reply": joined.type_reply
    }).groupby(["second", "type_reply"])
    # Named functions so the aggregated columns come out as
    # "percentile_90" / "percentile_99".
    def percentile_90(g): return g.quantile(0.90)
    def percentile_99(g): return g.quantile(0.99)
    # NOTE(review): write_csv expects the aggregate columns to be named
    # 'count', 'mean', 'median', 'percentile_90', 'percentile_99', 'amax'.
    # np.max historically aggregated under the name "amax"; newer
    # pandas/numpy versions may produce "max" instead -- verify against
    # the pinned library versions.
    aggs = [pd.Series.count, np.mean, np.median, percentile_90, percentile_99, np.max]
    stats = grouped.delay.agg(aggs)
    # Move the (second, type_reply) group keys back into ordinary columns.
    stats.reset_index(inplace=True)
    return stats
if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    # Explicit argument check instead of assert: asserts vanish under -O,
    # and a usage message is friendlier than an AssertionError traceback.
    if len(sys.argv) < 3:
        sys.exit("Usage: {} <infile> <outfile>".format(sys.argv[0]))
    infile, outfile = sys.argv[1], sys.argv[2]
    logging.info("Reading input file...")
    df = read_logfile(infile)
    logging.info("Joining requests with the associated results...")
    joined = join_req_rep(df)
    logging.info("Computing statistics...")
    stats_results = compute_stats(joined)
    logging.info("Writing output file...")
    write_csv(stats_results, outfile)
    logging.info("Done.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment