Skip to content

Instantly share code, notes, and snippets.

@lovasoa
Created May 18, 2018 14:29
Show Gist options
  • Save lovasoa/3fdb0a97a5a1d5b7fc4211aeae2a0c85 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import sys
import logging
def is_valid_line(line: bytes):
    """Return a truthy value for log lines worth parsing.

    A line is skipped when it is empty or contains one of the
    known noise markers (SQLProxy chatter, P2_COD records).
    """
    noise_markers = (b"SQLProxy", b"P2_COD")
    return line and all(marker not in line for marker in noise_markers)
record_types = [('id', np.uint64), ('time', np.uint64), ('is_reply', np.bool), ('type', np.uint8)]
def parse_line(line: bytes):
    """Takes a line and returns a tuple (id, time, is_reply, type)"""
    fields = line.split(b', ')
    assert len(fields) > 5, "Not enough information in log line"
    # Field 0 is a fixed-width timestamp, presumably HH:MM:SS.ffffff;
    # fold it into a single microsecond count since midnight.
    ts = fields[0]
    hours = int(ts[0:2])
    minutes = int(ts[3:5])
    seconds = int(ts[6:8])
    micros = int(ts[9:15])
    timestamp = ((hours * 60 + minutes) * 60 + seconds) * 1000000 + micros
    # Field 1 carries the message id: replies have a 'reply' prefix,
    # and the id itself is the trailing 16 hex digits.
    ident = fields[1]
    is_reply = ident.startswith(b'reply')
    ident_int = int(ident[-16:], 16)
    # Field 5 holds the message type after a 5-byte prefix
    # (presumably something like b'type=NN' — not visible here).
    type_int = int(fields[5][5:])
    return np.array((ident_int, timestamp, is_reply, type_int), dtype=record_types)
def read_logfile(filename):
    """Parse a log file into a pandas DataFrame indexed by message id.

    Only lines accepted by is_valid_line are parsed; each becomes one
    record with columns (time, is_reply, type).

    :param filename: path to the log file, opened in binary mode.
    :return: DataFrame indexed by 'id'.
    """
    # `with` guarantees the file handle is closed even if a parse fails;
    # the original left it dangling until garbage collection.
    with open(filename, 'rb') as f:
        records = np.array([parse_line(l) for l in f if is_valid_line(l)])
    df = pd.DataFrame(records, columns=("id", "time", "is_reply", "type"))
    df.set_index('id', inplace=True)
    return df
def join_req_rep(df):
    """Inner-join each request with the reply sharing its id (the index).

    Columns coming from the reply side get a '_reply' suffix; requests
    without a reply (and vice versa) are dropped by the inner join.
    """
    requests = df[~df.is_reply]
    replies = df[df.is_reply]
    return requests.join(replies, rsuffix='_reply', how='inner')
def format_second(t):
    """Take a Series of integer second counts and return a Series of
    'HH:MM:SS' strings.

    Hours wrap at 24, so counts beyond one day fold back onto a
    24-hour clock.

    :param t: Series of non-negative integer seconds.
    :return: Series of zero-padded 'HH:MM:SS' strings.
    """
    s = t % 60
    m = (t // 60) % 60
    h = (t // 3600) % 24
    fmt = "{:02d}".format  # zero-pad every component to two digits
    return h.map(fmt) + ':' + m.map(fmt) + ':' + s.map(fmt)
def write_csv(stats_results, outfile):
    """Write the stats to a csv file.

    :param stats_results: DataFrame produced by compute_stats; must have
        a 'second' column plus the statistic columns listed below.
    :param outfile: path (or buffer) handed to DataFrame.to_csv.
    """
    # assign() works on a copy, so the caller's DataFrame is no longer
    # mutated as a side effect (the original added 'time_str' in place).
    out = stats_results.assign(time_str=format_second(stats_results.second))
    cols = ['time_str', 'type_reply', 'count', 'mean', 'median', 'percentile_90', 'percentile_99', 'amax']
    out[cols].to_csv(outfile, index=False, float_format='%g')
def compute_stats(joined):
    """Compute per-(second, type) latency statistics.

    :param joined: DataFrame with columns 'time', 'time_reply' and
        'type_reply' (output of join_req_rep); times are in microseconds.
    :return: DataFrame with columns second, type_reply, count, mean,
        median, percentile_90, percentile_99, amax.
    """
    grouped = pd.DataFrame({
        "delay": joined.time_reply - joined.time,   # request->reply latency
        "second": joined.time // 1000000,           # request second bucket
        "type_reply": joined.type_reply
    }).groupby(["second", "type_reply"])
    # Named aggregation pins the output column names explicitly.
    # The original passed np.mean/np.median/np.max to agg(), which named
    # the last column after np.max.__name__ ('amax' on old NumPy, 'max'
    # on modern NumPy) — fragile, and write_csv depends on 'amax'.
    stats = grouped.delay.agg(
        count="count",
        mean="mean",
        median="median",
        percentile_90=lambda g: g.quantile(0.90),
        percentile_99=lambda g: g.quantile(0.99),
        amax="max",
    )
    stats.reset_index(inplace=True)
    return stats
if __name__ == "__main__":
logging.getLogger().setLevel(logging.INFO)
assert len(sys.argv) >= 3, "Not enough arguments"
infile, outfile = sys.argv[1], sys.argv[2]
logging.info("Reading input file...")
df = read_logfile(infile)
logging.info("Joining requests with the associated results...")
joined = join_req_rep(df)
logging.info("Computing statistics...")
stats_results = compute_stats(joined)
logging.info("Writing output file...")
write_csv(stats_results, outfile)
logging.info("Done.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment