Skip to content

Instantly share code, notes, and snippets.

@emaxerrno
Created March 13, 2019 18:36
Show Gist options
  • Save emaxerrno/a52e5fdfa5b5f48277ba0dc3cf44e5d0 to your computer and use it in GitHub Desktop.
Save emaxerrno/a52e5fdfa5b5f48277ba0dc3cf44e5d0 to your computer and use it in GitHub Desktop.
compute histogram based on text file line length w/ csv optional output
#!/usr/bin/env python3
import sys
import os
import argparse
import operator
import distutils.util
def _parse_bool(value):
    """Convert a command-line truth string to a bool.

    Accepts the same spellings (case-insensitively) as
    ``distutils.util.strtobool``: y/yes/t/true/on/1 and n/no/f/false/off/0.
    Implemented locally because ``distutils`` was removed from the standard
    library in Python 3.12.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised spelling,
            so argparse reports it as a normal usage error.
    """
    lowered = value.lower()
    if lowered in ('y', 'yes', 't', 'true', 'on', '1'):
        return True
    if lowered in ('n', 'no', 'f', 'false', 'off', '0'):
        return False
    raise argparse.ArgumentTypeError('invalid truth value %r' % (value,))


def generate_options():
    """Build the command-line parser for the histogram tool.

    Options:
        --file     path of the newline-delimited input file.
        --csv      truth value (default true) selecting CSV output.
        --buckets  integer number of buckets to print (default 10).

    Returns:
        argparse.ArgumentParser: the configured, not yet parsed, parser.
    """
    parser = argparse.ArgumentParser(description='histogram per file lenth')
    parser.add_argument(
        '--file',
        type=str,
        help='file that is new line delimited. i.e: --file=foo.json')
    parser.add_argument(
        '--csv',
        type=_parse_bool,
        default=True,
        help='print to stdout as csv')
    parser.add_argument(
        '--buckets',
        type=int,
        default=10,
        help='number of buckets to print. i.e: --buckets=10')
    return parser
def read_file(filename):
    """Return every line of ``filename`` as a list of strings.

    The file is opened in text mode; each returned line keeps whatever line
    terminator it had in the file.
    """
    with open(filename, 'r') as handle:
        # Iterating a text file yields the same lines as readlines().
        return list(handle)
def histogram_for_dict(od, buckets):
    """Group a ``{size: count}`` mapping into fixed-width buckets.

    The bucket width is ``(max_key - min_key) // buckets`` (at least 1).
    Buckets start at the smallest key and continue until the largest key is
    covered, so the result may hold more than ``buckets`` entries when the
    span does not divide evenly.

    Args:
        od: dict mapping integer sizes to their occurrence counts.
        buckets: positive integer used to derive the bucket width.

    Returns:
        dict mapping each bucket's starting size to the summed count of all
        keys in ``[start, start + width)``, in ascending order of start.
        An empty ``od`` yields an empty dict.

    Raises:
        ValueError: if ``buckets`` is smaller than 1.
    """
    if buckets < 1:
        raise ValueError(
            'buckets must be a positive integer, got %r' % (buckets,))
    if not od:
        return {}
    first_k = min(od)
    last_k = max(od)
    # A span narrower than the bucket count would floor to a zero width,
    # which range() rejects; fall back to one bucket per integer size.
    step = max(1, (last_k - first_k) // buckets)
    # The stop is last_k + 1 so the largest key always lands in a bucket,
    # including when the span is an exact multiple of the width.
    m = {start: 0 for start in range(first_k, last_k + 1, step)}
    for k, count in od.items():
        # Snap each key down to the start of the bucket that contains it.
        m[first_k + ((k - first_k) // step) * step] += count
    return m
def compute_frequencies(lines):
    """Count how many lines share each length.

    Args:
        lines: iterable of strings, each optionally ending with a line
            terminator (as produced by ``readlines()``).

    Returns:
        dict mapping a line length, measured without its trailing line
        terminator, to the number of lines of that length. Lines that are
        empty once the terminator is removed are skipped.
    """
    m = {}
    for line in lines:
        # readlines() keeps the trailing newline; measuring it would inflate
        # every size by one and make blank lines count as one-character lines.
        size = len(line.rstrip('\r\n'))
        if size == 0:
            continue
        m[size] = m.get(size, 0) + 1
    return m
def print_histogram(csv_print, hist):
    """Write the histogram to stdout, one row per bucket in ascending order.

    Args:
        csv_print: when truthy, emit a CSV header followed by
            ``size,count,cumulative,percentile`` rows; otherwise emit
            tab-separated rows prefixed with a 1-based row number.
        hist: dict mapping bucket start to the count in that bucket.

    The percentile column is the cumulative count as a percentage of the
    total, rounded to 5 decimal places. An empty histogram prints only the
    header (CSV mode) or nothing at all.
    """
    total = sum(hist.values())
    if csv_print:
        sys.stdout.write(
            "payload_size,number_of_requests,cummulative_requests,percentile\n"
        )
    cumulative = 0
    for row, k in enumerate(sorted(hist), start=1):
        v = hist[k]
        cumulative += v
        # Guard the division so a histogram whose counts are all zero
        # reports 0.0 instead of raising ZeroDivisionError.
        percentile = round(100 * (cumulative / total), 5) if total else 0.0
        if csv_print:
            sys.stdout.write(
                "%s,%s,%s,%s\n" % (k, v, cumulative, percentile))
        else:
            sys.stdout.write("%s:\t%s\t%s\t%s\t%s\n" % (
                row, k, v, cumulative, percentile))
def main():
    """Parse the command line, build the line-length histogram, print it.

    Exits with an argparse usage error when ``--file`` is missing or
    ``--buckets`` is not a positive integer. Unrecognised arguments are
    ignored.
    """
    parser = generate_options()
    options, _unknown = parser.parse_known_args()
    if options.file is None:
        # Without this, open(None) fails later with an opaque TypeError.
        parser.error('--file is required')
    if options.buckets < 1:
        parser.error('--buckets must be a positive integer')
    frequencies = compute_frequencies(read_file(options.file))
    # The requested count is reduced by one before bucketing; clamp to 1 so
    # --buckets=1 does not turn into a zero bucket count, which is not a
    # valid divisor for the bucket width.
    bucket_count = max(1, options.buckets - 1)
    print_histogram(
        options.csv, histogram_for_dict(frequencies, bucket_count))
# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment