Created
August 30, 2019 13:50
-
-
Save sbliven/237ccf49c42176a5febfb28dccecb271 to your computer and use it in GitHub Desktop.
Create histograms of line lengths. Used for https://github.com/biopython/biopython/issues/2008
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Create distribution of line lengths over files | |
Example: | |
find . -regextype egrep -regex './(Bio|Tests)/.*\.py' -type f -exec \ | |
python linelength.py --hist linelengths.png \ | |
--cdf linelengthscumulative.png -v '{}' '+' | |
""" | |
import sys | |
import argparse | |
import logging | |
import seaborn as sns | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
def line_len(file):
    """Yield the length of each line in *file*, excluding its newline.

    Args:
        file: Any iterable of strings, typically an open text file.

    Yields:
        int: The number of characters on the line, not counting a single
        trailing ``"\\n"`` when one is present.
    """
    for line in file:
        # The last line of a file frequently has no trailing newline, so only
        # discount the terminator when it is actually there.
        yield len(line) - 1 if line.endswith("\n") else len(line)
def _is_std_stream(handle):
    """Return True when *handle* is a standard stream or its binary buffer."""
    return any(
        handle is stream or handle is getattr(stream, "buffer", None)
        for stream in (sys.stdin, sys.stdout, sys.stderr)
    )


def _save_distplot(out, data, bins, **distplot_kwargs):
    """Draw a distplot of *data* on a cleared figure and save it to *out*."""
    plt.clf()
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; it
    # is kept here so the generated figures stay the same.
    sns.distplot(data, bins=bins, **distplot_kwargs)
    plt.xlim(0, 200)
    plt.savefig(out)


def main(args=None):
    """Plot line-length distributions of the input files and print bucket counts.

    Reads every input file (or standard input when none is given), collects
    the length of each line, optionally writes a histogram (``--hist``) and a
    cumulative histogram (``--cdf``), and prints how many lines fall into the
    length buckets ``(-1, 0]``, ``(0, 80]``, ``(80, 88]``, ``(88, 120]`` and
    everything longer.

    Args:
        args: Optional list of command-line argument strings. When ``None``
            argparse falls back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("--hist", type=argparse.FileType('wb'),
                        help="PNG output file Histogram")
    parser.add_argument("--cdf", type=argparse.FileType('wb'),
                        help="PNG output file cumulative")
    parser.add_argument("files", type=argparse.FileType('r'),
                        # With nargs="*" the default must itself be a list,
                        # otherwise len(args.files) fails on the bare stream.
                        default=[sys.stdin],
                        nargs="*",
                        help="Input files")
    parser.add_argument("-v", "--verbose", help="Long messages",
                        dest="verbose", default=False, action="store_true")
    args = parser.parse_args(args)

    logging.basicConfig(
        format='%(levelname)s: %(message)s',
        level=logging.DEBUG if args.verbose else logging.WARN,
    )

    # Everything argparse opened for us; closed in the ``finally`` below.
    handles = [
        *args.files,
        *(out for out in (args.hist, args.cdf) if out is not None),
    ]
    try:
        data = pd.DataFrame({
            "linelen": [
                length for file in args.files for length in line_len(file)
            ],
        })
        logging.info("Read data from %d files", len(args.files))
        logging.debug("Read %d lines", len(data.index))

        if data.empty:
            # Nothing to plot or bucket; max() on an empty column would raise.
            logging.warning("No lines read; nothing to plot")
            print(f"total: {len(data.index)}")
            return

        longest = int(data.linelen.max())

        # One bin per length below 200 plus a single overflow bin that is
        # guaranteed to contain the longest line.
        bins = list(range(200)) + [max(100000, longest + 1)]

        if args.hist is not None:
            _save_distplot(args.hist, data, bins)
            logging.info("Save histogram to %s", args.hist.name)

        if args.cdf is not None:
            _save_distplot(
                args.cdf,
                data,
                bins,
                hist_kws={"cumulative": True},
                kde_kws={"cumulative": True},
                kde=False,
                norm_hist=True,
            )
            logging.info("Save cumulative to %s", args.cdf.name)

        # pd.cut needs strictly increasing edges, so the last edge must stay
        # above 120 even when every line is short.
        cuts = [-1, 0, 80, 88, 120, max(longest, 121)]
        buckets = pd.cut(data.linelen, cuts, right=True)
        counts = data.groupby(buckets, observed=False).count()
        print(f"counts:\n{counts}")
        print(f"total: {len(data.index)}")
    finally:
        for handle in handles:
            # Never close stdin/stdout: argparse hands those out for "-".
            if not _is_std_stream(handle):
                handle.close()
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment