Last active
August 27, 2018 19:29
-
-
Save muppetjones/a2e186f11eb6e00e4b27c974080179e1 to your computer and use it in GitHub Desktop.
Print the minimum and maximum width of the values in each column of a delimited file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Print size of columns. | |
Usage:
    $ python3 colsize.py ./ref_uniprot.csv
    id: (1, 6)
    ensembl_transcript_id: (0, 15)
    feature_type: (4, 35)
    feature_desc: (0, 747)
    uniprot_id: (6, 10)
    start_pos: (1, 5)
    end_pos: (1, 5)

@Author: Stephen J. Bush
""" | |
import argparse | |
import re | |
class Column(object):
    """Running width statistics for the values of a single column.

    Every value passed to the constructor (``field``) or to :meth:`add` is
    stored, and the shortest/longest widths seen so far are tracked together
    with the first value that reached each of them.

    Args:
        name: Column name. When falsy, :attr:`name` falls back to
            ``unnamed_NNN`` where ``NNN`` is the instance's creation number.
        field: Optional first value of the column. ``None`` means "no value
            yet"; an empty string is a real (zero-width) value.
    """

    # Number of instances created so far; used to number unnamed columns.
    __CNT = 0

    def __new__(cls, *args, **kwargs):
        cls.__CNT += 1
        return super().__new__(cls)

    def __init__(self, name=None, field=None):
        self._maxwidth = 0
        self._minwidth = None  # None until the first value is recorded
        self._fields = []
        self._longest = None
        self._shortest = None
        self._name = name
        self._i = self.__CNT
        # Compare against None so that an empty first field is still counted.
        if field is not None:
            self.add(field)

    @property
    def name(self):
        """Column name, or ``unnamed_NNN`` when no name was given."""
        return self._name or 'unnamed_{:03d}'.format(self._i)

    @property
    def maxwidth(self):
        """Width of the longest value seen (0 when the column is empty)."""
        return self._maxwidth

    @property
    def minwidth(self):
        """Width of the shortest value seen (0 when the column is empty)."""
        return 0 if self._minwidth is None else self._minwidth

    @property
    def longest(self):
        """First value that reached the maximum width (None when empty)."""
        return self._longest

    @property
    def shortest(self):
        """First value that reached the minimum width (None when empty)."""
        return self._shortest

    @property
    def n_fields(self):
        """Number of values recorded for this column."""
        return len(self._fields)

    def add(self, value):
        """Record ``value`` and update the width statistics."""
        width = len(value)
        is_first = not self._fields
        # The two checks are independent: a single value can be both the
        # longest and the shortest seen so far (always true for the first
        # one). Strict comparisons keep the *first* value on ties.
        if is_first or width > self._maxwidth:
            self._maxwidth = width
            self._longest = value
        if is_first or width < self._minwidth:
            self._minwidth = width
            self._shortest = value
        self._fields.append(value)

    @classmethod
    def split_row(cls, row, *, rx):
        """Split one line of text into fields.

        Args:
            row: Raw line, including any trailing newline.
            rx: Compiled pattern (anything with a ``split`` method) that
                matches the field delimiter.

        Returns:
            List of fields with leading/trailing double quotes removed.
        """
        # NOTE: ``row.strip()`` removes all surrounding whitespace, tabs
        # included, so leading/trailing empty fields of a tab-delimited row
        # are dropped before splitting.
        return [
            x.strip('"')
            for x in rx.split(row.strip())
        ]
def aggregate_columns(path, delim=r'[,\t]', has_header=True):
    """Read a delimited text file and gather one ``Column`` per column.

    Args:
        path: Path of the file to read.
        delim: Regular-expression fragment matching the field delimiter.
        has_header: When true, the first line supplies the column names and
            contributes no values. Otherwise columns are named by their
            zero-based position and the first line is counted as data.

    Returns:
        List of ``Column`` objects in file order. Rows with more fields than
        known columns add extra, unnamed columns at the end.
    """
    # A delimiter only counts when an even number of double quotes follows
    # it up to the end of the line, i.e. when it is not inside a quoted field.
    splitter = re.compile(
        delim + r'(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)'
    )

    with open(path, 'r') as handle:
        head = Column.split_row(handle.readline(), rx=splitter)
        if has_header:
            columns = [Column(name=title) for title in head]
        else:
            columns = [
                Column(name=str(pos), field=cell)
                for pos, cell in enumerate(head)
            ]

        for record in handle:
            cells = Column.split_row(record, rx=splitter)
            for pos, cell in enumerate(cells):
                if pos < len(columns):
                    columns[pos].add(cell)
                else:
                    # Row is wider than anything seen so far: keep the
                    # surplus value in a new unnamed column.
                    columns.append(Column(field=cell))

    return columns
def unique_stats(columns):
    """Compute uniqueness statistics for each column.

    Args:
        columns: Iterable of objects exposing ``name`` and ``_fields``
            (the list of recorded values).

    Returns:
        List of ``(name, (ratio, n_unique, n_values))`` tuples, sorted by the
        statistics tuple in descending order. A column without values gets a
        ratio of ``0.0`` rather than raising ``ZeroDivisionError``. One entry
        is produced per column, so columns sharing a name are all reported.
    """
    stats = []
    for column in columns:
        n_values = len(column._fields)
        n_unique = len(set(column._fields))
        ratio = n_unique / n_values if n_values else 0.0
        stats.append((column.name, (ratio, n_unique, n_values)))
    stats.sort(key=lambda item: item[1], reverse=True)
    return stats


def main(argv=None):
    """Parse command-line arguments and print the requested column reports.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('path')
    parser.add_argument('--delim', default=r'[,\t]')
    parser.add_argument(
        '--no-header', dest='has_header', default=True, action='store_false')
    parser.add_argument(
        '--count', dest='do_print_count',
        default=False, action='store_true',
        help='Print number of fields per column',
    )
    parser.add_argument(
        '--longest', dest='do_print_longest',
        default=False, action='store_true',
        help='Print first, longest value of each column',
    )
    parser.add_argument(
        '--shortest', dest='do_print_shortest',
        default=False, action='store_true',
        help='Print first, shortest of each column',
    )
    parser.add_argument(
        '--values', dest='do_print_values',
        default=False, action='store_true',
        help='[not recommended] Print all values.',
    )
    parser.add_argument(
        '--unique', dest='do_print_unique',
        default=False, action='store_true',
        help='Print number of unique values in the column.',
    )
    args = parser.parse_args(argv)

    columns = aggregate_columns(
        args.path, delim=args.delim, has_header=args.has_header)

    # write out column lengths
    for column in columns:
        print('{}: ({}, {})'.format(
            column.name,
            column.minwidth,
            column.maxwidth,
        ))

    if args.do_print_count:
        print('\n-- Count')
        for column in columns:
            print('{}: {}'.format(column.name, column.n_fields))

    if args.do_print_longest:
        print('\n-- Longest')
        for column in columns:
            print('{}: "{}"'.format(column.name, column.longest))

    if args.do_print_shortest:
        print('\n-- Shortest')
        for column in columns:
            print('{}: "{}"'.format(column.name, column.shortest))

    if args.do_print_values:
        print('\n-- Values')
        for column in columns:
            print('{}: {}'.format(column.name, column._fields))

    if args.do_print_unique:
        print('\n-- Unique')
        for name, values in unique_stats(columns):
            print('{1:6.2%} {0} ({2} of {3})'.format(name, *values))


if __name__ == '__main__':
    main()
# __END__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ ~/dev/scripts/colsize.py test.txt --count --longest --shortest --values | |
foo: (1, 3) | |
bar: (0, 3) | |
baz: (1, 9) | |
unnamed_004: (1, 1) | |
-- Count | |
foo: 6 | |
bar: 6 | |
baz: 6 | |
unnamed_004: 1 | |
-- Longest | |
foo: "waa" | |
bar: "b,c" | |
baz: "off_topic" | |
unnamed_004: "z" | |
-- Shortest | |
foo: "a" | |
bar: "" | |
baz: "d" | |
unnamed_004: "z" | |
-- Values | |
foo: ['1', 'a', 'waa', 'm', '4', '7'] | |
bar: ['2', 'b,c', 'x', 'n', '', ''] | |
baz: ['3', 'd', 'y', 'off_topic', '6', '9'] | |
unnamed_004: ['z'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
foo,bar,baz | |
1,2,3 | |
a,"b,c",d | |
waa,x,y,z | |
m n off_topic | |
4,,6 | |
7 9 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment