Created
April 28, 2017 05:15
-
-
Save charmoniumQ/e0e907912be37ef4103b5957b2ecf95c to your computer and use it in GitHub Desktop.
CSV-aware cut -- outputs chosen columns of its input
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import click | |
import sys | |
import csv | |
# Version number substituted into the banner template below.
version = 0.1
# Banner template: print_version fills the {version} placeholder via
# str.format and strips the surrounding newlines before printing.
version_info = '''
csvcut v{version}
by Sam Grayson
(backwards-c) 2017 GPLv3
'''
@click.command()
@click.option('--fields', '-f', type=str,
              help='select only these (numbered) fields in this order')
@click.option('--kfields', '-k', type=str,
              help='select only these keyword fields in this order')
# NOTE(review): declared with type=bool and no is_flag, so click presumably
# expects an explicit value here (e.g. "--complement true") -- confirm before
# turning it into a flag, since that would change the command line interface.
@click.option('--complement', default=False, type=bool,
              help='inverts the selection')
# --help implicit option
@click.option('--version', 'version', flag_value=True, default=False, type=bool,
              help='prints version and exits')
@click.argument('file', default=sys.stdin, type=click.File('r'))
def main(fields, kfields, complement, version, file):
    '''CSV-aware cut -- outputs chosen columns of its input

    This script is a thin wrapper around Python's CSV (Comma-Separated Values)
    library with a cut-like interface. It is capable of handling escaped commas and
    newlines.

    With no FILE, or when FILE is -, read standard input.

    If neither keyword fields nor numbered fields are supplied, print the header.

    For keyword fields with -k, the first row is interpreted as the header.

    For numbered fields with -f, this accepts comma separated field-ranges of the
    following form where N, and M are integers (like the cut syntax):

    \b
      N      N'th field
      N-     from N'th field
      N-M    from N'th to M'th (included) field
      -M     from first to M'th (included) field

    Fields out of the range of the row will extract the empty string. Use / in place
    of a minus-sign to input negative number. Negative indices are interpreted as
    counting from the end of the line.
    '''
    # The docstring above doubles as the --help text, so implementation notes
    # live in comments instead:
    #   fields     -- raw -f specification string, or None when not given
    #   kfields    -- raw -k comma-separated column names, or None
    #   complement -- truthy to output every column *except* the selected ones
    #   version    -- truthy to print the version banner and stop
    #   file       -- readable text stream holding the CSV input
    # Returns whatever the dispatched helper returns.
    if version:
        return print_version()
    if fields and kfields:
        # A usage error lets click report the problem as a normal command-line
        # error instead of surfacing a Python traceback.
        raise click.UsageError(
            'Cannot provide both numbered fields and keyword fields')
    if fields:
        return print_fields(file, fields, complement)
    if kfields:
        return print_kfields(file, kfields, complement)
    # Neither selector was supplied: just list the column names.
    return print_header(file)
def str_to_indices(fields):
    '''Parse a cut-style field specification into index selectors.

    ``fields`` is a comma separated string.  Each item is either a single
    integer or a range written ``N-M``, ``N-`` or ``-M``.  Because ``-`` is the
    range separator, a negative number is written with ``/`` in place of the
    minus sign (``/1`` means ``-1``).

    Returns a list whose entries are either an ``int`` (single index) or a
    ``(start, stop)`` tuple, where ``stop`` is ``None`` for an open-ended
    ``N-`` range and ``start`` is ``0`` for a ``-M`` range.

    Raises ``RuntimeError`` when an item cannot be parsed, including a bare
    ``-`` and non-integer parts.
    '''
    def parse_error(field):
        return RuntimeError(
            'Cannot parse {field}\nShould be M, M-, -M, or N-M'.format(field=field))

    def to_int(text, field):
        # '/' stands in for the minus sign of a negative number.
        try:
            return int(text.replace('/', '-'))
        except ValueError as err:
            # Report malformed numbers the same way as malformed ranges so the
            # caller only has one exception type to handle.
            raise parse_error(field) from err

    slices = []
    for field in fields.split(','):
        if '-' not in field:
            slices.append(to_int(field, field))
            continue
        span = field.split('-')
        # Exactly one separator is allowed, and at least one side must be
        # present: a bare '-' used to be dropped without any diagnostic.
        if len(span) != 2 or not (span[0] or span[1]):
            raise parse_error(field)
        start, stop = span
        if start and stop:
            slices.append((to_int(start, field), to_int(stop, field)))
        elif start:
            slices.append((to_int(start, field), None))
        else:
            slices.append((0, to_int(stop, field)))
    return slices
def print_fields(file, fields, complement):
    '''Write the numbered columns selected by ``fields`` to standard output.

    ``file`` is the CSV input stream, ``fields`` the raw ``-f`` specification
    string and ``complement`` asks for the selection to be inverted.

    Returns 1 when the specification cannot be parsed, otherwise the result of
    ``print_fields_``.
    '''
    try:
        indices = str_to_indices(fields)
    except (RuntimeError, ValueError) as e:
        # int() reports malformed numbers as ValueError, so handle it too.
        # Diagnostics go to stderr so they never mix with the CSV on stdout.
        print(str(e), file=sys.stderr)
        return 1
    return print_fields_(file, sys.stdout, indices, complement)
def print_fields_(infile, outfile, indices, complement):
    '''Copy the selected columns of every CSV row from infile to outfile.

    ``indices`` is the selector list produced by ``str_to_indices``.  It is
    resolved against each row's own length, so negative and open-ended
    selectors work on ragged input.  When ``complement`` is truthy the columns
    that were *not* selected are written instead.  Selected positions that fall
    outside a row produce the empty string.

    Returns 0.
    '''
    csv_in = csv.reader(infile)
    csv_out = csv.writer(outfile)
    for line in csv_in:
        width = len(line)
        # Keep ``indices`` untouched: overwriting it would make every later row
        # reuse the positions computed for the first row.
        selected = indices_to_positive_indices(indices, width)
        if complement:
            # The ``complement`` parameter hides the module-level helper of the
            # same name, so the inversion is done inline here.
            chosen = set(selected)
            selected = [position for position in range(width)
                        if position not in chosen]
        csv_out.writerow(project_arr(line, selected, ''))
    return 0
def print_kfields(file, kfields, complement):
    '''Write the named columns selected by ``kfields`` to standard output.

    ``kfields`` is the raw comma separated ``-k`` string; ``complement`` asks
    for the selection to be inverted.  Returns the result of
    ``print_kfields_`` so the status is passed on like in ``print_fields``.
    '''
    return print_kfields_(file, sys.stdout, kfields.split(','), complement)
def print_kfields_(infile, outfile, fields, complement):
    '''Copy the named columns of every CSV record from infile to outfile.

    The first row of ``infile`` is read as the header.  ``fields`` lists the
    column names to output, in order; when ``complement`` is truthy every
    header column *not* in ``fields`` is output instead, in header order.
    Names missing from a record produce the empty string.

    Returns 0.
    '''
    csv_in = csv.DictReader(infile)
    if complement:
        excluded = set(fields)
        # fieldnames is None for completely empty input.
        fields = [name for name in (csv_in.fieldnames or [])
                  if name not in excluded]
    # The writer is built only after the final column list is known; building
    # it earlier made complemented rows fail with "dict contains fields not in
    # fieldnames".  restval/extrasaction let the writer do the projection.
    csv_out = csv.DictWriter(outfile, fields, restval='', extrasaction='ignore')
    # NOTE(review): the header row is consumed but never written, unlike the
    # numbered-field path which passes the first row through -- confirm intent.
    for line in csv_in:
        csv_out.writerow(line)
    return 0
def print_header(file):
    '''Print each column name of the CSV header on its own line.

    The first row of ``file`` is taken as the header.  Nothing is printed when
    the input is completely empty.  Returns 0.
    '''
    csv_in = csv.DictReader(file)
    names = csv_in.fieldnames
    if names is None:
        # Empty input has no header row; joining None would raise TypeError.
        return 0
    print('\n'.join(names))
    return 0
def print_version():
    '''Print the version banner with its surrounding blank lines removed.'''
    # The template's placeholders are filled from the module-level names.
    banner = version_info.format(**globals())
    print(banner.strip())
def indices_to_positive_indices(indices, length):
    '''Expand index selectors into a flat list of positions for one row.

    ``indices`` holds ints and ``(start, stop)`` tuples; ``length`` is the
    number of fields in the row.  Negative values are shifted by ``length`` so
    they count from the end, a ``stop`` of ``None`` means "through the last
    field", and ranges are inclusive at both ends (like unix cut).
    '''
    def from_end(position):
        # Shift a negative position once by the row length; others pass through.
        return position + length if position < 0 else position

    resolved = []
    for selector in indices:
        if not isinstance(selector, tuple):
            resolved.append(from_end(selector))
            continue
        first, last = selector
        first = from_end(first)
        last = length - 1 if last is None else from_end(last)
        resolved.extend(range(first, last + 1))
    return resolved
def complement(indices, length):
    '''Return the positions in ``range(length)`` that are not in ``indices``.

    The result is in ascending order; entries of ``indices`` outside the range
    are ignored.
    '''
    # Build the lookup set once instead of once per candidate position.
    excluded = set(indices)
    return [index for index in range(length) if index not in excluded]
def project(d, keys, default=None):
    '''Return a new dict restricted to ``keys``.

    Each key maps to its value in ``d``, or to ``default`` when ``d`` has no
    such key.
    '''
    selected = {}
    for key in keys:
        selected[key] = d.get(key, default)
    return selected
def project_arr(arr, indices, default=None):
    '''Return the items of ``arr`` at ``indices``, in that order.

    A position that is negative or past the end of ``arr`` contributes
    ``default`` instead of an item.
    '''
    size = len(arr)
    picked = []
    for position in indices:
        in_bounds = 0 <= position < size
        picked.append(arr[position] if in_bounds else default)
    return picked
def sniff_dialect(file):
    '''Guess the CSV dialect of ``file`` from its first 1024 characters.

    The stream is rewound to the start afterwards, so it has to support
    ``seek``.  Returns the dialect found by ``csv.Sniffer``.
    '''
    sample = file.read(1024)
    detected = csv.Sniffer().sniff(sample)
    file.seek(0)
    return detected
# Invoke the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment