Last active
April 4, 2023 17:37
-
-
Save tsibley/16895440555b9cb3d27e7429fb495e04 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
usage: tsv-cast-header <target.tsv> <source.tsv>

Casts a <source.tsv> into the header of <target.tsv>.

Columns are reordered, dropped, and added as necessary. Added columns will
have blank values.

No output will be emitted if <source.tsv> has no rows. <target.tsv> must have
at least a header line.

All conversion is performed in a memory efficient manner, and inputs do not
need to be seekable.
"""
import csv
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from sys import stdin, stdout, stderr, exit


class CastError(Exception):
    """Base class for problems that prevent casting <source.tsv>."""


class EmptyTargetError(CastError):
    """Raised when the target has no header line at all."""


class NoSharedColumnsError(CastError):
    """Raised when the target and source headers have no column in common."""


def read_header(target):
    """
    Return the first row of the open TSV file *target* as a list of column
    names.

    Only the first line is consumed.  Raises EmptyTargetError if *target*
    contains no lines.
    """
    try:
        return next(csv.reader(target, dialect = "excel-tab"))
    except StopIteration:
        raise EmptyTargetError() from None


def cast_rows(header, source, output):
    """
    Stream the rows of the open TSV file *source* to *output*, recast into the
    columns named by *header*.

    Columns of *source* which are not in *header* are dropped; columns of
    *header* which are not in *source* are written with blank values.  Nothing
    (not even the header) is written if *source* has no data rows.

    Raises NoSharedColumnsError, before writing anything, if *source* has at
    least one row but shares no column names with *header*.
    """
    reader = csv.DictReader(source, dialect = "excel-tab")

    # The writer must use the tab dialect explicitly: csv.DictWriter defaults
    # to the comma-delimited "excel" dialect, which would emit CSV, not TSV.
    writer = csv.DictWriter(
        output,
        header,
        restval = "",
        extrasaction = "ignore",
        dialect = "excel-tab",
        lineterminator = "\n")

    # Pull the first row by hand so that the header is only validated and
    # written once we know there is at least one row to emit.
    first_row = next(reader, None)

    if first_row is None:
        return

    if not set(reader.fieldnames) & set(header):
        raise NoSharedColumnsError()

    writer.writeheader()
    writer.writerow(first_row)

    # writerows() consumes the reader lazily, one row at a time.
    writer.writerows(reader)


def main(argv = None):
    """
    Run the command-line interface and return the process exit status.

    *argv* is the list of command-line arguments (excluding the program name);
    when None, argparse reads them from sys.argv.  Returns 0 on success and 1
    if the target is empty or the two files share no columns.
    """
    # The description is everything after the usage paragraph of the module
    # docstring.  partition() (unlike split()[1]) cannot raise if the
    # docstring is missing or has no blank line.
    description = (__doc__ or "").strip().partition("\n\n")[2]

    cli = ArgumentParser(
        description = description,
        formatter_class = RawDescriptionHelpFormatter)

    cli.add_argument("target", metavar = "<target.tsv>")
    cli.add_argument("source", metavar = "<source.tsv>")

    args = cli.parse_args(argv)

    try:
        # Read header line of <target.tsv>
        with open(args.target, "r", encoding = "utf-8", newline = "") as target:
            header = read_header(target)

        # Cast <source.tsv>
        with open(args.source, "r", encoding = "utf-8", newline = "") as source:
            cast_rows(header, source, stdout)

    except EmptyTargetError:
        print(f"{cli.prog}: error: {args.target!r} (the target) appears to be empty; it must contain at least a header line", file = stderr)
        return 1

    except NoSharedColumnsError:
        print(f"{cli.prog}: error: {args.target!r} (the target) and {args.source!r} (the source) share no columns; they must share at least one", file = stderr)
        return 1

    return 0


if __name__ == "__main__":
    exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment