Skip to content

Instantly share code, notes, and snippets.

@johnstcn
Created April 11, 2014 02:30
Show Gist options
  • Save johnstcn/10437873 to your computer and use it in GitHub Desktop.
Save johnstcn/10437873 to your computer and use it in GitHub Desktop.
Tool to merge csv files given a commonly named column header
#!/usr/bin/env python
import csv
import argparse
import sys
from collections import defaultdict
helptext = """Merge multiple CSV files using a commonly named column.
CSV files must have column headings, but do not need to be sorted.
Rows with incomplete information can be kept or discarded."""
ap = argparse.ArgumentParser(helptext)
ap.add_argument("files", nargs="+", help="CSV files to merge", type=argparse.FileType('r'))
ap.add_argument("key", help="Use this column header to merge the files")
ap.add_argument("--keep", action='store_true', help="Keep rows with incomplete information")
ap.add_argument("--outfile", help="File to write to (default is stdout)", default=sys.stdout, type=argparse.FileType('w'))
args = ap.parse_args()
output_rows = defaultdict(dict)
# keep tab of all fields we've seen
allfields = set()
# slurp in all the files and store the info
for infile in args.files:
rdr = csv.DictReader(infile)
# add in any new fieldnames that may exist
allfields.update(rdr.fieldnames)
for row in rdr:
output_rows[row[args.key]].update(row)
wrtr = csv.DictWriter(args.outfile, fieldnames=sorted(allfields), dialect=csv.excel)
# write column headings
wrtr.writeheader()
# write the output to our file
for outrow in output_rows.values():
# only write a row if it has all the fields we're interested in, or if we care about ones with incomplete info
if (set(outrow.keys()) == allfields) or args.keep:
wrtr.writerow(outrow)
args.outfile.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment