Skip to content

Instantly share code, notes, and snippets.

@vadimkantorov
Last active November 21, 2023 16:40
Show Gist options
  • Save vadimkantorov/013972939f60a85a2ffea044dfa75139 to your computer and use it in GitHub Desktop.
Save vadimkantorov/013972939f60a85a2ffea044dfa75139 to your computer and use it in GitHub Desktop.
Prints set difference of a CSV file with another CSV file based on specified join fields
# -*- coding: utf-8 -*-
#
# Usage:
# python csvsetdiff.py mycsv1.csv - mycsv2.csv --key1 "Email" --key2 "Email acheteur" > mycsvsetdiff.csv
# python3 csvsetdiff.py Contacts.csv - adherez.csv --delimiter1="," --delimiter2=";" --key1 "Email" "Email Home" --key2 "Email payeur" "Champ complémentaire 2 Email" --encoding=utf-16 --delimiter=$'\t' --ignorecase --translate .= > test3.csv
import sys
import csv
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('path1')
parser.add_argument('op', choices = ['-'], help = 'ignored, added for UX clarity; only minus sign is suppored standing for set difference')
parser.add_argument('path2')
parser.add_argument('--key1', nargs = '+', default = ['email'], help = 'ignorecase is used')
parser.add_argument('--key2', nargs = '+', default = ['email'], help = 'ignorecase is used')
parser.add_argument('--delimiter1', default = ',', choices = [',', ';', '\t'])
parser.add_argument('--delimiter2', default = ',', choices = [',', ';', '\t'])
parser.add_argument('--delimiter', default = ';', choices = [',', ';', '\t'])
parser.add_argument('--encoding1', default = 'utf-8-sig', help = 'utf-8-sig is used to remove BOM')
parser.add_argument('--encoding2', default = 'utf-8-sig', help = 'utf-8-sig is used to remove BOM')
parser.add_argument('--encoding', default = 'utf-8', help = '''to open correctly in Excel and ensure a BOM, use --encoding=utf-16 --delimiter=$'\t' ''')
parser.add_argument('--ignorecase', '-i', action = 'store_true', help = 'in the key fields')
parser.add_argument('--translate', help = '''in the key fields, to replace char 'a' to nothing and, char 'c' to char 'd', use --translate a=;c=d ''')
args = parser.parse_args()
col = lambda rows, fieldname: ([k for k in rows[0] if k.lower() == fieldname.lower()] + [None])[0] if rows else None
san = lambda fieldvalue: (fieldvalue.lower() if args.ignorecase else fieldvalue).translate(dict(v.split('=') for v in args.translate.split(';')) if args.translate else {})
print('#', args, file = sys.stderr)
reader1 = csv.DictReader(open(args.path1, encoding = args.encoding1), delimiter = args.delimiter1)
reader2 = csv.DictReader(open(args.path2, encoding = args.encoding2), delimiter = args.delimiter2)
rows1 = list(reader1)
rows2 = list(reader2)
key1 = [col(rows1, k) for k in args.key1]
key2 = [col(rows2, k) for k in args.key2]
print('# key1:', key1, file = sys.stderr)
print('# key2:', key2, file = sys.stderr)
print('# sorted(reader1.fieldnames):', sorted(reader1.fieldnames), file = sys.stderr)
print('# sorted(reader2.fieldnames):', sorted(reader2.fieldnames), file = sys.stderr)
if key1 and key2:
vals = set(san(d[k]) for d in rows2 for k in key2)
rows = [d for d in rows1 if all(san(d[k]) not in vals for k in key1)]
sys.stdout.reconfigure(encoding = args.encoding, newline = None)
writer = csv.DictWriter(sys.stdout, fieldnames = reader1.fieldnames, delimiter = args.delimiter)
writer.writeheader()
writer.writerows(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment