Last active
August 29, 2024 22:31
-
-
Save dsoprea/ec825ba05c93d09a96590b7abae3d188 to your computer and use it in GitHub Desktop.
Translate CSV values based on the columns in other CSVs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Apply data from one or more CSVs to translate values in the CSV on STDIN. | |
This will only read a certain source CSV once even if provided multiple times to | |
translate multiple columns in the principal data. | |
""" | |
""" | |
Copyright 2024 Dustin Oprea | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the “Software”), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all | |
copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
""" | |
import sys | |
import argparse | |
import collections | |
import csv | |
_DESCRIPTION = \ | |
"Read a CSV from STDIN and translate certain values based on secondary " \ | |
"CSVs." | |
def _get_args(): | |
parser = \ | |
argparse.ArgumentParser( | |
description=_DESCRIPTION) | |
parser.add_argument( | |
'--csv', | |
dest='from_csvs', | |
action='append', | |
nargs=4, | |
default=[], | |
metavar=('FILEPATH', 'SOURCE-FROM-COLUMN', 'SOURCE-TO-COLUMN', 'ACTIVE-COLUMN'), | |
help="Take the values from the given 'from' column in the given CSV and translate it to the values in the 'to' columns in the 'active' column of the principal file") | |
args = parser.parse_args() | |
return args | |
def _print(message): | |
print(message, file=sys.stderr) | |
def _main(): | |
args = _get_args() | |
assert \ | |
args.from_csvs, \ | |
"At least one CSV must be given." | |
assert \ | |
sys.stdin.isatty() is False, \ | |
"Please pipe data via STDIN." | |
# Group CSVs by file | |
csvs_grouped_by_file = collections.defaultdict(list) | |
for filepath, source_from_column, source_to_column, active_column \ | |
in args.from_csvs: | |
csvs_grouped_by_file[filepath].append(( | |
source_from_column, | |
source_to_column, | |
active_column, | |
)) | |
# Read CSVs and build translation data | |
translations = collections.defaultdict(dict) | |
for filepath, specs in csvs_grouped_by_file.items(): | |
_print("Loading translation data: [{}]".format(filepath)) | |
with open(filepath) as f: | |
c = csv.DictReader(f) | |
for record in c: | |
for source_from_column, source_to_column, active_column in specs: | |
from_ = record[source_from_column] | |
to_ = record[source_to_column] | |
translations[active_column][from_] = to_ | |
# Enumerate principal CSV records and apply translations | |
in_ = csv.DictReader(sys.stdin) | |
out_ = csv.DictWriter(sys.stdout, in_.fieldnames) | |
out_.writeheader() | |
translation_column_names = translations.keys() | |
# Sort, so that we maintain the order that we process the columns in (on | |
# principal, and for debugging) | |
translation_column_names = sorted(translation_column_names) | |
for i, record in enumerate(in_): | |
for column_name in translation_column_names: | |
value = record.get(column_name) | |
if value is not None: | |
# A value for this column was present and non-None | |
try: | |
translated = translations[column_name][value] | |
except KeyError: | |
pass | |
else: | |
_print("TRANSLATE: [{}] [{}]->[{}]".format( | |
column_name, value, translated)) | |
record[column_name] = translated | |
out_.writerow(record) | |
_main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment