Skip to content

Instantly share code, notes, and snippets.

@dsoprea
Last active August 29, 2024 22:31
Show Gist options
  • Save dsoprea/ec825ba05c93d09a96590b7abae3d188 to your computer and use it in GitHub Desktop.
Save dsoprea/ec825ba05c93d09a96590b7abae3d188 to your computer and use it in GitHub Desktop.
Translate CSV values based on the columns in other CSVs
#!/usr/bin/env python3
"""
Apply data from one or more CSVs to translate values in the CSV on STDIN.
This will only read a certain source CSV once even if provided multiple times to
translate multiple columns in the principal data.
"""
"""
Copyright 2024 Dustin Oprea
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import sys
import argparse
import collections
import csv
_DESCRIPTION = \
"Read a CSV from STDIN and translate certain values based on secondary " \
"CSVs."
def _get_args():
parser = \
argparse.ArgumentParser(
description=_DESCRIPTION)
parser.add_argument(
'--csv',
dest='from_csvs',
action='append',
nargs=4,
default=[],
metavar=('FILEPATH', 'SOURCE-FROM-COLUMN', 'SOURCE-TO-COLUMN', 'ACTIVE-COLUMN'),
help="Take the values from the given 'from' column in the given CSV and translate it to the values in the 'to' columns in the 'active' column of the principal file")
args = parser.parse_args()
return args
def _print(message):
print(message, file=sys.stderr)
def _main():
args = _get_args()
assert \
args.from_csvs, \
"At least one CSV must be given."
assert \
sys.stdin.isatty() is False, \
"Please pipe data via STDIN."
# Group CSVs by file
csvs_grouped_by_file = collections.defaultdict(list)
for filepath, source_from_column, source_to_column, active_column \
in args.from_csvs:
csvs_grouped_by_file[filepath].append((
source_from_column,
source_to_column,
active_column,
))
# Read CSVs and build translation data
translations = collections.defaultdict(dict)
for filepath, specs in csvs_grouped_by_file.items():
_print("Loading translation data: [{}]".format(filepath))
with open(filepath) as f:
c = csv.DictReader(f)
for record in c:
for source_from_column, source_to_column, active_column in specs:
from_ = record[source_from_column]
to_ = record[source_to_column]
translations[active_column][from_] = to_
# Enumerate principal CSV records and apply translations
in_ = csv.DictReader(sys.stdin)
out_ = csv.DictWriter(sys.stdout, in_.fieldnames)
out_.writeheader()
translation_column_names = translations.keys()
# Sort, so that we maintain the order that we process the columns in (on
# principal, and for debugging)
translation_column_names = sorted(translation_column_names)
for i, record in enumerate(in_):
for column_name in translation_column_names:
value = record.get(column_name)
if value is not None:
# A value for this column was present and non-None
try:
translated = translations[column_name][value]
except KeyError:
pass
else:
_print("TRANSLATE: [{}] [{}]->[{}]".format(
column_name, value, translated))
record[column_name] = translated
out_.writerow(record)
_main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment