dsoprea · August 29, 2024 22:31
diff --git a/csv_translate_values b/csv_translate_values
 #!/usr/bin/env python3
 """
 Apply data from one or more CSVs to translate values in the CSV on STDIN.
 This will only read a certain source CSV once even if provided multiple times to
 translate multiple columns in the principal data.
 """

 """
 Copyright 2024 Dustin Oprea

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the “Software”), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 """

 import sys
 import argparse
 import collections
 import csv

 _DESCRIPTION = \
    "Read a CSV from STDIN and translate certain values based on secondary " \
    "CSVs."

 def _get_args():
    parser = \
        argparse.ArgumentParser(
            description=_DESCRIPTION)

    parser.add_argument(
        '--csv',
        dest='from_csvs',
        action='append',
        nargs=4,
        default=[],
        metavar=('FILEPATH', 'SOURCE-FROM-COLUMN', 'SOURCE-TO-COLUMN', 'ACTIVE-COLUMN'),
        help="Take the values from the given 'from' column in the given CSV and translate it to the values in the 'to' columns in the 'active' column of the principal file")

    args = parser.parse_args()
    return args

 def _print(message):
    print(message, file=sys.stderr)

 def _main():
    args = _get_args()

    assert \
        args.from_csvs, \
        "At least one CSV must be given."

    assert \
        sys.stdin.isatty() is False, \
        "Please pipe data via STDIN."


    # Group CSVs by file

    csvs_grouped_by_file = collections.defaultdict(list)

    for filepath, source_from_column, source_to_column, active_column \
            in args.from_csvs:

        csvs_grouped_by_file[filepath].append((
            source_from_column,
            source_to_column,
            active_column,
        ))


    # Read CSVs and build translation data

    translations = collections.defaultdict(dict)

    for filepath, specs in csvs_grouped_by_file.items():

        _print("Loading translation data: [{}]".format(filepath))

        with open(filepath) as f:
            c = csv.DictReader(f)
            for record in c:
                for source_from_column, source_to_column, active_column in specs:

                    from_ = record[source_from_column]
                    to_ = record[source_to_column]

                    translations[active_column][from_] = to_


    # Enumerate principal CSV records and apply translations

    in_ = csv.DictReader(sys.stdin)

    out_ = csv.DictWriter(sys.stdout, in_.fieldnames)
    out_.writeheader()

    translation_column_names = translations.keys()

    # Sort, so that we maintain the order that we process the columns in (on
    # principal, and for debugging)
    translation_column_names = sorted(translation_column_names)

    for i, record in enumerate(in_):
        for column_name in translation_column_names:
            value = record.get(column_name)

            if value is not None:
                # A value for this column was present and non-None

                try:
                    translated = translations[column_name][value]

                except KeyError:
                    pass

                else:
                    _print("TRANSLATE: [{}] [{}]->[{}]".format(
                           column_name, value, translated))

                    record[column_name] = translated

        out_.writerow(record)


 _main()
	#!/usr/bin/env python3
	"""
	Apply data from one or more CSVs to translate values in the CSV on STDIN.
	This will only read a certain source CSV once even if provided multiple times to
	translate multiple columns in the principal data.
	"""

	"""
	Copyright 2024 Dustin Oprea

	Permission is hereby granted, free of charge, to any person obtaining a copy
	of this software and associated documentation files (the “Software”), to deal
	in the Software without restriction, including without limitation the rights
	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	copies of the Software, and to permit persons to whom the Software is
	furnished to do so, subject to the following conditions:

	The above copyright notice and this permission notice shall be included in all
	copies or substantial portions of the Software.

	THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	SOFTWARE.
	"""

	import sys
	import argparse
	import collections
	import csv

	_DESCRIPTION = \
	"Read a CSV from STDIN and translate certain values based on secondary " \
	"CSVs."

	def _get_args():
	parser = \
	argparse.ArgumentParser(
	description=_DESCRIPTION)

	parser.add_argument(
	'--csv',
	dest='from_csvs',
	action='append',
	nargs=4,
	default=[],
	metavar=('FILEPATH', 'SOURCE-FROM-COLUMN', 'SOURCE-TO-COLUMN', 'ACTIVE-COLUMN'),
	help="Take the values from the given 'from' column in the given CSV and translate it to the values in the 'to' columns in the 'active' column of the principal file")

	args = parser.parse_args()
	return args

	def _print(message):
	print(message, file=sys.stderr)

	def _main():
	args = _get_args()

	assert \
	args.from_csvs, \
	"At least one CSV must be given."

	assert \
	sys.stdin.isatty() is False, \
	"Please pipe data via STDIN."


	# Group CSVs by file

	csvs_grouped_by_file = collections.defaultdict(list)

	for filepath, source_from_column, source_to_column, active_column \
	in args.from_csvs:

	csvs_grouped_by_file[filepath].append((
	source_from_column,
	source_to_column,
	active_column,
	))


	# Read CSVs and build translation data

	translations = collections.defaultdict(dict)

	for filepath, specs in csvs_grouped_by_file.items():

	_print("Loading translation data: [{}]".format(filepath))

	with open(filepath) as f:
	c = csv.DictReader(f)
	for record in c:
	for source_from_column, source_to_column, active_column in specs:

	from_ = record[source_from_column]
	to_ = record[source_to_column]

	translations[active_column][from_] = to_


	# Enumerate principal CSV records and apply translations

	in_ = csv.DictReader(sys.stdin)

	out_ = csv.DictWriter(sys.stdout, in_.fieldnames)
	out_.writeheader()

	translation_column_names = translations.keys()

	# Sort, so that we maintain the order that we process the columns in (on
	# principal, and for debugging)
	translation_column_names = sorted(translation_column_names)

	for i, record in enumerate(in_):
	for column_name in translation_column_names:
	value = record.get(column_name)

	if value is not None:
	# A value for this column was present and non-None

	try:
	translated = translations[column_name][value]

	except KeyError:
	pass

	else:
	_print("TRANSLATE: [{}] [{}]->[{}]".format(
	column_name, value, translated))

	record[column_name] = translated

	out_.writerow(record)


	_main()
No results found