Last active
September 30, 2021 20:53
-
-
Save mjordan/a0f846b3bb4b3d34503f62b7d4431e62 to your computer and use it in GitHub Desktop.
Marge CSVs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script to merge a secondary CSV file into a main CSV file. Columns not in a file are | |
# added to the merged file as empty CSV cells. Records are joined on a required 'Nid' column. | |
# Column headers should be unique in both files (other than 'Nid'); if they are present | |
# in both files, both instances will be added to the merged file. | |
import csv | |
main_filename = 'input/main.csv' | |
secondary_filename = 'input/secondary.csv' | |
output_filename = 'input/test_output.csv' | |
input_delimiter = '\t' | |
output_delimiter = ',' | |
# Read main CSV. | |
main_csv_reader_file_handle = open(main_filename, 'r', encoding="utf-8", newline='') | |
main_csv_reader = csv.DictReader(main_csv_reader_file_handle, delimiter=input_delimiter) | |
main_csv_reader_fieldnames = main_csv_reader.fieldnames | |
main_records = dict() | |
for row in main_csv_reader: | |
main_records[row['Nid']] = dict(row) | |
# Read secondary CSV. | |
secondary_csv_reader_file_handle = open(secondary_filename, 'r', encoding="utf-8", newline='') | |
secondary_csv_reader = csv.DictReader(secondary_csv_reader_file_handle, delimiter=input_delimiter) | |
secondary_csv_reader_fieldnames = secondary_csv_reader.fieldnames | |
secondary_records = dict() | |
for row in secondary_csv_reader: | |
secondary_records[row['Nid']] = dict(row) | |
# Merge 'em. | |
merged_records = dict() | |
for main_record_nid, data in main_records.items(): | |
merged_records[main_record_nid] = main_records[main_record_nid] | |
for empty_column in secondary_csv_reader_fieldnames: | |
if empty_column != 'Nid': | |
merged_records[main_record_nid][empty_column] = '' | |
for secondary_record_nid, data in secondary_records.items(): | |
if secondary_record_nid in main_records.keys(): | |
merged_records[secondary_record_nid] = dict() | |
merged_records[secondary_record_nid].update(secondary_records[secondary_record_nid]) | |
merged_records[secondary_record_nid]['Nid'] = secondary_record_nid | |
for main_csv_column in main_csv_reader_fieldnames: | |
merged_records[secondary_record_nid][main_csv_column] = main_records[secondary_record_nid][main_csv_column] | |
else: | |
merged_records[secondary_record_nid] = secondary_records[secondary_record_nid] | |
for empty_column in main_csv_reader_fieldnames: | |
merged_records[secondary_record_nid][empty_column] = '' | |
merged_records[secondary_record_nid]['Nid'] = secondary_record_nid | |
# Write out merged CSV. | |
secondary_csv_reader_fieldnames.remove('Nid') | |
merged_records_fieldnames = main_csv_reader_fieldnames + secondary_csv_reader_fieldnames | |
writer_file_handle = open(output_filename, 'w+', newline='') | |
writer = csv.DictWriter(writer_file_handle, fieldnames=merged_records_fieldnames, delimiter=output_delimiter) | |
writer.writeheader() | |
for nid, record in merged_records.items(): | |
writer.writerow(record) | |
writer_file_handle.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment