Skip to content

Instantly share code, notes, and snippets.

@mjordan
Last active September 30, 2021 20:53
Show Gist options
  • Save mjordan/a0f846b3bb4b3d34503f62b7d4431e62 to your computer and use it in GitHub Desktop.
Save mjordan/a0f846b3bb4b3d34503f62b7d4431e62 to your computer and use it in GitHub Desktop.
Marge CSVs
# Script to merge a secondary CSV file into a main CSV file. Columns not in a file are
# added to the merged file as empty CSV cells. Records are joined on a required 'Nid' column.
# Column headers should be unique in both files (other than 'Nid'); if they are present
# in both files, both instances will be added to the merged file.
import csv
main_filename = 'input/main.csv'
secondary_filename = 'input/secondary.csv'
output_filename = 'input/test_output.csv'
input_delimiter = '\t'
output_delimiter = ','
# Read main CSV.
main_csv_reader_file_handle = open(main_filename, 'r', encoding="utf-8", newline='')
main_csv_reader = csv.DictReader(main_csv_reader_file_handle, delimiter=input_delimiter)
main_csv_reader_fieldnames = main_csv_reader.fieldnames
main_records = dict()
for row in main_csv_reader:
main_records[row['Nid']] = dict(row)
# Read secondary CSV.
secondary_csv_reader_file_handle = open(secondary_filename, 'r', encoding="utf-8", newline='')
secondary_csv_reader = csv.DictReader(secondary_csv_reader_file_handle, delimiter=input_delimiter)
secondary_csv_reader_fieldnames = secondary_csv_reader.fieldnames
secondary_records = dict()
for row in secondary_csv_reader:
secondary_records[row['Nid']] = dict(row)
# Merge 'em.
merged_records = dict()
for main_record_nid, data in main_records.items():
merged_records[main_record_nid] = main_records[main_record_nid]
for empty_column in secondary_csv_reader_fieldnames:
if empty_column != 'Nid':
merged_records[main_record_nid][empty_column] = ''
for secondary_record_nid, data in secondary_records.items():
if secondary_record_nid in main_records.keys():
merged_records[secondary_record_nid] = dict()
merged_records[secondary_record_nid].update(secondary_records[secondary_record_nid])
merged_records[secondary_record_nid]['Nid'] = secondary_record_nid
for main_csv_column in main_csv_reader_fieldnames:
merged_records[secondary_record_nid][main_csv_column] = main_records[secondary_record_nid][main_csv_column]
else:
merged_records[secondary_record_nid] = secondary_records[secondary_record_nid]
for empty_column in main_csv_reader_fieldnames:
merged_records[secondary_record_nid][empty_column] = ''
merged_records[secondary_record_nid]['Nid'] = secondary_record_nid
# Write out merged CSV.
secondary_csv_reader_fieldnames.remove('Nid')
merged_records_fieldnames = main_csv_reader_fieldnames + secondary_csv_reader_fieldnames
writer_file_handle = open(output_filename, 'w+', newline='')
writer = csv.DictWriter(writer_file_handle, fieldnames=merged_records_fieldnames, delimiter=output_delimiter)
writer.writeheader()
for nid, record in merged_records.items():
writer.writerow(record)
writer_file_handle.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment