A tool to identify rows in a CSV file that may represent the same person, based on one or more provided matching types
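For example, running `ruby person_grouper.rb contacts.csv phone_or_email` (where `contacts.csv` is a hypothetical input file) writes `contacts_output.csv`: a copy of the input with a leading `uuid` column, in which rows that share a normalized phone number or email address receive the same `uuid` value.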
#!/usr/bin/env ruby
# frozen_string_literal: true

# Name: Person Grouper (person_grouper.rb)
# Description: A tool to identify rows in a CSV file that may represent
#              the same person, based on one or more provided matching types
# Limitations: Algorithm is forward-facing only (no retrospective merging)
# Author: Zoë Kelly ([email protected])
# License: MIT
# Created: 2025-02-16
#
# Usage: ruby person_grouper.rb <input_file_path> <matching_type>
# Test: ruby person_grouper.rb test

require 'csv'
require 'benchmark'
require 'minitest'
class PersonGrouper
  class << self
    def process(input_file_path, matching_type)
      new(input_file_path:, matching_type:).process
    end
  end

  def initialize(matching_type:, input_file_path: nil, input_csv: nil)
    @input_file_path = input_file_path
    @matching_type = matching_type
    @input_csv = input_csv
    @person_uuid_counter = 1 # Counter for unique person UUIDs
    @attribute_uuid_counter = 1 # Counter for unique attribute UUIDs
    @attribute_to_attribute_uuid_hash_map = {} # Map of attribute values to their UUIDs
    @attribute_uuid_to_person_uuid_hash_map = {} # Map of attribute UUIDs to person UUIDs
    @output_rows = []
  end
  def process
    puts "Processing #{@input_file_path || 'provided CSV object'} with #{@matching_type}"

    # Input CSV can be provided as a path or as a CSV object
    unless input_csv
      puts 'Input CSV not provided!'
      return
    end

    # Process each row in the input CSV file
    input_csv.each do |row|
      # Assign or determine the Attribute UUIDs for any eligible attributes in the current row
      attribute_uuids = attribute_uuids_for(row)

      # Assign or determine the Person UUID for the current row
      person_uuid = person_uuid_for(attribute_uuids)

      # Add the Person UUID and the current row's existing values to the output rows
      @output_rows << [person_uuid].concat(row.fields)
    end

    # Create the CSV object
    output_csv = CSV::Table.new([CSV::Row.new(output_headers, [], true)])
    @output_rows.each { |row| output_csv << row }

    if @input_file_path
      # Generate string and write to output CSV file
      output_file = @input_file_path.sub('.csv', '_output.csv')
      File.write(output_file, output_csv.to_csv)
      puts "Done! Output written to #{output_file}"
    else
      # Return the CSV object when using input_csv
      output_csv
    end
  rescue StandardError => e
    puts "Sorry, there was an error: #{e.message}"
  end
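  # Illustrative sketch (not executed): tracing the loop above against the test data at the
  # bottom of this file with matching_type 'phone', the output table comes out roughly as:
  #
  #   uuid,first_name,last_name,phone1,phone2,email
  #   1,John,Doe,123-456-7890,,[email protected]
  #   1,Johnny,Doe,,123.456.7890,[email protected]
  #   2,Jane,Smith,555-867-5309,,[email protected]
  #   3,Bob,Jones,777-888-9999,,[email protected]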
  private

  # Determine the output headers
  def output_headers
    ['uuid'].concat(input_headers)
  end

  # Determine the Attribute UUIDs for any eligible attributes in the current row
  def attribute_uuids_for(row)
    attribute_uuids = []

    # Iterate over each keyword and its corresponding column indexes
    keyword_to_column_indexes.each do |keyword, column_indexes|
      column_indexes.each do |column_index|
        next unless (column_raw_value = row[column_index])

        # Normalize the column value
        column_value = normalize_column_value(keyword, column_raw_value)

        # Determine the composite attribute
        # This is to ensure that e.g. a first name cannot be matched with a last name,
        # even if they are the same name
        attribute = "#{keyword}_#{column_value}"

        # Determine the UUID for the attribute if it already exists
        attribute_uuid = @attribute_to_attribute_uuid_hash_map[attribute]

        # If the attribute does not yet have a UUID, assign one and increment the counter,
        # saving the mapping to the hash map for efficient subsequent lookups
        if attribute_uuid.nil?
          @attribute_to_attribute_uuid_hash_map[attribute] = @attribute_uuid_counter
          attribute_uuid = @attribute_uuid_counter
          @attribute_uuid_counter += 1
        end

        attribute_uuids << attribute_uuid
      end
    end

    # Return the list of attribute UUIDs found for the current row
    attribute_uuids
  end
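  # Illustrative sketch (not executed): for the first test row below and matching_type
  # 'phone_or_email', the composite attributes are "phone_234567890" and
  # "email_[email protected]"; being new, they receive Attribute UUIDs [1, 2].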
  # Determine the Person UUID for the current row
  def person_uuid_for(attribute_uuids)
    # Find out if we have previously linked a Person UUID to any of the attributes
    person_uuids = attribute_uuids.map do |attribute_uuid|
      @attribute_uuid_to_person_uuid_hash_map[attribute_uuid]
    end

    # Although there could be more than one Person UUID linked to the attributes,
    # we choose to use the lowest value, as it is the oldest Person UUID.
    # This keeps the algorithm forward-facing and relatively accurate.
    person_uuid = person_uuids.compact.uniq.min

    # If we haven't found any Person UUIDs linked to the attributes, then all attributes
    # must be new, and so we should assign a new Person UUID and increment the counter
    if person_uuid.nil?
      person_uuid = @person_uuid_counter
      @person_uuid_counter += 1
    end

    # Save the mapping of the attribute UUIDs to the Person UUID, only if the attribute UUID
    # is not already mapped to a Person UUID. If it is already mapped, we do not overwrite it,
    # to ensure that UUIDs are not skipped. This is a trade-off within the forward-facing algorithm.
    attribute_uuids.each do |attribute_uuid|
      @attribute_uuid_to_person_uuid_hash_map[attribute_uuid] ||= person_uuid
    end

    person_uuid
  end
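  # Illustrative sketch (not executed) of the forward-facing trade-off, using hypothetical
  # attributes P (a phone) and E (an email):
  #   Row A has only P       => new Person UUID 1; P is linked to 1
  #   Row B has only E       => new Person UUID 2; E is linked to 2
  #   Row C has both P and E => min(1, 2) = 1, so Row C gets 1, but E stays linked to 2
  # A later row matching only E would still receive 2; earlier assignments are never merged.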
  # If the input CSV object was not provided during initialization, read it from the input file path.
  def input_csv
    @input_csv ||= CSV.read(@input_file_path, headers: true)
  end

  # Determine the headers of the input CSV file
  def input_headers
    @input_headers ||= input_csv.headers
  end

  # Determine the keywords from the matching type
  # A keyword is a string provided by the user as part of the matching type
  # e.g. 'phone_or_email_or_lastname' => ['phone', 'email', 'lastname']
  def keywords
    @matching_type.split('_or_').map(&:strip)
  end

  # Determine the column indexes for each keyword
  def keyword_to_column_indexes
    # Memoize the result
    return @keyword_to_column_indexes if defined?(@keyword_to_column_indexes)

    @keyword_to_column_indexes = {}

    # Iterate over each header and its corresponding column index
    input_headers.each_with_index do |header, column_index|
      # Determine if the header matches any of the keywords
      next unless (keyword = keywords.find { |k| header.match?(/^#{k}/i) })

      # Add the column index to the list of column indexes for the keyword
      @keyword_to_column_indexes[keyword] ||= []
      @keyword_to_column_indexes[keyword] << column_index
    end

    @keyword_to_column_indexes
  end
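  # Illustrative sketch (not executed): with the test headers below
  # (first_name,last_name,phone1,phone2,email) and matching_type 'phone_or_email',
  # this returns { 'phone' => [2, 3], 'email' => [4] }.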
  # Normalize the value of a column. Other normalization methods could be added here
  # for different types of data, such as zip/postal codes.
  def normalize_column_value(keyword, value)
    # Ensure the value is a lowercase string with no leading or trailing whitespace
    value = value.to_s.strip.downcase

    case keyword
    when 'phone'
      # Remove any non-numeric characters and a leading '1'
      value.gsub(/[^0-9]/, '').gsub(/^1/, '')
    else
      value
    end
  end
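  # Illustrative sketch (not executed), assuming hypothetical inputs:
  #   normalize_column_value('phone', '+1 (555) 867-5309') # => "5558675309"
  #   normalize_column_value('email', ' [email protected] ')  # => "[email protected]"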
end

# -------------------------------------------------------------------------------------------------
class PersonGrouperTest < Minitest::Test
  def test_groups_related_records_by_phone
    input_csv = CSV.parse(input_csv_data, headers: true)

    # Process the CSV with phone matching
    grouper = PersonGrouper.new(matching_type: 'phone', input_csv:)
    output_csv = grouper.process

    # Convert output to array for easier testing
    output_rows = output_csv.to_a

    # Test expectations
    assert_equal 5, output_rows.length # Including header row

    # First two records should have the same UUID (same phone)
    assert_equal output_rows[1][0], output_rows[2][0]

    # Third and fourth records should have different UUIDs
    refute_equal output_rows[1][0], output_rows[3][0]
    refute_equal output_rows[3][0], output_rows[4][0]
  end

  def test_groups_related_records_by_email
    input_csv = CSV.parse(input_csv_data, headers: true)

    # Process the CSV with email matching
    grouper = PersonGrouper.new(matching_type: 'email', input_csv:)
    output_csv = grouper.process
    output_rows = output_csv.to_a

    # Test expectations
    assert_equal 5, output_rows.length # Including header row

    # First and third records should have the same UUID (same email)
    assert_equal output_rows[1][0], output_rows[3][0]

    # Second and fourth records should have the same UUID (same email)
    assert_equal output_rows[2][0], output_rows[4][0]

    # First and second records should have different UUIDs
    refute_equal output_rows[1][0], output_rows[2][0]
  end

  def test_groups_related_records_by_phone_or_email
    input_csv = CSV.parse(input_csv_data, headers: true)

    # Process the CSV with phone_or_email matching
    grouper = PersonGrouper.new(matching_type: 'phone_or_email', input_csv:)
    output_csv = grouper.process
    output_rows = output_csv.to_a

    # Test expectations
    assert_equal 5, output_rows.length # Including header row

    # First and second records should have same UUID (same phone)
    assert_equal output_rows[1][0], output_rows[2][0]

    # First and third records should have same UUID (same email)
    assert_equal output_rows[1][0], output_rows[3][0]

    # Second and fourth records should have same UUID (same email)
    assert_equal output_rows[2][0], output_rows[4][0]

    # All records should have the same UUID due to transitive relationships
    assert_equal output_rows[1][0], output_rows[4][0]
  end

  private

  def input_csv_data
    <<~CSV
      first_name,last_name,phone1,phone2,email
      John,Doe,123-456-7890,,[email protected]
      Johnny,Doe,,123.456.7890,[email protected]
      Jane,Smith,555-867-5309,,[email protected]
      Bob,Jones,777-888-9999,,[email protected]
    CSV
  end
end
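# Expected groupings, traced by hand from the test data above (the tests assert the same):
#   matching_type 'phone'          => rows 1 & 2 share a UUID; rows 3 and 4 each get their own
#   matching_type 'email'          => rows 1 & 3 share a UUID; rows 2 & 4 share another
#   matching_type 'phone_or_email' => all four rows share a single UUID (transitive matches)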
# -------------------------------------------------------------------------------------------------
if ARGV.length == 1 && ARGV[0] == 'test'
  require 'minitest/autorun'
  exit 0
elsif ARGV.length != 2
  puts 'Usage: ruby person_grouper.rb <input_file_path> <matching_type>'
  puts 'Matching type consists of one or more keywords, separated by \'_or_\''
  puts 'Examples: email, phone, email_or_phone, phone_or_email_or_lastname'
  exit 1
end

# Get the input file and matching type from the command line arguments
input_file_path = ARGV[0]
matching_type = ARGV[1]

# Benchmark the processing time of the PersonGrouper
Benchmark.bm do |x|
  x.report('Processing CSV') do
    PersonGrouper.process(input_file_path, matching_type)
  end
end