Skip to content

Instantly share code, notes, and snippets.

@zk-1
Last active February 16, 2025 23:23
Show Gist options
  • Save zk-1/d602a99136a35782618423fbf4c7d0d8 to your computer and use it in GitHub Desktop.
Save zk-1/d602a99136a35782618423fbf4c7d0d8 to your computer and use it in GitHub Desktop.
A tool to identify rows in a CSV file that may represent the same person, based on one or more provided matching types
#!/usr/bin/env ruby
# frozen_string_literal: true
# Name: Person Grouper (person_grouper.rb)
# Description: A tool to identify rows in a CSV file that may represent
# the same person, based on one or more provided matching types
# Limitations: Algorithm is forward-facing only (no retrospective merging)
# Author: Zoë Kelly ([email protected])
# License: MIT
# Created: 2025-02-16
#
# Usage: ruby person_grouper.rb <input_file_path> <matching_type>
# Test: ruby person_grouper.rb test
require 'csv'
require 'benchmark'
require 'minitest'
# Groups CSV rows that appear to represent the same person.
#
# Each eligible attribute value (e.g. a normalized phone number) is assigned a
# stable integer "attribute UUID"; rows sharing any attribute UUID are linked
# to the same integer "person UUID". The algorithm is forward-facing only:
# once an attribute is linked to a person, later rows never re-link it
# (no retrospective merging).
class PersonGrouper
  class << self
    # Convenience entry point: process the CSV at +input_file_path+ with the
    # given +matching_type+ and write '<input>_output.csv' next to it.
    def process(input_file_path, matching_type)
      new(input_file_path:, matching_type:).process
    end
  end

  # @param matching_type [String] one or more keywords joined by '_or_',
  #   e.g. 'phone', 'email', 'phone_or_email'
  # @param input_file_path [String, nil] path to a headered CSV file
  # @param input_csv [CSV::Table, nil] pre-parsed CSV; used instead of the
  #   file path when provided (handy for tests)
  def initialize(matching_type:, input_file_path: nil, input_csv: nil)
    @input_file_path = input_file_path
    @matching_type = matching_type
    @input_csv = input_csv
    @person_uuid_counter = 1 # Counter for unique person UUIDs
    @attribute_uuid_counter = 1 # Counter for unique attribute UUIDs
    @attribute_to_attribute_uuid_hash_map = {} # attribute value => attribute UUID
    @attribute_uuid_to_person_uuid_hash_map = {} # attribute UUID => person UUID
    @output_rows = []
  end

  # Processes the input CSV.
  # When constructed with a file path, writes the output file and returns nil;
  # when constructed with a CSV object, returns a CSV::Table whose first
  # column is 'uuid'. Any StandardError is reported to stdout and swallowed
  # (the method then returns nil), preserving the original CLI behavior.
  def process
    puts "Processing #{@input_file_path || 'provided CSV object'} with #{@matching_type}"
    # Input CSV can be provided as a path or as a CSV object
    unless input_csv
      puts 'Input CSV not provided!'
      return
    end
    # Process each row: resolve its attribute UUIDs, then its person UUID
    input_csv.each do |row|
      attribute_uuids = attribute_uuids_for(row)
      person_uuid = person_uuid_for(attribute_uuids)
      # Prepend the person UUID to the row's existing values
      @output_rows << [person_uuid].concat(row.fields)
    end
    # Build the output table (header row first)
    output_csv = CSV::Table.new([CSV::Row.new(output_headers, [], true)])
    @output_rows.each { |row| output_csv << row }
    if @input_file_path
      File.write(output_file_path, output_csv.to_csv)
      puts "Done! Output written to #{output_file_path}"
    else
      # Return the CSV object when using input_csv
      output_csv
    end
  rescue StandardError => e
    puts "Sorry, there was an error: #{e.message}"
  end

  private

  # Derive the output path from the input path. Only a *trailing* '.csv'
  # is replaced (the previous `sub('.csv', ...)` could match '.csv' in the
  # middle of a path); a path without the extension gets a suffix appended
  # so the input file is never overwritten.
  def output_file_path
    @output_file_path ||=
      if @input_file_path.match?(/\.csv\z/i)
        @input_file_path.sub(/\.csv\z/i, '_output.csv')
      else
        "#{@input_file_path}_output.csv"
      end
  end

  # Output headers: 'uuid' followed by the original input headers
  def output_headers
    ['uuid'].concat(input_headers)
  end

  # Assign or look up the attribute UUIDs for eligible attributes in +row+.
  def attribute_uuids_for(row)
    attribute_uuids = []
    keyword_to_column_indexes.each do |keyword, column_indexes|
      column_indexes.each do |column_index|
        next unless (column_raw_value = row[column_index])
        column_value = normalize_column_value(keyword, column_raw_value)
        # Composite key so that e.g. a first name can never collide with an
        # identical last name.
        attribute = "#{keyword}_#{column_value}"
        attribute_uuid = @attribute_to_attribute_uuid_hash_map[attribute]
        # Unknown attribute: assign the next UUID and remember the mapping
        # for efficient subsequent lookups.
        if attribute_uuid.nil?
          attribute_uuid = @attribute_uuid_counter
          @attribute_to_attribute_uuid_hash_map[attribute] = attribute_uuid
          @attribute_uuid_counter += 1
        end
        attribute_uuids << attribute_uuid
      end
    end
    attribute_uuids
  end

  # Determine the person UUID for a row given its attribute UUIDs.
  def person_uuid_for(attribute_uuids)
    # Person UUIDs already linked to any of the attributes
    person_uuids = attribute_uuids.map do |attribute_uuid|
      @attribute_uuid_to_person_uuid_hash_map[attribute_uuid]
    end
    # More than one person may be linked; choose the lowest value, as it is
    # the oldest person UUID — this keeps the forward-facing algorithm
    # relatively accurate. (.uniq before .min was redundant.)
    person_uuid = person_uuids.compact.min
    # All attributes are new: mint a new person UUID
    if person_uuid.nil?
      person_uuid = @person_uuid_counter
      @person_uuid_counter += 1
    end
    # Link each attribute to the person, never overwriting an existing link.
    # This is a deliberate trade-off of the forward-facing algorithm.
    attribute_uuids.each do |attribute_uuid|
      @attribute_uuid_to_person_uuid_hash_map[attribute_uuid] ||= person_uuid
    end
    person_uuid
  end

  # Lazily read the CSV from the input file path when no CSV object was
  # provided at construction time.
  def input_csv
    @input_csv ||= CSV.read(@input_file_path, headers: true)
  end

  # Headers of the input CSV file
  def input_headers
    @input_headers ||= input_csv.headers
  end

  # Keywords parsed from the matching type, memoized so the per-header scan
  # in #keyword_to_column_indexes does not re-split the string each time.
  # e.g. 'phone_or_email_or_lastname' => ['phone', 'email', 'lastname']
  def keywords
    @keywords ||= @matching_type.split('_or_').map(&:strip)
  end

  # Map each keyword to the input column indexes whose header starts with it
  # (case-insensitive), e.g. 'phone' matches 'phone1' and 'phone2'.
  def keyword_to_column_indexes
    # Memoize the result
    return @keyword_to_column_indexes if defined?(@keyword_to_column_indexes)

    @keyword_to_column_indexes = {}
    input_headers.each_with_index do |header, column_index|
      # \A anchors to the start of the whole string (not a line, as ^ would)
      # and Regexp.escape treats keyword characters literally.
      next unless (keyword = keywords.find { |k| header.match?(/\A#{Regexp.escape(k)}/i) })

      (@keyword_to_column_indexes[keyword] ||= []) << column_index
    end
    @keyword_to_column_indexes
  end

  # Normalize the value of a column. Other normalization methods could be
  # added here for different types of data, such as zip/postal codes.
  def normalize_column_value(keyword, value)
    # Lowercased string with surrounding whitespace removed
    value = value.to_s.strip.downcase
    case keyword
    when 'phone'
      # Digits only, dropping a single leading country-code '1'
      value.gsub(/[^0-9]/, '').sub(/\A1/, '')
    else
      value
    end
  end
end
# -------------------------------------------------------------------------------------------------
# Minitest suite for PersonGrouper (run with: ruby person_grouper.rb test).
#
# Fix: the fixture's email column had been redacted to a single identical
# address, which makes the email-based expectations (rows 1 & 3 match,
# rows 1 & 2 differ) impossible to satisfy. Restored two distinct
# addresses laid out to match the assertions below.
class PersonGrouperTest < Minitest::Test
  def test_groups_related_records_by_phone
    output_rows = process_fixture('phone')
    assert_equal 5, output_rows.length # Including header row
    # Records 1 and 2 share a phone number (after normalization)
    assert_equal output_rows[1][0], output_rows[2][0]
    # Records 3 and 4 have unique phone numbers
    refute_equal output_rows[1][0], output_rows[3][0]
    refute_equal output_rows[3][0], output_rows[4][0]
  end

  def test_groups_related_records_by_email
    output_rows = process_fixture('email')
    assert_equal 5, output_rows.length # Including header row
    # Records 1 and 3 share an email; records 2 and 4 share another
    assert_equal output_rows[1][0], output_rows[3][0]
    assert_equal output_rows[2][0], output_rows[4][0]
    # Records 1 and 2 have different emails
    refute_equal output_rows[1][0], output_rows[2][0]
  end

  def test_groups_related_records_by_phone_or_email
    output_rows = process_fixture('phone_or_email')
    assert_equal 5, output_rows.length # Including header row
    assert_equal output_rows[1][0], output_rows[2][0] # same phone
    assert_equal output_rows[1][0], output_rows[3][0] # same email
    assert_equal output_rows[2][0], output_rows[4][0] # same email
    # All records share one UUID through transitive relationships
    assert_equal output_rows[1][0], output_rows[4][0]
  end

  private

  # Parse the fixture and return the processed rows (header row included)
  def process_fixture(matching_type)
    input_csv = CSV.parse(input_csv_data, headers: true)
    PersonGrouper.new(matching_type:, input_csv:).process.to_a
  end

  # Fixture: rows 1 & 2 share a phone number; rows 1 & 3 share one email
  # address and rows 2 & 4 share another.
  def input_csv_data
    <<~CSV
      first_name,last_name,phone1,phone2,email
      John,Doe,123-456-7890,,john.doe@example.com
      Johnny,Doe,,123.456.7890,jdoe@example.com
      Jane,Smith,555-867-5309,,john.doe@example.com
      Bob,Jones,777-888-9999,,jdoe@example.com
    CSV
  end
end
# -------------------------------------------------------------------------------------------------
# ---- CLI entry point ---------------------------------------------------
# Test mode: requiring minitest/autorun registers an at_exit hook that runs
# the Minitest suite defined above, so the suite executes when `exit 0` is
# reached (Minitest replaces the exit status with the suite result —
# see minitest/autorun docs).
if ARGV.length == 1 && ARGV[0] == 'test'
require 'minitest/autorun'
exit 0
elsif ARGV.length != 2
# Wrong arity: print usage and bail with a non-zero status
puts 'Usage: ruby person_grouper.rb <input_file_path> <matching_type>'
puts 'Matching type consists of one or more keywords, separated by \'_or_\''
puts 'Examples: email, phone, email_or_phone, phone_or_email_or_lastname'
exit 1
end
# Get the input file and matching type from the command line arguments
input_file_path = ARGV[0]
matching_type = ARGV[1]
# Benchmark the processing time of the PersonGrouper
Benchmark.bm do |x|
x.report('Processing CSV') do
PersonGrouper.process(input_file_path, matching_type)
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment