A tool to identify rows in a CSV file that may represent the same person, based on one or more provided matching types
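For example, running `ruby person_grouper.rb contacts.csv phone_or_email` (where `contacts.csv` is a hypothetical input file) writes `contacts_output.csv`: a copy of the input with a leading `uuid` column, in which rows that share a normalized phone number or email address receive the same `uuid` value.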
#!/usr/bin/env ruby
# frozen_string_literal: true

# Name: Person Grouper (person_grouper.rb)
# Description: A tool to identify rows in a CSV file that may represent
#              the same person, based on one or more provided matching types
# Limitations: Algorithm is forward-facing only (no retrospective merging)
# Author: Zoë Kelly ([email protected])
# License: MIT
# Created: 2025-02-16
#
# Usage: ruby person_grouper.rb <input_file_path> <matching_type>
# Test: ruby person_grouper.rb test

require 'csv'
require 'benchmark'
require 'minitest'
class PersonGrouper
  class << self
    def process(input_file_path, matching_type)
      new(input_file_path:, matching_type:).process
    end
  end

  def initialize(matching_type:, input_file_path: nil, input_csv: nil)
    @input_file_path = input_file_path
    @matching_type = matching_type
    @input_csv = input_csv
    @person_uuid_counter = 1 # Counter for unique person UUIDs
    @attribute_uuid_counter = 1 # Counter for unique attribute UUIDs
    @attribute_to_attribute_uuid_hash_map = {} # Map of attribute values to their UUIDs
    @attribute_uuid_to_person_uuid_hash_map = {} # Map of attribute UUIDs to person UUIDs
    @output_rows = []
  end
  def process
    puts "Processing #{@input_file_path || 'provided CSV object'} with #{@matching_type}"

    # Input CSV can be provided as a path or as a CSV object
    unless input_csv
      puts 'Input CSV not provided!'
      return
    end

    # Process each row in the input CSV file
    input_csv.each do |row|
      # Assign or determine the Attribute UUIDs for any eligible attributes in the current row
      attribute_uuids = attribute_uuids_for(row)

      # Assign or determine the Person UUID for the current row
      person_uuid = person_uuid_for(attribute_uuids)

      # Add the Person UUID and the current row's existing values to the output rows
      @output_rows << [person_uuid].concat(row.fields)
    end

    # Create the CSV object
    output_csv = CSV::Table.new([CSV::Row.new(output_headers, [], true)])
    @output_rows.each { |row| output_csv << row }

    if @input_file_path
      # Generate string and write to output CSV file
      output_file = @input_file_path.sub('.csv', '_output.csv')
      File.write(output_file, output_csv.to_csv)
      puts "Done! Output written to #{output_file}"
    else
      # Return the CSV object when using input_csv
      output_csv
    end
  rescue StandardError => e
    puts "Sorry, there was an error: #{e.message}"
  end
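  # Illustrative sketch (not executed): tracing the loop above against the test data at the
  # bottom of this file with matching_type 'phone', the output table comes out roughly as:
  #
  #   uuid,first_name,last_name,phone1,phone2,email
  #   1,John,Doe,123-456-7890,,[email protected]
  #   1,Johnny,Doe,,123.456.7890,[email protected]
  #   2,Jane,Smith,555-867-5309,,[email protected]
  #   3,Bob,Jones,777-888-9999,,[email protected]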
  private

  # Determine the output headers
  def output_headers
    ['uuid'].concat(input_headers)
  end

  # Determine the Attribute UUIDs for any eligible attributes in the current row
  def attribute_uuids_for(row)
    attribute_uuids = []

    # Iterate over each keyword and its corresponding column indexes
    keyword_to_column_indexes.each do |keyword, column_indexes|
      column_indexes.each do |column_index|
        next unless (column_raw_value = row[column_index])

        # Normalize the column value
        column_value = normalize_column_value(keyword, column_raw_value)

        # Determine the composite attribute
        # This is to ensure that e.g. a first name cannot be matched with a last name,
        # even if they are the same name
        attribute = "#{keyword}_#{column_value}"

        # Determine the UUID for the attribute if it already exists
        attribute_uuid = @attribute_to_attribute_uuid_hash_map[attribute]

        # If the attribute does not yet have a UUID, assign one and increment the counter,
        # saving the mapping to the hash map for efficient subsequent lookups
        if attribute_uuid.nil?
          @attribute_to_attribute_uuid_hash_map[attribute] = @attribute_uuid_counter
          attribute_uuid = @attribute_uuid_counter
          @attribute_uuid_counter += 1
        end

        attribute_uuids << attribute_uuid
      end
    end

    # Return the list of attribute UUIDs found for the current row
    attribute_uuids
  end
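  # Illustrative sketch (not executed): for the first test row below and matching_type
  # 'phone_or_email', the composite attributes are "phone_234567890" and
  # "email_[email protected]"; being new, they receive Attribute UUIDs [1, 2].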
  # Determine the Person UUID for the current row
  def person_uuid_for(attribute_uuids)
    # Find out if we have previously linked a Person UUID to any of the attributes
    person_uuids = attribute_uuids.map do |attribute_uuid|
      @attribute_uuid_to_person_uuid_hash_map[attribute_uuid]
    end

    # Although there could be more than one Person UUID linked to the attributes,
    # we choose to use the lowest value, as it is the oldest Person UUID.
    # This keeps the algorithm forward-facing and relatively accurate.
    person_uuid = person_uuids.compact.uniq.min

    # If we haven't found any Person UUIDs linked to the attributes, then all attributes
    # must be new, and so we should assign a new Person UUID and increment the counter
    if person_uuid.nil?
      person_uuid = @person_uuid_counter
      @person_uuid_counter += 1
    end

    # Save the mapping of the attribute UUIDs to the Person UUID, only if the attribute UUID
    # is not already mapped to a Person UUID. If it is already mapped, we do not overwrite it,
    # to ensure that UUIDs are not skipped. This is a trade-off within the forward-facing algorithm.
    attribute_uuids.each do |attribute_uuid|
      @attribute_uuid_to_person_uuid_hash_map[attribute_uuid] ||= person_uuid
    end

    person_uuid
  end
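  # Illustrative sketch (not executed) of the forward-facing trade-off, using hypothetical
  # attributes P (a phone) and E (an email):
  #   Row A has only P       => new Person UUID 1; P is linked to 1
  #   Row B has only E       => new Person UUID 2; E is linked to 2
  #   Row C has both P and E => min(1, 2) = 1, so Row C gets 1, but E stays linked to 2
  # A later row matching only E would still receive 2; earlier assignments are never merged.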
  # If the input CSV object was not provided during initialization, read it from the input file path.
  def input_csv
    @input_csv ||= CSV.read(@input_file_path, headers: true)
  end

  # Determine the headers of the input CSV file
  def input_headers
    @input_headers ||= input_csv.headers
  end

  # Determine the keywords from the matching type
  # A keyword is a string provided by the user as part of the matching type
  # e.g. 'phone_or_email_or_lastname' => ['phone', 'email', 'lastname']
  def keywords
    @matching_type.split('_or_').map(&:strip)
  end

  # Determine the column indexes for each keyword
  def keyword_to_column_indexes
    # Memoize the result
    return @keyword_to_column_indexes if defined?(@keyword_to_column_indexes)

    @keyword_to_column_indexes = {}

    # Iterate over each header and its corresponding column index
    input_headers.each_with_index do |header, column_index|
      # Determine if the header matches any of the keywords
      next unless (keyword = keywords.find { |k| header.match?(/^#{k}/i) })

      # Add the column index to the list of column indexes for the keyword
      @keyword_to_column_indexes[keyword] ||= []
      @keyword_to_column_indexes[keyword] << column_index
    end

    @keyword_to_column_indexes
  end
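  # Illustrative sketch (not executed): with the test headers below
  # (first_name,last_name,phone1,phone2,email) and matching_type 'phone_or_email',
  # this returns { 'phone' => [2, 3], 'email' => [4] }.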
  # Normalize the value of a column. Other normalization methods could be added here
  # for different types of data, such as zip/postal codes.
  def normalize_column_value(keyword, value)
    # Ensure the value is a lowercase string with no leading or trailing whitespace
    value = value.to_s.strip.downcase

    case keyword
    when 'phone'
      # Remove any non-numeric characters and a leading '1'
      value.gsub(/[^0-9]/, '').gsub(/^1/, '')
    else
      value
    end
  end
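  # Illustrative sketch (not executed), assuming hypothetical inputs:
  #   normalize_column_value('phone', '+1 (555) 867-5309') # => "5558675309"
  #   normalize_column_value('email', ' [email protected] ')  # => "[email protected]"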
end

# -------------------------------------------------------------------------------------------------
class PersonGrouperTest < Minitest::Test
  def test_groups_related_records_by_phone
    input_csv = CSV.parse(input_csv_data, headers: true)

    # Process the CSV with phone matching
    grouper = PersonGrouper.new(matching_type: 'phone', input_csv:)
    output_csv = grouper.process

    # Convert output to array for easier testing
    output_rows = output_csv.to_a

    # Test expectations
    assert_equal 5, output_rows.length # Including header row

    # First two records should have the same UUID (same phone)
    assert_equal output_rows[1][0], output_rows[2][0]

    # Third and fourth records should have different UUIDs
    refute_equal output_rows[1][0], output_rows[3][0]
    refute_equal output_rows[3][0], output_rows[4][0]
  end

  def test_groups_related_records_by_email
    input_csv = CSV.parse(input_csv_data, headers: true)

    # Process the CSV with email matching
    grouper = PersonGrouper.new(matching_type: 'email', input_csv:)
    output_csv = grouper.process
    output_rows = output_csv.to_a

    # Test expectations
    assert_equal 5, output_rows.length # Including header row

    # First and third records should have the same UUID (same email)
    assert_equal output_rows[1][0], output_rows[3][0]

    # Second and fourth records should have the same UUID (same email)
    assert_equal output_rows[2][0], output_rows[4][0]

    # First and second records should have different UUIDs
    refute_equal output_rows[1][0], output_rows[2][0]
  end

  def test_groups_related_records_by_phone_or_email
    input_csv = CSV.parse(input_csv_data, headers: true)

    # Process the CSV with phone_or_email matching
    grouper = PersonGrouper.new(matching_type: 'phone_or_email', input_csv:)
    output_csv = grouper.process
    output_rows = output_csv.to_a

    # Test expectations
    assert_equal 5, output_rows.length # Including header row

    # First and second records should have same UUID (same phone)
    assert_equal output_rows[1][0], output_rows[2][0]

    # First and third records should have same UUID (same email)
    assert_equal output_rows[1][0], output_rows[3][0]

    # Second and fourth records should have same UUID (same email)
    assert_equal output_rows[2][0], output_rows[4][0]

    # All records should have the same UUID due to transitive relationships
    assert_equal output_rows[1][0], output_rows[4][0]
  end

  private

  def input_csv_data
    <<~CSV
      first_name,last_name,phone1,phone2,email
      John,Doe,123-456-7890,,[email protected]
      Johnny,Doe,,123.456.7890,[email protected]
      Jane,Smith,555-867-5309,,[email protected]
      Bob,Jones,777-888-9999,,[email protected]
    CSV
  end
end
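# Expected groupings, traced by hand from the test data above (the tests assert the same):
#   matching_type 'phone'          => rows 1 & 2 share a UUID; rows 3 and 4 each get their own
#   matching_type 'email'          => rows 1 & 3 share a UUID; rows 2 & 4 share another
#   matching_type 'phone_or_email' => all four rows share a single UUID (transitive matches)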
# -------------------------------------------------------------------------------------------------
if ARGV.length == 1 && ARGV[0] == 'test'
  require 'minitest/autorun'
  exit 0
elsif ARGV.length != 2
  puts 'Usage: ruby person_grouper.rb <input_file_path> <matching_type>'
  puts 'Matching type consists of one or more keywords, separated by \'_or_\''
  puts 'Examples: email, phone, email_or_phone, phone_or_email_or_lastname'
  exit 1
end

# Get the input file and matching type from the command line arguments
input_file_path = ARGV[0]
matching_type = ARGV[1]

# Benchmark the processing time of the PersonGrouper
Benchmark.bm do |x|
  x.report('Processing CSV') do
    PersonGrouper.process(input_file_path, matching_type)
  end
end