Skip to content

Instantly share code, notes, and snippets.

@gordol
Last active March 28, 2025 03:23
Show Gist options
  • Select an option

  • Save gordol/56a04fe0e822ba63ba51d00a5649f2dc to your computer and use it in GitHub Desktop.

Select an option

Save gordol/56a04fe0e822ba63ba51d00a5649f2dc to your computer and use it in GitHub Desktop.
"""
We are trying to determine which offenses can be matched to a candidate. A candidate's name often does not exactly match the name on a criminal record, even though the record still refers to the candidate. We are given a JSON file of possible criminal offenses and the candidate's aliases. We use a point system to judge how well a record matches the candidate.
The point system starts at 100 (100% accurate), which represents an exact match. Four types of edit can be applied to a record name to turn it into an exact match of one of the candidate's aliases; each edit subtracts points from the score.
Edits:
A middle name swapped with a first name subtracts 7 points
A first name nickname swap subtracts 5 points
A middle name nickname swap subtracts 3 points
A maiden last name swap subtracts 5 points (not implemented yet, see maiden_last_swap)
Nicknames come from nick_names.json (inlined below as `nick_names`). If a name is contained in a set of nicknames, the other names in that set can be considered its nicknames.
The candidate's aliases and the offenses come from input.json (inlined below as `data`).
Goal 1:
Using at most one edit per record name, list the offenses that have a score above 94.
#####################################################################################
TODO: isn't it a flaw to assume a 100% score by default?
It might be better to first break the candidate aliases down into first, middle and last names,
then build a list of alternate candidate names by applying the edits to those first/middle/last parts, and match the alternates exactly against the records.
Building alternative names for each record and then checking them against the aliases may not be ideal.
#####################################################################################
"""
from copy import copy
import pprint
# Shared pretty-printer (4-space indent); used at the bottom of the script to
# display the records returned by filter_records().
pp = pprint.PrettyPrinter(indent=4)
def middle_name_swap(record):
    """Propose the name obtained by exchanging the first and middle names.

    A middle name swapped with a first name subtracts 7 points.

    Args:
        record: dict holding the parsed name under the '_first', '_middle'
            and '_last' keys.

    Returns:
        A ``(penalty, names)`` tuple: ``(-7, [swapped_name])`` when the
        record has a middle name, otherwise ``(0, [])``.
    """
    middle = record['_middle']
    if not middle:
        # Without a middle name there is nothing to swap and nothing to penalise.
        return (0, [])

    swapped = '%s %s %s' % (middle, record['_first'], record['_last'])
    return (-7, [swapped])
def first_nick_swap(record):
    """Build the names obtained by replacing the first name with a nickname.

    A first name nickname swap subtracts 5 points.

    Args:
        record: dict holding the parsed name under the '_first', '_middle'
            and '_last' keys; '_middle' is None (or absent) for two-part
            names.

    Returns:
        A ``(-5, names)`` tuple, where ``names`` holds one candidate full
        name per nickname of the first name. ``names`` is empty when the
        first name has no known nicknames.
    """
    first = record.get('_first')
    middle = record.get('_middle')
    last = record.get('_last')
    own_name = first.upper() if first else None

    names = []
    for nick in find_nick_names(first):
        # The nickname lookup may hand back the name itself; substituting it
        # would just rebuild the original name and penalise an exact match.
        if nick == own_name:
            continue
        # Join only the parts that exist, so a two-part name yields
        # "NICK LAST" instead of "NICK None LAST".
        names.append(' '.join(part for part in (nick, middle, last) if part))
    return (-5, names)
def middle_nick_swap(record):
    """Build the names obtained by replacing the middle name with a nickname.

    A middle name nickname swap subtracts 3 points.

    Args:
        record: dict holding the parsed name under the '_first', '_middle'
            and '_last' keys; '_middle' is None (or absent) for two-part
            names.

    Returns:
        A ``(-3, names)`` tuple, where ``names`` holds one candidate full
        name per nickname of the middle name. ``names`` is empty when there
        is no middle name or it has no known nicknames.
    """
    first = record.get('_first')
    middle = record.get('_middle')
    last = record.get('_last')
    own_name = middle.upper() if middle else None

    names = []
    for nick in find_nick_names(middle):
        # The nickname lookup may hand back the name itself; substituting it
        # would just rebuild the original name and penalise an exact match.
        if nick == own_name:
            continue
        # Join only the parts that exist so a missing part is never rendered
        # as the literal text "None".
        names.append(' '.join(part for part in (first, nick, last) if part))
    return (-3, names)
def maiden_last_swap(record):
    """Placeholder for the maiden-last-name edit, which is not implemented.

    A maiden last name swap subtracts 5 points.

    Args:
        record: parsed record dict; currently ignored.

    Returns:
        Always ``(0, [])``: no penalty and no candidate names.
    """
    # TODO: generate the maiden-name variants once that data is available.
    penalty = 0
    candidates = []
    return penalty, candidates
def find_nick_names(search):
    """Return the known nicknames of ``search``.

    If a name is contained in a nickname group, the *other* names of that
    group are its nicknames. The lookup is case-insensitive and ignores
    surrounding whitespace.

    Args:
        search: the name to look up; may be None or empty.

    Returns:
        A new list of upper-case nicknames, in table order and without
        duplicates. The searched name itself is never included. The list is
        empty when ``search`` is falsy or not in any group.
    """
    if not search:
        return []

    wanted = search.strip().upper()
    found = []
    for group in nick_names:
        if wanted not in group:
            continue
        for name in group:
            # Skip the name being looked up (it is not its own nickname) and
            # anything already collected from an earlier group.
            if name != wanted and name not in found:
                found.append(name)
    # A fresh list is returned so callers cannot mutate the table by accident.
    return found


# Groups of interchangeable names (upper-case); every name in a group is a
# nickname of every other name in the same group.
nick_names = [
    ["EDWARD", "EDDIE", "ED", "EDDY"],
    ["AL", "ALPHONSE", "ALI"],
    ["BONNIE", "BON", "BONNY", "BONAVENTURE"],
]
# Sample input (stands in for input.json): the candidate's known aliases and
# the criminal records to be matched against them.
data = {
    "candidate": {
        "aliases": [
            "Bonnie Elizabeth Parker",
            "Bonnie Elizabeth Clyde",
            "Bonnie Parker",
        ],
    },
    "records": [
        {
            "county": "San Francisco",
            "offense": "speeding 2",
            "name": "Elizabeth Bonnie Parker",
            "severity": "misdemeanor",
        },
        {
            "county": "San Francisco",
            "offense": "speeding",
            "name": "Bonnie Elizabeth Parker",
            "severity": "misdemeanor",
        },
        {
            "county": "Nevada",
            "offense": "robbery",
            "name": "Bonny Elizabeth Parker",
            "severity": "felony",
        },
        {
            "county": "Los Angeles",
            "offense": "murder",
            "name": "Elizabeth James",
            "severity": "felony",
        },
        {
            "county": "Marin",
            "offense": "assault",
            "name": "Elizabeth Parker",
            "severity": "felony",
        },
        {
            "county": "Lassen",
            "offense": "assault 2",
            "name": "Bonny Elizabeth Clyde",
            "severity": "misdemeanor",
        },
    ],
}
def filter_records():
    """Return the records in ``data`` that match the candidate with a score above 94.

    Each record name is upper-cased and split into first/middle/last parts,
    then compared with the candidate's aliases:

    * an exact match scores 100;
    * otherwise the record scores 100 plus the penalty of the single
      best-scoring edit (middle/first swap, first-name nickname,
      middle-name nickname) that turns its name into one of the aliases;
    * a record that matches no alias, even after one edit, scores 0.

    Side effects: the candidate aliases in ``data`` are upper-cased in place,
    and every record dict is annotated in place with 'score', '_name_parts',
    '_first', '_middle', '_last' and '_aliases' (all edited names that were
    tried). 'name' is replaced by its upper-cased form.

    Returns:
        The list of annotated record dicts whose score is above 94, in their
        original order.
    """
    # Normalise the aliases once; a set gives constant-time membership tests.
    data['candidate']['aliases'] = [alias.upper() for alias in data['candidate']['aliases']]
    candidate_aliases = set(' '.join(alias.split()) for alias in data['candidate']['aliases'])

    filtered_records = []
    for record in data['records']:
        record['name'] = record['name'].upper()
        # split() without an argument tolerates repeated or stray whitespace.
        parts = record['name'].split()
        record['_name_parts'] = parts
        record['_first'] = None
        record['_middle'] = None
        record['_last'] = None
        record['_aliases'] = []

        if len(parts) == 3:
            record['_first'], record['_middle'], record['_last'] = parts
        elif len(parts) == 2:
            record['_first'], record['_last'] = parts

        edits = [
            middle_name_swap(record),
            first_nick_swap(record),
            middle_nick_swap(record),
            #maiden_last_swap(record),
        ]

        # Collect the score of every way this record can be tied to the
        # candidate. A record only earns points if something actually matches.
        matched_scores = []
        if ' '.join(parts) in candidate_aliases:
            matched_scores.append(100)
        for penalty, aliases in edits:
            record['_aliases'].extend(aliases)
            # An edit counts once, however many of its variants hit an alias.
            if any(alias in candidate_aliases for alias in aliases):
                matched_scores.append(100 + penalty)

        # Only one edit is allowed per name, so keep the best single match
        # instead of stacking penalties.
        record['score'] = max(matched_scores) if matched_scores else 0

        if record['score'] > 94:
            filtered_records.append(record)
    return filtered_records
# Script entry point: run the matcher on the sample data above and
# pretty-print the records that passed the score filter.
pp.pprint(filter_records())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment