Last active
March 28, 2025 03:23
-
-
Save gordol/56a04fe0e822ba63ba51d00a5649f2dc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| We are trying to determine which offenses can be matched to a candidate. Candidates' names often do not exactly match the names on criminal records, even when the record does belong to the candidate. We are given a JSON file of possible criminal offenses and a candidate's aliases. We use a point system to judge how well a record matches a candidate. | |
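| The point system starts at 100 (100% accurate) for an exact match. We can apply 4 types of edit to a record name to turn it into an exact match of one of the candidate's aliases; three are listed below, and the fourth (a maiden last name swap, 5 points) is still a TODO in the code. | |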
| Edits: | |
| A middle name swapped with a first name subtracts 7 points | |
| A first name nickname swap subtracts 5 points | |
| A middle name nickname swap subtracts 3 points | |
| Nicknames come from nick_names.json. If a name appears in one of the nickname sets, the other names in that set are considered its nicknames. | |
| The candidate's aliases and offenses come from input.json. | |
| Goal 1: | |
| Allowing at most one edit per record name, list the offenses whose score is above 94. | |
| ##################################################################################### | |
| todo: is it not a flaw to assume 100% score by default? | |
| it might be better to first break down the candidate aliases into first, middle, last. | |
| then build a list of alternate candidate names using the edits, and run those alt candidate f/m/l against the edits and match exactly against the records. | |
| building an alternative match for each record, and then checking against the aliases may not be ideal. | |
| ##################################################################################### | |
| """ | |
| from copy import copy | |
| import pprint | |
# Shared pretty-printer (4-space indent) used to dump the filtered records
# when the script runs.
pp = pprint.PrettyPrinter(indent=4)
def middle_name_swap(record):
    """Build the name obtained by swapping the record's first and middle names.

    Edit rule: a middle name swapped with a first name subtracts 7 points.

    Args:
        record: dict holding the split record name under '_first', '_middle'
            and '_last'. '_middle' may be None (or absent) for names that
            have no middle part.

    Returns:
        A ``(penalty, names)`` tuple: ``(-7, ['MIDDLE FIRST LAST'])`` when a
        real swap is possible, otherwise ``(0, [])``.
    """
    first = record.get('_first')
    middle = record.get('_middle')
    # Without a middle name there is nothing to swap. When first and middle
    # are identical the swap reproduces the original name, so it must not be
    # offered (and charged) as an edit.
    if not middle or middle == first:
        return (0, [])
    return (-7, ['%s %s %s' % (middle, first, record['_last'])])
def first_nick_swap(record):
    """Build the names obtained by replacing the first name with a nickname.

    Edit rule: a first name nickname swap subtracts 5 points.

    Args:
        record: dict holding the split record name under '_first', '_middle'
            and '_last'. '_middle' is None for two-part names.

    Returns:
        A ``(-5, names)`` tuple where ``names`` has one full name per
        alternative first name returned by ``find_nick_names``. The list is
        empty when the first name is missing or has no known nicknames.
    """
    first = record['_first']
    if not first:
        return (-5, [])

    # find_nick_names matches on the upper-cased name and returns the whole
    # group, including the name that was searched for.
    current = first.upper()
    names = []
    for nick in find_nick_names(first):
        if nick == current:
            # Keeping the same first name is not an edit; offering it would
            # charge an exact match 5 points.
            continue
        # Drop a missing middle name instead of formatting it as "None", so
        # two-part names ("BONNY PARKER") can match two-part aliases.
        parts = (nick, record['_middle'], record['_last'])
        names.append(' '.join(part for part in parts if part))
    return (-5, names)
def middle_nick_swap(record):
    """Build the names obtained by replacing the middle name with a nickname.

    Edit rule: a middle name nickname swap subtracts 3 points.

    Args:
        record: dict holding the split record name under '_first', '_middle'
            and '_last'. '_middle' is None for two-part names.

    Returns:
        A ``(-3, names)`` tuple where ``names`` has one full name per
        alternative middle name returned by ``find_nick_names``. The list is
        empty when there is no middle name or it has no known nicknames.
    """
    middle = record['_middle']
    if not middle:
        return (-3, [])

    # find_nick_names returns the whole group, including the searched name;
    # skip it so that an unchanged name is never charged as an edit.
    current = middle.upper()
    names = [
        '%s %s %s' % (record['_first'], nick, record['_last'])
        for nick in find_nick_names(middle)
        if nick != current
    ]
    return (-3, names)
def maiden_last_swap(record):
    """Placeholder for the maiden last-name edit, which is not implemented.

    TODO: a maiden last name swap subtracts 5 points.

    Args:
        record: the record dict; currently ignored.

    Returns:
        ``(0, [])`` -- no penalty and no alternative names.
    """
    penalty = 0
    alternative_names = []
    return (penalty, alternative_names)
def find_nick_names(search):
    """Return the first nickname group in ``nick_names`` containing *search*.

    The lookup upper-cases *search* before comparing it with the group
    entries. The returned group is the list stored in ``nick_names`` itself
    and includes the searched name.

    Args:
        search: a given name, or None / empty string.

    Returns:
        The matching group, or an empty list when *search* is falsy or no
        group contains it.
    """
    if not search:
        return []
    wanted = search.upper()
    return next((group for group in nick_names if wanted in group), [])
# Groups of interchangeable given names, stored in upper case. Every name in
# a group is treated as a nickname of the other names in that group
# (looked up by find_nick_names).
nick_names = [
    ["EDWARD", "EDDIE", "ED", "EDDY"],
    ["AL", "ALPHONSE", "ALI"],
    ["BONNIE", "BON", "BONNY", "BONAVENTURE"]
]
# Sample input embedded in the script: the candidate's known aliases and the
# offense records that filter_records() scores against those aliases.
data = {
    "candidate": {
        "aliases": ["Bonnie Elizabeth Parker", "Bonnie Elizabeth Clyde", "Bonnie Parker"]
    },
    "records": [
        {
            "county": "San Francisco",
            "offense": "speeding 2",
            "name": "Elizabeth Bonnie Parker",
            "severity": "misdemeanor"
        },
        {
            "county": "San Francisco",
            "offense": "speeding",
            "name": "Bonnie Elizabeth Parker",
            "severity": "misdemeanor"
        },
        {
            "county": "Nevada",
            "offense": "robbery",
            "name": "Bonny Elizabeth Parker",
            "severity": "felony"
        },
        {
            "county": "Los Angeles",
            "offense": "murder",
            "name": "Elizabeth James",
            "severity": "felony"
        },
        {
            "county": "Marin",
            "offense": "assault",
            "name": "Elizabeth Parker",
            "severity": "felony"
        },
        {
            "county": "Lassen",
            "offense": "assault 2",
            "name": "Bonny Elizabeth Clyde",
            "severity": "misdemeanor"
        }
    ]
}
def filter_records():
    """Score each record in ``data`` against the candidate's aliases.

    Scoring, per record:
      * 100 when the record name is exactly one of the aliases;
      * otherwise 100 plus the penalty of the cheapest single edit (middle
        name swap, first-name nickname, middle-name nickname) that turns the
        record name into an alias;
      * 0 when neither the name nor any single edit matches an alias, so
        unrelated records are no longer reported as perfect matches.

    Side effects: the aliases in ``data`` are upper-cased in place, and each
    record dict is annotated in place with 'score', the upper-cased 'name',
    '_name_parts', '_first', '_middle', '_last' and '_aliases' (every
    alternative name produced by the edits).

    Returns:
        The list of record dicts (the same objects as in ``data``) whose
        score is above 94.
    """
    data['candidate']['aliases'] = [
        alias.upper() for alias in data['candidate']['aliases']
    ]
    # Whitespace-normalised set for O(1) membership tests.
    known_aliases = set(
        ' '.join(alias.split()) for alias in data['candidate']['aliases']
    )

    filtered_records = []
    for record in data['records']:
        record['name'] = record['name'].upper()
        # split() without an argument ignores repeated/leading/trailing
        # whitespace, which split(' ') would turn into empty name parts.
        parts = record['name'].split()
        record['_name_parts'] = parts
        record['_first'] = None
        record['_middle'] = None
        record['_last'] = None
        record['_aliases'] = []
        if len(parts) == 3:
            record['_first'], record['_middle'], record['_last'] = parts
        elif len(parts) == 2:
            record['_first'], record['_last'] = parts

        edits = [
            middle_name_swap(record),
            first_nick_swap(record),
            middle_nick_swap(record),
            # maiden_last_swap(record),
        ]

        # Only one edit is allowed per record name, so keep the cheapest
        # matching edit instead of summing every penalty that matches.
        best_penalty = None
        for penalty, names in edits:
            record['_aliases'].extend(names)
            if any(name in known_aliases for name in names):
                if best_penalty is None or penalty > best_penalty:
                    best_penalty = penalty

        if ' '.join(parts) in known_aliases:
            # An exact match needs no edit at all.
            record['score'] = 100
        elif best_penalty is not None:
            record['score'] = 100 + best_penalty
        else:
            record['score'] = 0

        if record['score'] > 94:
            filtered_records.append(record)
    return filtered_records
# Script entry: score the embedded sample data and pretty-print the records
# whose score passes the threshold.
pp.pprint(filter_records())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment