Last active
December 28, 2021 23:20
-
-
Save mbruzek/89efbf573409f7d9ecfa9413abc2d63e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
A Python 3 tool that uses the NASA-Acronym data to find similar expansions.
This code uses the Levenshtein distance to measure the difference between
the sequences of characters in the expansion field of the data.
This file relies on the thefuzz Python package: https://github.com/seatgeek/thefuzz
Usage: python3 find_similar.py MPCV
""" | |
import argparse | |
import json | |
import sys | |
import thefuzz | |
from thefuzz import fuzz | |
from thefuzz import process | |
# Help text shown by argparse for the positional argument, the program
# description, and the --threshold / --verbose options respectively.
ABBREVIATIONS = 'the specific abbreviations to examine'
SUMMARY = 'Find the abbreviations with similar expansions and print them out.'
THRESHOLD = 'the match threshold from 0 to 100 used to find duplicate strings'
VERBOSITY_LEVEL = 'the amount of information to print on each result'
# Path of the acronym data file, relative to the current working directory.
acronym_path = 'lists/acronyms.json'
# The fuzzy scorer algorithm to use when finding similar strings.
scorer = fuzz.partial_ratio
# The match ratio to use for the scorer algorithm, lower returns more matches.
threshold = 88
# Set the default level of printed output.
verbosity_level = 0
# Read from the acronym file. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's locale encoding.
with open(acronym_path, encoding='utf-8') as reader:
    json_data = reader.read()
# Load the JSON data into a python object.
data = json.loads(json_data)
def command_line():
    """Parse the command line arguments and run the requested search.

    Updates the module-level ``threshold`` and ``verbosity_level`` from the
    parsed options, then searches either the abbreviations named on the
    command line or, when none were given, every abbreviation in the data.
    """
    global threshold
    global verbosity_level
    parser = argparse.ArgumentParser(description=SUMMARY)
    parser.add_argument('abbreviations', help=ABBREVIATIONS, nargs='*')
    # Fall back to the module-level threshold. Without an explicit default
    # argparse yields None when -t is omitted, which would overwrite the
    # configured value and break the comparison done while deduplicating.
    parser.add_argument('-t', '--threshold', type=int, default=threshold,
                        help=THRESHOLD)
    parser.add_argument('-v', '--verbose', action='count',
                        default=verbosity_level, help=VERBOSITY_LEVEL)
    arguments = parser.parse_args()
    threshold = arguments.threshold
    verbosity_level = arguments.verbose
    if arguments.abbreviations:
        # Find the similar for the abbreviations entered as arguments.
        find_similar_expansions(arguments.abbreviations)
    else:
        # There were no specific abbreviations requested, find all similar.
        find_all()
def find_all():
    """Run the similarity search across every abbreviation in the data."""
    every_abbreviation = []
    for record in data:
        every_abbreviation.append(record['abbreviation'])
    find_similar_expansions(every_abbreviation)
def find_similar_expansions(abbreviations):
    """Print the abbreviations whose expansions look like duplicates.

    Records in the module-level ``data`` whose abbreviation appears in
    ``abbreviations`` are grouped by the casefolded abbreviation. For every
    group with more than one record the expansions are fuzzy-deduplicated
    using the module-level ``scorer`` and ``threshold``; groups that shrink
    are reported. The module-level ``verbosity_level`` controls the detail:
    0 prints only the abbreviation, 1 adds the possible duplicates and the
    summary lines, 2 also lists every record of the abbreviation.

    Args:
        abbreviations: a sized collection of abbreviation strings to examine.
    """
    results = []
    if verbosity_level > 0:
        check = 'Checking {} abbreviations for similarity in {} total records.'
        print(check.format(len(abbreviations), len(data)))
    # Build the lookup set once so each membership test is O(1) rather than
    # a scan of the whole list for every record.
    requested = set(abbreviations)
    # Map each casefolded abbreviation to the indexes of its records.
    overloaded_dict = {}
    for index, element in enumerate(data):
        abbreviation = element['abbreviation']
        # Is the abbreviation in the list of acronyms to expand?
        if abbreviation in requested:
            # Use casefold on the key to utilize caseless matching.
            key = abbreviation.casefold()
            overloaded_dict.setdefault(key, []).append(index)
    # Walk the groups in the order they first appear in the data so the
    # output is stable from run to run.
    for indexes in overloaded_dict.values():
        # Only a key with more than one expansion is overloaded.
        if len(indexes) < 2:
            continue
        overloaded_objects = [data[index] for index in indexes]
        # Report the spelling used by the first record of the group.
        abbreviation = overloaded_objects[0]['abbreviation']
        expansions = [element['expansion'] for element in overloaded_objects]
        # Use fuzzy matching on the expansions to remove duplicates.
        deduped = process.dedupe(expansions, threshold=threshold,
                                 scorer=scorer)
        if len(expansions) == len(deduped):
            # Nothing was removed, so there are no similar expansions.
            continue
        # Duplicate found, start the result with the abbreviation.
        result = '{} '.format(abbreviation)
        if verbosity_level > 0:
            # The expansions that were dropped are the possible duplicates;
            # keep them unique and in their original order.
            kept = set(deduped)
            difference = list(dict.fromkeys(
                expansion for expansion in expansions
                if expansion not in kept))
            result += 'has possible duplicate expansions {}'.format(
                difference)
        if verbosity_level > 1:
            # Output all expansions of this abbreviation (to gain context).
            for item in overloaded_objects:
                result += '\n{:<60} {:<9} {:<5} {:<3}'.format(
                    item['expansion'],
                    item['source'],
                    item['acronym_id'],
                    item['source_id'])
        results.append(result)
    # Count the number of duplicates found.
    number_of_results = len(results)
    # NOTE(review): the summary is printed only for more than one result;
    # confirm whether a single result should be summarized as well.
    if verbosity_level > 0 and number_of_results > 1:
        found = '{} possible duplicates found with a threshold of {}.'
        print(found.format(number_of_results, threshold))
    # Print out each result.
    for result in results:
        print(result)
if __name__ == '__main__':
    if not sys.argv[1:]:
        # Nothing follows the script name, so search every abbreviation.
        find_all()
    else:
        # Let the argument parser decide what to search for.
        command_line()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment