mbruzek · December 28, 2021 23:20
diff --git a/verbose find_similar.py b/verbose find_similar.py
 #!/usr/bin/env python3

 """
 A Python3 tool that uses the NASA-Acronym data to find similar expansions.

 This code uses the Levenshtein Distance to calculate the difference between
 sequences of characters in the expansions field of the data.

 This file relies on thefuzz python package: https://github.com/seatgeek/thefuzz

 Usage: python3 find_similar.py MPCV

 """

 import argparse
 import json
 import sys
 import thefuzz
 from thefuzz import fuzz
 from thefuzz import process


 # Help texts shown by the command line parser.
 ABBREVIATIONS = 'the specific abbreviations to examine'
 SUMMARY = 'Find the abbreviations with similar expansions and print them out.'
 THRESHOLD = 'the match threshold from 0 to 100 used to find duplicate strings'
 VERBOSITY_LEVEL = 'the amount of information to print on each result'

 # Relative path of the acronym data file read at import time.
 acronym_path = 'lists/acronyms.json'
 # The fuzzy scorer algorithm to use when finding similar strings.
 scorer = fuzz.partial_ratio
 # The match ratio to use for the scorer algorithm, lower returns more matches.
 threshold = 88
 # Set the default level of printed output.
 verbosity_level = 0

 # Read from the acronym file. JSON is conventionally UTF-8, so decode it
 # explicitly instead of relying on the platform's locale encoding.
 with open(acronym_path, encoding='utf-8') as reader:
    json_data = reader.read()
 # Load the JSON data into a python object.
 data = json.loads(json_data)


 def command_line():
    """Parse the command line arguments and run the requested search.

    Accepts optional positional abbreviations plus the -t/--threshold and
    -v/--verbose options. The option values are stored in the module-level
    ``threshold`` and ``verbosity_level`` settings, then either the given
    abbreviations or, when none were given, all abbreviations are searched.
    """
    global threshold
    global verbosity_level
    parser = argparse.ArgumentParser(description=SUMMARY)
    parser.add_argument('abbreviations', help=ABBREVIATIONS, nargs='*')
    # Fall back to the module-level threshold. Without a default, argparse
    # yields None when -t is omitted, and that None would overwrite the
    # usable module setting before it is handed to the fuzzy matcher.
    parser.add_argument('-t', '--threshold', type=int, default=threshold,
                        help=THRESHOLD)
    parser.add_argument('-v', '--verbose', action='count',
                        default=verbosity_level, help=VERBOSITY_LEVEL)
    arguments = parser.parse_args()
    threshold = arguments.threshold
    verbosity_level = arguments.verbose
    if arguments.abbreviations:
        # Find the similar for the abbreviations entered as arguments.
        find_similar_expansions(arguments.abbreviations)
    else:
        # There were no specific abbreviations requested, find all similar.
        find_all()


 def find_all():
    """Run the similarity search over every abbreviation in the data."""
    find_similar_expansions([record['abbreviation'] for record in data])


 def find_similar_expansions(abbreviations):
    """Print the abbreviations whose expansions look like duplicates.

    Groups the records of the module-level ``data`` list by case-folded
    abbreviation, limited to the abbreviations requested, and runs the
    fuzzy dedupe over the expansions of every abbreviation that has more
    than one record. An abbreviation is reported when the dedupe returned
    fewer expansions than it was given. The amount of detail printed
    depends on the module-level ``verbosity_level``.

    Args:
        abbreviations: list of abbreviation strings to examine. They are
            compared with the abbreviations in the data case-sensitively.
    """
    if verbosity_level > 0:
        check = 'Checking {} abbreviations for similarity in {} total records.'
        print(check.format(len(abbreviations), len(data)))

    # A set gives a constant-time membership test for each record instead
    # of scanning the whole list of requested abbreviations every time.
    requested = set(abbreviations)
    # Map each case-folded abbreviation to the indexes of its records. A
    # dict keeps first-seen order, so the output order is the same on
    # every run (iterating a set of strings is not).
    grouped = {}
    for index, element in enumerate(data):
        abbreviation = element['abbreviation']
        # Is the abbreviation in the list of acronyms to expand?
        if abbreviation in requested:
            # Use casefold on the key to utilize caseless matching.
            grouped.setdefault(abbreviation.casefold(), []).append(index)

    results = []
    for indexes in grouped.values():
        # Only an abbreviation with more than one record is overloaded.
        if len(indexes) < 2:
            continue
        overloaded_objects = [data[index] for index in indexes]
        abbreviation = overloaded_objects[0]['abbreviation']
        expansions = [item['expansion'] for item in overloaded_objects]
        # Compare the expansions using fuzzy matching to remove duplicates.
        deduped = process.dedupe(expansions, threshold=threshold,
                                 scorer=scorer)
        if len(expansions) == len(deduped):
            continue
        # Duplicate found, start the result with the abbreviation.
        parts = ['{} '.format(abbreviation)]
        if verbosity_level > 0:
            # The expansions the dedupe dropped are the possible duplicates;
            # list each once, in the order it first appears in the data.
            kept = set(deduped)
            difference = [expansion
                          for expansion in dict.fromkeys(expansions)
                          if expansion not in kept]
            parts.append(
                'has possible duplicate expansions {}'.format(difference))
        if verbosity_level > 1:
            # Output all expansions of this abbreviation (to gain context).
            for item in overloaded_objects:
                parts.append('\n{:<60} {:<9} {:<5} {:<3}'.format(
                    item['expansion'],
                    item['source'],
                    item['acronym_id'],
                    item['source_id']))
        results.append(''.join(parts))

    # Count the number of duplicates found; a single hit is reported too.
    number_of_results = len(results)
    if verbosity_level > 0 and number_of_results > 0:
        found = '{} possible duplicates found with a threshold of {}.'
        print(found.format(number_of_results, threshold))
    # Print out each result.
    for result in results:
        print(result)


 if __name__ == '__main__':
    # Anything after the script name is handed to the argument parser; a
    # bare invocation searches every abbreviation directly.
    entry_point = command_line if len(sys.argv) > 1 else find_all
    entry_point()
	#!/usr/bin/env python3

	"""
	A Python3 tool that uses the NASA-Acronym data to find similar expansions.

	This code uses the Levenshtein Distance to calculate the difference between
	sequences of characters in the expansions field of the data.

	This file relies on thefuzz python package: https://github.com/seatgeek/thefuzz

	Usage: python3 find_similar.py MPCV

	"""

	import argparse
	import json
	import sys
	import thefuzz
	from thefuzz import fuzz
	from thefuzz import process


	# Help texts shown by the command line parser.
	ABBREVIATIONS = 'the specific abbreviations to examine'
	SUMMARY = 'Find the abbreviations with similar expansions and print them out.'
	THRESHOLD = 'the match threshold from 0 to 100 used to find duplicate strings'
	VERBOSITY_LEVEL = 'the amount of information to print on each result'

	# Relative path of the acronym data file read at import time.
	acronym_path = 'lists/acronyms.json'
	# The fuzzy scorer algorithm to use when finding similar strings.
	scorer = fuzz.partial_ratio
	# The match ratio to use for the scorer algorithm, lower returns more matches.
	threshold = 88
	# Set the default level of printed output.
	verbosity_level = 0

	# Read from the acronym file. JSON is conventionally UTF-8, so decode it
	# explicitly instead of relying on the platform's locale encoding.
	with open(acronym_path, encoding='utf-8') as reader:
	    json_data = reader.read()
	# Load the JSON data into a python object.
	data = json.loads(json_data)


	def command_line():
	    """Parse the command line arguments and run the requested search.

	    Accepts optional positional abbreviations plus the -t/--threshold and
	    -v/--verbose options. The option values are stored in the module-level
	    ``threshold`` and ``verbosity_level`` settings, then either the given
	    abbreviations or, when none were given, all abbreviations are searched.
	    """
	    global threshold
	    global verbosity_level
	    parser = argparse.ArgumentParser(description=SUMMARY)
	    parser.add_argument('abbreviations', help=ABBREVIATIONS, nargs='*')
	    # Fall back to the module-level threshold. Without a default, argparse
	    # yields None when -t is omitted, and that None would overwrite the
	    # usable module setting before it is handed to the fuzzy matcher.
	    parser.add_argument('-t', '--threshold', type=int, default=threshold,
	                        help=THRESHOLD)
	    parser.add_argument('-v', '--verbose', action='count',
	                        default=verbosity_level, help=VERBOSITY_LEVEL)
	    arguments = parser.parse_args()
	    threshold = arguments.threshold
	    verbosity_level = arguments.verbose
	    if arguments.abbreviations:
	        # Find the similar for the abbreviations entered as arguments.
	        find_similar_expansions(arguments.abbreviations)
	    else:
	        # There were no specific abbreviations requested, find all similar.
	        find_all()


	def find_all():
	    """Find the similar expansions for all the abbreviations.

	    Collects the abbreviation of every record in the module-level
	    ``data`` list and passes that list to find_similar_expansions.
	    """
	    all_abbreviations = [element['abbreviation'] for element in data]
	    find_similar_expansions(all_abbreviations)


	def find_similar_expansions(abbreviations):
	    """Print the abbreviations whose expansions look like duplicates.

	    Groups the records of the module-level ``data`` list by case-folded
	    abbreviation, limited to the abbreviations requested, and runs the
	    fuzzy dedupe over the expansions of every abbreviation that has more
	    than one record. An abbreviation is reported when the dedupe returned
	    fewer expansions than it was given. The amount of detail printed
	    depends on the module-level ``verbosity_level``.

	    Args:
	        abbreviations: list of abbreviation strings to examine. They are
	            compared with the abbreviations in the data case-sensitively.
	    """
	    if verbosity_level > 0:
	        check = 'Checking {} abbreviations for similarity in {} total records.'
	        print(check.format(len(abbreviations), len(data)))

	    # A set gives a constant-time membership test for each record instead
	    # of scanning the whole list of requested abbreviations every time.
	    requested = set(abbreviations)
	    # Map each case-folded abbreviation to the indexes of its records. A
	    # dict keeps first-seen order, so the output order is the same on
	    # every run (iterating a set of strings is not).
	    grouped = {}
	    for index, element in enumerate(data):
	        abbreviation = element['abbreviation']
	        # Is the abbreviation in the list of acronyms to expand?
	        if abbreviation in requested:
	            # Use casefold on the key to utilize caseless matching.
	            grouped.setdefault(abbreviation.casefold(), []).append(index)

	    results = []
	    for indexes in grouped.values():
	        # Only an abbreviation with more than one record is overloaded.
	        if len(indexes) < 2:
	            continue
	        overloaded_objects = [data[index] for index in indexes]
	        abbreviation = overloaded_objects[0]['abbreviation']
	        expansions = [item['expansion'] for item in overloaded_objects]
	        # Compare the expansions using fuzzy matching to remove duplicates.
	        deduped = process.dedupe(expansions, threshold=threshold,
	                                 scorer=scorer)
	        if len(expansions) == len(deduped):
	            continue
	        # Duplicate found, start the result with the abbreviation.
	        parts = ['{} '.format(abbreviation)]
	        if verbosity_level > 0:
	            # The expansions the dedupe dropped are the possible duplicates;
	            # list each once, in the order it first appears in the data.
	            kept = set(deduped)
	            difference = [expansion
	                          for expansion in dict.fromkeys(expansions)
	                          if expansion not in kept]
	            parts.append(
	                'has possible duplicate expansions {}'.format(difference))
	        if verbosity_level > 1:
	            # Output all expansions of this abbreviation (to gain context).
	            for item in overloaded_objects:
	                parts.append('\n{:<60} {:<9} {:<5} {:<3}'.format(
	                    item['expansion'],
	                    item['source'],
	                    item['acronym_id'],
	                    item['source_id']))
	        results.append(''.join(parts))

	    # Count the number of duplicates found; a single hit is reported too.
	    number_of_results = len(results)
	    if verbosity_level > 0 and number_of_results > 0:
	        found = '{} possible duplicates found with a threshold of {}.'
	        print(found.format(number_of_results, threshold))
	    # Print out each result.
	    for result in results:
	        print(result)


	if __name__ == '__main__':
	    if len(sys.argv) > 1:
	        # Process the command line arguments.
	        command_line()
	    else:
	        # There are no arguments, find all similar.
	        find_all()