Created
October 3, 2023 20:38
-
-
Save Varriount/523461a8ae5b39a32fb4e6e96e453cdf to your computer and use it in GitHub Desktop.
Sort terms based on word and character similarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs | |
import sys | |
from editdistance import eval as distance | |
from pprint import pprint | |
import re | |
def sort_phrases(phrase_list):
    """Chain phrases so each one is followed by its closest remaining match.

    This is a greedy ordering: the last element of ``phrase_list`` seeds the
    result, then the remaining phrase that ``find_best_phrase`` rates best
    against the most recently placed phrase is appended, until none are left.

    Args:
        phrase_list: Iterable of phrases. Each phrase is passed unchanged to
            ``find_best_phrase``; the script entry point supplies tuples of
            string variations.

    Returns:
        A new list holding every input phrase in chained order. Empty input
        yields an empty list. The caller's sequence is not modified.
    """
    # Work on a private copy so the caller's list is not emptied as a side
    # effect of sorting.
    remaining = list(phrase_list)
    if not remaining:
        return []

    ordered = [remaining.pop()]
    while remaining:
        # Only the index is needed here; the score is used for selection only.
        index, _ = find_best_phrase(ordered[-1], remaining)
        ordered.append(remaining.pop(index))
    return ordered
def find_best_phrase(target_phrase, sample_phrases):
    """Locate the sample phrase with the lowest score against the target.

    Args:
        target_phrase: Phrase every candidate is compared against.
        sample_phrases: Iterable of candidate phrases.

    Returns:
        An ``(index, score)`` tuple for the best candidate, where ``score`` is
        the value returned by ``score_phrases``. When several candidates tie,
        the earliest one is kept. If there are no candidates the result is
        ``(None, None)``.
    """
    winner = (None, None)
    for position, candidate in enumerate(sample_phrases):
        candidate_score = score_phrases(target_phrase, candidate)
        # Skip anything that is not strictly better than the current winner,
        # so the first of several equal scores stays selected.
        if winner[1] is not None and not candidate_score < winner[1]:
            continue
        winner = (position, candidate_score)
    return winner
def score_phrases(left_phrase, right_phrase):
    """Return the best (lowest) score over all variation pairs of two phrases.

    Every variation of ``left_phrase`` is paired with every variation of
    ``right_phrase``. Each pair is scored as a tuple and the smallest tuple
    is returned.

    Args:
        left_phrase: Iterable of string variations.
        right_phrase: Iterable of string variations.

    Returns:
        A ``(100 - shared_word_count, edit_distance)`` tuple for the
        best-matching pair. Lower is more similar.

    Raises:
        ValueError: If there are no variation pairs to compare (``min`` of an
            empty sequence).
    """
    def pair_score(lhs, rhs):
        # Tuples compare element by element, so the shared-word term decides
        # first and the edit distance only breaks ties. Subtracting from 100
        # makes "more shared words" sort lower.
        word_term = 100 - intersecting_word_count(lhs, rhs)
        edit_term = distance(lhs, rhs)
        return (word_term, edit_term)

    pair_scores = (
        pair_score(lhs, rhs)
        for lhs in left_phrase
        for rhs in right_phrase
    )
    return min(pair_scores)
def intersecting_word_count(left, right):
    """Count the distinct words that occur in both strings.

    A word is a maximal run of regex word characters. Matching is
    case-sensitive.

    Args:
        left: First string.
        right: Second string.

    Returns:
        The number of distinct words common to ``left`` and ``right``.
    """
    # Extract words directly instead of splitting on non-word runs: splitting
    # produces an empty string for empty input or for text that starts or ends
    # with punctuation/whitespace, and that empty string would then be
    # counted as a shared "word".
    left_words = set(re.findall(r"\w+", left))
    right_words = set(re.findall(r"\w+", right))
    return len(left_words & right_words)
if __name__ == '__main__':
    # Input format on stdin: the first line is a regular expression that
    # separates the variations inside a phrase; every following line is one
    # phrase. The phrases are printed in similarity-chained order.

    # Strip only the line terminator. Slicing off the last character would
    # remove a real character when the line has no trailing newline.
    delimiter = sys.stdin.readline().rstrip('\r\n')
    if delimiter == '':
        # '(?!x)x' can never match, so each phrase stays a single variation.
        delimiter = '(?!x)x'
    # Compile once rather than re-parsing the pattern for every phrase.
    splitter = re.compile(delimiter)

    phrases = [line.strip() for line in sys.stdin]

    # Map the split form back to the original text for printing. Phrases
    # that split to the same tuple collapse into one entry (last one wins).
    phrases_map = {
        tuple(splitter.split(phrase)): phrase
        for phrase in phrases
    }

    # With no phrases there is nothing to order or print; skip the sort so
    # empty input exits cleanly.
    if phrases_map:
        for key in sort_phrases(list(phrases_map)):
            print(phrases_map[key])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment