Last active
February 3, 2025 15:40
-
-
Save Zadigo/981fab4f4a893ec404b9f8f991b0a66b to your computer and use it in GitHub Desktop.
A simple Python module that calculates the distance between two words
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging
import random
import re
from difflib import SequenceMatcher

import Levenshtein
import pandas
from rapidfuzz import fuzz, process
from thefuzz import fuzz as thefuzz
from unidecode import unidecode
# https://www.datacamp.com/tutorial/fuzzy-string-python
# https://en.wikipedia.org/wiki/Levenshtein_distance#:~:text=The%20Levenshtein%20distance%20between%20two,Levenshtein%20distance
class FuzzyMatcherMixin:
    """A processor that scores how closely two pieces of text match.

    Each strategy normalizes both inputs with ``preprocess_text`` and
    returns a similarity score where 0.0 means "nothing in common" and
    1.0 means "identical after normalization". ``is_match`` compares a
    score against ``self.threshold``. The scores are intended to drive
    product recommendations based on word closeness.
    """

    # Relative contribution of each strategy to the weighted score.
    _STRATEGY_WEIGHTS = {
        'simple': 0.2,
        'partial': 0.3,
        'token_sort': 0.2,
        'token_set': 0.3,
    }

    def __init__(self, threshold=0.85):
        # Minimum score (inclusive) for ``is_match`` to report a match.
        self.threshold = threshold

    def preprocess_text(self, text: str) -> str:
        """Normalize text for better matching.

        The text is lowercased, transliterated to ASCII (which removes
        accents), stripped of every character that is not ``a-z``, ``0-9``
        or whitespace, and has runs of whitespace collapsed to single
        spaces. Falsy input (``None`` or ``""``) yields ``""``.
        """
        if not text:
            return ""

        text = unidecode(text.lower())
        # Drop special characters but keep whitespace so words stay separate.
        text = re.sub(r'[^a-z0-9\s]', '', text)
        # split()/join collapses repeated whitespace and trims both ends.
        return ' '.join(text.split())

    def _prepare_pair(self, guess: str, target: str):
        """Return ``(guess, target)`` normalized, or ``None`` if either is empty.

        Text made only of punctuation normalizes to ``""``. Comparing two
        empty strings would otherwise be reported as a perfect match, so the
        strategies treat an empty side as "no match" instead.
        """
        guess = self.preprocess_text(guess)
        target = self.preprocess_text(target)
        if not guess or not target:
            return None
        return guess, target

    def _combine_scores(self, simple: float, partial: float,
                        token_sort: float, token_set: float) -> float:
        """Blend the four strategy scores using ``_STRATEGY_WEIGHTS``."""
        weights = self._STRATEGY_WEIGHTS
        return (
            simple * weights['simple'] +
            partial * weights['partial'] +
            token_sort * weights['token_sort'] +
            token_set * weights['token_set']
        )

    def simple_ratio_match(self, guess: str, target: str) -> float:
        """Basic Levenshtein ratio matching.

        Returns 0.0 when either input is empty after normalization.
        """
        pair = self._prepare_pair(guess, target)
        if pair is None:
            return 0.0
        return Levenshtein.ratio(*pair)

    def partial_ratio_match(self, guess: str, target: str) -> float:
        """Partial ratio matching - good for when guess is part of target
        or vice versa.

        Returns 0.0 when either input is empty after normalization.
        """
        pair = self._prepare_pair(guess, target)
        if pair is None:
            return 0.0
        # thefuzz scores are on a 0-100 scale.
        return thefuzz.partial_ratio(*pair) / 100.0

    def token_sort_ratio_match(self, guess: str, target: str) -> float:
        """Token sort ratio - good for when words are in different orders.

        Example: 'hello world' matches 'world hello'.
        Returns 0.0 when either input is empty after normalization.
        """
        pair = self._prepare_pair(guess, target)
        if pair is None:
            return 0.0
        return thefuzz.token_sort_ratio(*pair) / 100.0

    def token_set_ratio_match(self, guess: str, target: str) -> float:
        """Token set ratio - good for partial matches with extra words.

        Example: 'hello world' matches 'hello wonderful world'.
        Returns 0.0 when either input is empty after normalization.
        """
        pair = self._prepare_pair(guess, target)
        if pair is None:
            return 0.0
        return thefuzz.token_set_ratio(*pair) / 100.0

    def weighted_ratio_match(self, guess: str, target: str) -> float:
        """Combine the four matching strategies into one weighted score."""
        return self._combine_scores(
            self.simple_ratio_match(guess, target),
            self.partial_ratio_match(guess, target),
            self.token_sort_ratio_match(guess, target),
            self.token_set_ratio_match(guess, target),
        )

    def is_match(self, guess: str, target: str, match_type='weighted') -> bool:
        """Determine if guess matches target using the given strategy.

        ``match_type`` is one of ``'simple'``, ``'partial'``,
        ``'token_sort'``, ``'token_set'`` or ``'weighted'``.

        Raises:
            ValueError: if ``match_type`` is not one of the names above.
        """
        match_functions = {
            'simple': self.simple_ratio_match,
            'partial': self.partial_ratio_match,
            'token_sort': self.token_sort_ratio_match,
            'token_set': self.token_set_ratio_match,
            'weighted': self.weighted_ratio_match,
        }

        if match_type not in match_functions:
            raise ValueError(f"Unknown match type: {match_type}")

        score = match_functions[match_type](guess, target)
        return score >= self.threshold

    def get_match_details(self, guess: str, target: str) -> dict:
        """Get the score of every strategy, plus the weighted score.

        Each strategy runs once; the weighted score is derived from those
        results rather than recomputing all four strategies a second time.
        """
        simple = self.simple_ratio_match(guess, target)
        partial = self.partial_ratio_match(guess, target)
        token_sort = self.token_sort_ratio_match(guess, target)
        token_set = self.token_set_ratio_match(guess, target)

        return {
            'simple_ratio': simple,
            'partial_ratio': partial,
            'token_sort_ratio': token_sort,
            'token_set_ratio': token_set,
            'weighted_ratio': self._combine_scores(
                simple, partial, token_sort, token_set
            ),
        }
class SpacyMixin:
    """A processor that uses spaCy document similarity to create
    a list of recommended products."""

    def recommendation_by_similarity(self, values, products, initial_product,
                                     quantity: int,
                                     min_similarity: float = 0.8) -> list[int]:
        """Pick up to ``quantity`` product ids similar to ``initial_product``.

        Args:
            values: Data accepted by ``pandas.DataFrame``; the resulting
                frame must expose ``id`` and ``name`` columns.
            products: Sliceable collection used as the fallback result.
            initial_product: Object whose ``name`` attribute is the text
                every candidate name is compared against.
            quantity: Maximum number of recommendations to return.
            min_similarity: A candidate is kept only when its similarity is
                strictly greater than this value.

        Returns:
            A random selection of distinct ids (at most ``quantity``) among
            the candidates above ``min_similarity``. When there are no
            candidates at all, when spaCy or its French model cannot be
            loaded, or when no candidate is similar enough, the result is
            ``products[:quantity]`` so the frontend always gets something
            to display.
        """
        df = pandas.DataFrame(values)
        if df.empty:
            # Nothing to compare against; skip loading the language model.
            return products[:quantity]

        try:
            # Imported lazily so the module stays importable without spaCy.
            import spacy
            calculator = spacy.load('fr_core_news_md')
        except Exception:
            # Deliberate best-effort: instead of failing hard, return the
            # first set of available products to the frontend.
            logging.getLogger(__name__).warning(
                "spaCy model 'fr_core_news_md' is unavailable; "
                "falling back to the first products",
                exc_info=True,
            )
            return products[:quantity]

        # The reference document is the same for every row: build it once.
        reference_doc = calculator(initial_product.name)
        df['similarity'] = [
            calculator(name).similarity(reference_doc)
            for name in df['name']
        ]

        candidate_ids = df.loc[df['similarity'] > min_similarity, 'id'].to_list()
        if not candidate_ids:
            return products[:quantity]

        # sample() draws without replacement, so a product is never
        # recommended twice; cap k because there may be fewer candidates
        # than requested.
        count = max(0, min(quantity, len(candidate_ids)))
        return random.sample(candidate_ids, count)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment