Skip to content

Instantly share code, notes, and snippets.

@munro
Created March 15, 2023 15:34
Show Gist options
  • Save munro/b7652326c465a6a8aca4a98cf4a44878 to your computer and use it in GitHub Desktop.
Save munro/b7652326c465a6a8aca4a98cf4a44878 to your computer and use it in GitHub Desktop.
Works well for smaller datasets!
import re
from functools import lru_cache
from typing import Sequence, TypeVar
from fuzzywuzzy import fuzz
from unidecode import unidecode
T = TypeVar("T")
def fuzzy_dedupe_slow(a_values: Sequence[T], b_values: Sequence[T]) -> set[tuple[T, T, int]]:
    """Pair every value from both sequences with its best fuzzy match in the other.

    Pass 1: each item of ``a_values`` is paired with the item of ``b_values``
    that gets the highest ``edit_distance`` score.
    Pass 2: each item of ``b_values`` that was not picked in pass 1 is paired
    with its highest-scoring item of ``a_values``.

    Every pair is scored, so the work grows with
    ``len(a_values) * len(b_values)``.

    Args:
        a_values: Candidates for the first slot of each result tuple.
            Items must be hashable because they are stored in sets.
        b_values: Candidates for the second slot of each result tuple.
            Items must be hashable because they are stored in sets.

    Returns:
        A set of ``(a, b, score)`` tuples. The set is empty when either
        input is empty, because there is nothing to pair against.
    """
    dedupe: set[tuple[T, T, int]] = set()

    # With one side empty there is no candidate to pick a best match from;
    # return early instead of failing on an empty candidate list.
    # ``len`` is used rather than truthiness so array-like sequences work too.
    if len(a_values) == 0 or len(b_values) == 0:
        return dedupe

    # Pass 1: best b for every a. ``max`` keeps the first of several equally
    # scored candidates, matching a stable descending sort followed by [0].
    for a in a_values:
        scored_b = [(b, edit_distance(a, b)) for b in b_values]
        best_b, score = max(scored_b, key=lambda pair: pair[1])
        dedupe.add((a, best_b, score))

    # Pass 2: any b nobody picked still gets paired with its best a.
    matched_b = {b for _, b, _ in dedupe}
    for b in b_values:
        if b in matched_b:
            continue
        scored_a = [(a, edit_distance(a, b)) for a in a_values]
        best_a, score = max(scored_a, key=lambda pair: pair[1])
        dedupe.add((best_a, b, score))

    return dedupe
def test_fuzzy_dedupe_slow():
    """Check the two-pass best-match pairing on a small hand-written example."""
    expected = {
        ("hllowrld", "Hello World", 89),
        ("fb", "Foobar", 50),
        # "Meow" is not the best match of any left-hand value, so it is
        # expected to be paired up by the second pass.
        ("hllowrld", "Meow", 33),
    }
    actual = fuzzy_dedupe_slow(
        ["hllowrld", "fb"],
        ["Hello World", "Foobar", "Meow"],
    )
    assert actual == expected
@lru_cache(maxsize=128)
def edit_distance(a, b):
    """Score how alike two values are once both have been normalised.

    Both arguments are passed through ``clean_text`` and the cleaned strings
    are handed to ``fuzz.ratio``, whose result is returned unchanged.
    Despite the name, callers in this file treat a HIGHER score as a better
    match, so this behaves as a similarity score rather than a distance.

    Results are memoised, so both arguments must be hashable.
    """
    return fuzz.ratio(clean_text(a), clean_text(b))
@lru_cache(maxsize=128)
def clean_text(value: str):
    """Normalise text for comparison: unidecode, lower-case, alphanumerics only.

    The value is run through ``unidecode`` and lower-cased, then every
    character that is not an ASCII letter or digit is removed.
    Results are memoised per input string.
    """
    lowered = unidecode(value).lower()
    # Everything that is not a letter or digit is dropped, spaces included.
    # Add the space to the character class if word boundaries should survive.
    return re.sub(r"[^A-Za-z0-9]", "", lowered)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment