Created
March 15, 2023 15:34
-
-
Save munro/b7652326c465a6a8aca4a98cf4a44878 to your computer and use it in GitHub Desktop.
Works well for smaller datasets!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from functools import lru_cache | |
from typing import Sequence, TypeVar | |
from fuzzywuzzy import fuzz | |
from unidecode import unidecode | |
T = TypeVar("T") | |
def fuzzy_dedupe_slow(a_values: Sequence[T], b_values: Sequence[T]) -> set[tuple[T, T, int]]:
    """Pair every value from both sequences with its best fuzzy match on the other side.

    Each value in ``a_values`` is paired with the ``b_values`` entry that gets
    the highest score from ``edit_distance``. Any ``b_values`` entry that was
    not picked in that pass is then paired with its own highest-scoring
    ``a_values`` entry, so every value from either side appears in at least
    one result tuple.

    The work is a brute-force comparison of every ``a`` against every ``b``.

    Args:
        a_values: Values for the left side of the pairing. They must be
            hashable, because they are stored in the result set.
        b_values: Values for the right side of the pairing. They must be
            hashable as well.

    Returns:
        A set of ``(a, b, score)`` tuples, where ``score`` is the value
        returned by ``edit_distance(a, b)``. The set is empty when either
        input is empty, since there is nothing to pair against.
    """
    # With one side empty there are no candidates to choose from; previously
    # this case raised IndexError.  len() is used rather than truthiness so
    # array-like sequences with ambiguous truth values keep working.
    if len(a_values) == 0 or len(b_values) == 0:
        return set()

    dedupe: set[tuple[T, T, int]] = set()

    for a in a_values:
        # max() keeps the first highest-scoring candidate, which is the same
        # winner a stable descending sort would put at index 0.
        best_b, score = max(
            ((b, edit_distance(a, b)) for b in b_values),
            key=lambda pair: pair[1],
        )
        dedupe.add((a, best_b, score))

    # Any b that no a selected still needs a partner of its own.
    matched_b = {b for _, b, _ in dedupe}
    remaining_b = set(b_values) - matched_b
    for b in remaining_b:
        best_a, score = max(
            ((a, edit_distance(a, b)) for a in a_values),
            key=lambda pair: pair[1],
        )
        dedupe.add((best_a, b, score))

    return dedupe
def test_fuzzy_dedupe_slow():
    """Check that values on both sides end up paired with their best-scoring match."""
    # "Meow" is chosen by neither left-hand value, so it is expected to be
    # paired in the second pass.
    expected = {
        ("hllowrld", "Hello World", 89),
        ("fb", "Foobar", 50),
        ("hllowrld", "Meow", 33),
    }
    result = fuzzy_dedupe_slow(
        ["hllowrld", "fb"],
        ["Hello World", "Foobar", "Meow"],
    )
    assert result == expected
@lru_cache(maxsize=128)
def edit_distance(a, b):
    """Return the ``fuzz.ratio`` score of the two values after normalisation.

    Both arguments are passed through ``clean_text`` before being compared.
    Despite the name, callers in this file treat a larger result as a closer
    match. Results are memoised per ``(a, b)`` argument pair, so both
    arguments must be hashable.
    """
    return fuzz.ratio(clean_text(a), clean_text(b))
@lru_cache(maxsize=128)
def clean_text(value: str):
    """Normalise ``value`` for fuzzy comparison.

    The text is passed through ``unidecode``, lower-cased, and then stripped
    of every character that is not an ASCII letter or digit.
    """
    ascii_lowered = unidecode(value).lower()
    # Spaces are dropped along with punctuation -- you may want to keep spaces though.
    return re.sub(r"[^A-Za-z0-9]", "", ascii_lowered)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment