Last active
April 28, 2018 17:12
-
-
Save brymck/17d48c83753a98c071a57ecff7a6892c to your computer and use it in GitHub Desktop.
Fairly fast trigram fuzzy set implementation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# based on fuzzyset but considerably optimized to focus on trigrams and cosine similarity and be immutable through public | |
# interfaces after initialization | |
import re | |
from math import sqrt | |
from operator import itemgetter | |
from collections import defaultdict | |
_non_word_sub = re.compile(r'[^\w, ]+').sub | |
class TrigramFuzzySet(object): | |
def __init__(self, iterable=None): | |
self._match_dict = defaultdict(list) | |
self._items = [] | |
for value in iterable: | |
self._add(value) | |
def _add(self, value): | |
lvalue = value.lower() | |
simplified = '-' + _non_word_sub('', lvalue) + '-' | |
gram_count = len(simplified) - 2 | |
idx = len(self._items) | |
for i in range(gram_count): | |
self._match_dict[simplified[i:i + 3]].append(idx) | |
self._items.append((sqrt(gram_count), lvalue)) | |
def get(self, value): | |
simplified = '-' + _non_word_sub('', value.lower()) + '-' | |
gram_count = len(simplified) - 2 | |
norm = sqrt(gram_count) | |
matches = defaultdict(float) | |
match_dict = self._match_dict | |
for i in range(gram_count): | |
for idx in match_dict[simplified[i:i + 3]]: | |
matches[idx] += 1 | |
items = self._items | |
results = [(match_score / (norm * items[idx][0]), items[idx][1]) | |
for idx, match_score in matches.items()] | |
results.sort(reverse=True, key=itemgetter(0)) | |
threshold = results[0][0] * 0.5 | |
return [(score, lval) for score, lval in results[:100] if score > threshold] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment