Skip to content

Instantly share code, notes, and snippets.

@brymck
Last active April 28, 2018 17:12
Show Gist options
  • Save brymck/17d48c83753a98c071a57ecff7a6892c to your computer and use it in GitHub Desktop.
Save brymck/17d48c83753a98c071a57ecff7a6892c to your computer and use it in GitHub Desktop.
Fairly fast trigram fuzzy set implementation
# based on fuzzyset but considerably optimized to focus on trigrams and cosine similarity and be immutable through public
# interfaces after initialization
import re
from math import sqrt
from operator import itemgetter
from collections import defaultdict
_non_word_sub = re.compile(r'[^\w, ]+').sub
class TrigramFuzzySet(object):
def __init__(self, iterable=None):
self._match_dict = defaultdict(list)
self._items = []
for value in iterable:
self._add(value)
def _add(self, value):
lvalue = value.lower()
simplified = '-' + _non_word_sub('', lvalue) + '-'
gram_count = len(simplified) - 2
idx = len(self._items)
for i in range(gram_count):
self._match_dict[simplified[i:i + 3]].append(idx)
self._items.append((sqrt(gram_count), lvalue))
def get(self, value):
simplified = '-' + _non_word_sub('', value.lower()) + '-'
gram_count = len(simplified) - 2
norm = sqrt(gram_count)
matches = defaultdict(float)
match_dict = self._match_dict
for i in range(gram_count):
for idx in match_dict[simplified[i:i + 3]]:
matches[idx] += 1
items = self._items
results = [(match_score / (norm * items[idx][0]), items[idx][1])
for idx, match_score in matches.items()]
results.sort(reverse=True, key=itemgetter(0))
threshold = results[0][0] * 0.5
return [(score, lval) for score, lval in results[:100] if score > threshold]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment