brymck · April 28, 2018 17:12
diff --git a/trigramfuzzyset.py b/trigramfuzzyset.py
 # based on fuzzyset but considerably optimized to focus on trigrams and cosine similarity and be immutable through public
 # interfaces after initialization
 import re
 from math import sqrt
 from operator import itemgetter
 from collections import defaultdict

 _non_word_sub = re.compile(r'[^\w, ]+').sub

 class TrigramFuzzySet(object):
    def __init__(self, iterable=None):
        self._match_dict = defaultdict(list)
        self._items = []
        for value in iterable:
            self._add(value)

    def _add(self, value):
        lvalue = value.lower()
        simplified = '-' + _non_word_sub('', lvalue) + '-'
        gram_count = len(simplified) - 2
        idx = len(self._items)
        for i in range(gram_count):
            self._match_dict[simplified[i:i + 3]].append(idx)
        self._items.append((sqrt(gram_count), lvalue))

    def get(self, value):
        simplified = '-' + _non_word_sub('', value.lower()) + '-'
        gram_count = len(simplified) - 2
        norm = sqrt(gram_count)

        matches = defaultdict(float)
        match_dict = self._match_dict
        for i in range(gram_count):
            for idx in match_dict[simplified[i:i + 3]]:
                matches[idx] += 1

        items = self._items
        results = [(match_score / (norm * items[idx][0]), items[idx][1])
                   for idx, match_score in matches.items()]
        results.sort(reverse=True, key=itemgetter(0))

        threshold = results[0][0] * 0.5
        return [(score, lval) for score, lval in results[:100] if score > threshold]
	# based on fuzzyset but considerably optimized to focus on trigrams and cosine similarity and be immutable through public
	# interfaces after initialization
	import re
	from math import sqrt
	from operator import itemgetter
	from collections import defaultdict

	_non_word_sub = re.compile(r'[^\w, ]+').sub

	class TrigramFuzzySet(object):
	def __init__(self, iterable=None):
	self._match_dict = defaultdict(list)
	self._items = []
	for value in iterable:
	self._add(value)

	def _add(self, value):
	lvalue = value.lower()
	simplified = '-' + _non_word_sub('', lvalue) + '-'
	gram_count = len(simplified) - 2
	idx = len(self._items)
	for i in range(gram_count):
	self._match_dict[simplified[i:i + 3]].append(idx)
	self._items.append((sqrt(gram_count), lvalue))

	def get(self, value):
	simplified = '-' + _non_word_sub('', value.lower()) + '-'
	gram_count = len(simplified) - 2
	norm = sqrt(gram_count)

	matches = defaultdict(float)
	match_dict = self._match_dict
	for i in range(gram_count):
	for idx in match_dict[simplified[i:i + 3]]:
	matches[idx] += 1

	items = self._items
	results = [(match_score / (norm * items[idx][0]), items[idx][1])
	for idx, match_score in matches.items()]
	results.sort(reverse=True, key=itemgetter(0))

	threshold = results[0][0] * 0.5
	return [(score, lval) for score, lval in results[:100] if score > threshold]