Skip to content

Instantly share code, notes, and snippets.

@vikramsoni2
Last active March 5, 2019 11:46
Show Gist options
  • Save vikramsoni2/3989c574ccf4afa4f13ce39b2087ffff to your computer and use it in GitHub Desktop.
Save vikramsoni2/3989c574ccf4afa4f13ce39b2087ffff to your computer and use it in GitHub Desktop.
Fuzzy matching between two arrays of strings. for each item in array1, selects the most similar item from array2
### Fuzzy matching between two arrays of strings.
### For each item in arr2, selects the most similar item from arr1 (cosine similarity of word counts).
import re, math
from collections import Counter
WORD = re.compile(r'\w+')
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-count vectors.

    Args:
        vec1: Mapping of term -> numeric weight (e.g. a ``Counter``).
        vec2: Mapping of term -> numeric weight.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)`` as a float, or ``0.0`` when
        either vector has zero magnitude (for example, when it is empty).
    """
    # Only terms present in both vectors contribute to the dot product.
    # Walking vec1 in its own order keeps the summation order deterministic.
    numerator = sum(
        weight * vec2[term] for term, weight in vec1.items() if term in vec2
    )
    sum1 = sum(weight ** 2 for weight in vec1.values())
    sum2 = sum(weight ** 2 for weight in vec2.values())
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        # A zero-length vector has no direction; define the similarity as 0.
        return 0.0
    return float(numerator) / denominator
def text_to_vector(text):
    """Return a ``Counter`` of the word tokens found in *text*.

    Tokens are whatever the module-level ``WORD`` pattern matches; each
    distinct token maps to the number of times it occurs in *text*.
    """
    return Counter(WORD.findall(text))
# Runs of these separator characters are collapsed to a single space before
# tokenizing. The underscore matters: ``\w`` matches it, so without this step
# a name like "first_name" would be treated as one token.
_SEPARATORS = re.compile(r'([ _./\[\(\)\]]+)')


def _normalized_vector(label):
    """Return the word-count vector of *label* after separator cleanup and lowercasing."""
    cleaned = _SEPARATORS.sub(' ', label).strip(' ').lower()
    return text_to_vector(cleaned)


def _best_matches(candidates, targets):
    """Map each target to the candidate with the highest cosine similarity.

    A target is only included when some candidate scores strictly above 0.0;
    on ties the earliest candidate wins.
    """
    # Vectorize every candidate once instead of once per target.
    candidate_vectors = [
        (candidate, _normalized_vector(candidate)) for candidate in candidates
    ]
    matches = {}
    for target in targets:
        target_vector = _normalized_vector(target)
        best_score = 0.0
        for candidate, candidate_vector in candidate_vectors:
            score = get_cosine(candidate_vector, target_vector)
            if score > best_score:
                best_score = score
                matches[target] = candidate
    return matches


# NOTE(review): arr1 and arr2 are not defined in the visible source; they are
# assumed to be iterables of strings created before this point - confirm.
# Result: {item from arr2: most similar item from arr1}.
selected_columns = _best_matches(arr1, arr2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment