Last active
March 5, 2019 11:46
-
-
Save vikramsoni2/3989c574ccf4afa4f13ce39b2087ffff to your computer and use it in GitHub Desktop.
Fuzzy matching between two arrays of strings. For each item in array1, selects the most similar item from array2.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Fuzzy matching between two arrays.
### For each item in array1, selects the most similar item from array2.
import math
import re
from collections import Counter
# Matches one run of word characters (letters, digits, underscore); used to
# split text into tokens.
WORD = re.compile(r'\w+')
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight vectors.

    Args:
        vec1: Mapping of term -> numeric weight (e.g. a ``Counter``).
        vec2: Mapping of term -> numeric weight.

    Returns:
        The dot product over the keys shared by both mappings divided by the
        product of their Euclidean norms, or ``0.0`` when that product is
        zero (for example, when either mapping is empty).
    """
    # Only keys present in both vectors contribute to the dot product.
    shared = set(vec1) & set(vec2)
    numerator = sum(vec1[key] * vec2[key] for key in shared)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    denominator = norm1 * norm2

    # Guard against division by zero for empty / all-zero vectors.
    if not denominator:
        return 0.0
    return numerator / denominator
def text_to_vector(text):
    """Tokenize *text* and return its bag-of-words counts.

    Args:
        text: String to tokenize with the module-level ``WORD`` pattern.

    Returns:
        A ``Counter`` mapping each matched token to its number of
        occurrences in *text*.
    """
    return Counter(WORD.findall(text))
# Runs of these separator characters (space, underscore, dot, slash, square
# and round brackets) are collapsed into a single space before tokenizing.
_SEPARATORS = re.compile(r'[ _./\[\(\)\]]+')


def _normalize(text):
    """Return *text* lower-cased with separator runs replaced by one space."""
    return _SEPARATORS.sub(' ', text).strip(' ').lower()


# Maps each item of arr2 to the item of arr1 with the highest cosine
# similarity. An arr2 item gets no entry when no arr1 item scores above 0.0.
selected_columns = {}

# Vectorize the arr1 candidates once rather than once per arr2 item.
_candidate_vectors = [
    (candidate, text_to_vector(_normalize(candidate))) for candidate in arr1
]

for selected in arr2:
    selected_vector = text_to_vector(_normalize(selected))
    cosine_score = 0.0
    for to_select, candidate_vector in _candidate_vectors:
        tmp_score = get_cosine(candidate_vector, selected_vector)
        # Strict comparison keeps the first of several equally good candidates.
        if tmp_score > cosine_score:
            cosine_score = tmp_score
            selected_columns[selected] = to_select
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment