Last active
December 5, 2019 22:59
-
-
Save emrepun/ea480c6e6eaf073814732d21a0912458 to your computer and use it in GitHub Desktop.
engine_gist_2.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re, math | |
from collections import Counter | |
class CosineSimilarity:
    """Bag-of-words cosine similarity between two pieces of text."""

    # A token is a run of word characters and/or apostrophes. Compiled once
    # at class-creation time instead of twice on every call.
    _WORD_PATTERN = re.compile(r"[\w']+")

    def __init__(self):
        print("Cosine Similarity initialized")

    @staticmethod
    def cosine_similarity_of(text1: str, text2: str) -> float:
        """Return the cosine similarity of the word-count vectors of two texts.

        Each text is split into tokens matching ``[\\w']+`` and turned into a
        word -> occurrence-count mapping. Tokens are compared exactly as
        written (no case folding or stemming is applied).

        Args:
            text1: First text to compare.
            text2: Second text to compare.

        Returns:
            The dot product of the two count vectors divided by the product
            of their Euclidean norms. Returns ``0.0`` when either text
            contains no tokens (the norm product is zero).
        """
        counts1 = Counter(CosineSimilarity._WORD_PATTERN.findall(text1))
        counts2 = Counter(CosineSimilarity._WORD_PATTERN.findall(text2))

        # Only words present in both texts contribute to the dot product.
        shared_words = counts1.keys() & counts2.keys()
        dot_product = sum(counts1[word] * counts2[word] for word in shared_words)

        squared_norm1 = sum(count * count for count in counts1.values())
        squared_norm2 = sum(count * count for count in counts2.values())

        # Take a single square root of the exact integer product rather than
        # multiplying two separately rounded roots: sqrt(2) * sqrt(2) is
        # 2.0000000000000004 in floating point, which made identical texts
        # score 0.9999999999999998 instead of 1.0.
        magnitude = math.sqrt(squared_norm1 * squared_norm2)
        if not magnitude:
            return 0.0
        return float(dot_product) / magnitude
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment