Skip to content

Instantly share code, notes, and snippets.

@emrepun
Last active December 5, 2019 22:59
Show Gist options
  • Save emrepun/ea480c6e6eaf073814732d21a0912458 to your computer and use it in GitHub Desktop.
Save emrepun/ea480c6e6eaf073814732d21a0912458 to your computer and use it in GitHub Desktop.
engine_gist_2.py
import re, math
from collections import Counter
class CosineSimilarity:
    """Bag-of-words cosine similarity between two pieces of text.

    Each text is split into word tokens, turned into a word -> count
    vector, and the cosine of the angle between the two count vectors is
    returned. Matching is case-sensitive.
    """

    # Compiled once at class creation instead of twice on every call.
    # A token is a run of word characters and/or apostrophes.
    _WORD_PATTERN = re.compile(r"[\w']+")

    def __init__(self):
        print("Cosine Similarity initialized")

    @staticmethod
    def cosine_similarity_of(text1, text2):
        """Return the cosine similarity of the word counts of two texts.

        Args:
            text1: First text (string).
            text2: Second text (string).

        Returns:
            A float in the range [0.0, 1.0]. Returns 0.0 when either text
            contains no tokens (the magnitude would be zero) or when the
            texts share no words. Texts with identical word counts yield
            exactly 1.0.
        """
        find_words = CosineSimilarity._WORD_PATTERN.findall

        # Word -> occurrence count for each text.
        vector1 = Counter(find_words(text1))
        vector2 = Counter(find_words(text2))

        # Only words present in both texts contribute to the dot product.
        dot_product = sum(
            count * vector2[word]
            for word, count in vector1.items()
            if word in vector2
        )

        squared_sum_vector1 = sum(count * count for count in vector1.values())
        squared_sum_vector2 = sum(count * count for count in vector2.values())

        # Take a single square root of the (exact, integer) product rather
        # than multiplying two separately rounded roots: sqrt(a) * sqrt(a)
        # is generally not equal to a in floating point, which made
        # identical texts score slightly below or even above 1.0.
        magnitude = math.sqrt(squared_sum_vector1 * squared_sum_vector2)
        if not magnitude:
            return 0.0
        return float(dot_product) / magnitude
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment