Last active
April 9, 2018 15:02
-
-
Save EdisonChendi/d54597cbbba3afce459b18ac7d529c9a to your computer and use it in GitHub Desktop.
Calculate how different two docs are - word2vec
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding:UTF-8 | |
# Sample documents compared by the demo calls at the end of the script.
s1 = "my name is Hulk"
s2 = "my name is Iron man"
s3 = "your job is save the word"
import math
import re
from collections import defaultdict
from string import ascii_lowercase, ascii_uppercase, punctuation, whitespace
# Translation table applied to text before it is split into words:
# every whitespace and punctuation character becomes a single space, and
# ASCII uppercase letters are folded to lowercase so that "Hulk" and
# "hulk" are counted as the same word.
table = str.maketrans(
    whitespace + punctuation + ascii_uppercase,
    " " * len(whitespace + punctuation) + ascii_lowercase,
)


def str_2_dict(s):
    """Return a mapping from each word in *s* to its occurrence count.

    The text is normalised with ``table`` (separators become spaces, ASCII
    uppercase becomes lowercase) and then split on runs of whitespace.

    Args:
        s: The text to tokenise.

    Returns:
        A ``defaultdict(int)`` keyed by word. Text with no words (empty or
        made only of separators) yields an empty mapping.
    """
    d = defaultdict(int)
    # split() with no argument collapses runs of spaces, so adjacent
    # separators such as ", " never produce an empty-string "word".
    for w in s.translate(table).split():
        d[w] += 1
    return d


def vec_length(d):
    """Return the Euclidean length of the vector formed by *d*'s values."""
    return math.sqrt(sum(v ** 2 for v in d.values()))


def dot_product(d1, d2):
    """Return the dot product of two sparse vectors stored as mappings.

    Only keys present in both mappings contribute; a key missing from one
    side is treated as a zero component.
    """
    return sum(d1[k] * d2[k] for k in d1.keys() & d2.keys())


def word_vec(s1, s2):
    """Return the cosine similarity of the word-count vectors of two texts.

    Args:
        s1: First text.
        s2: Second text.

    Returns:
        The dot product of the two word-count vectors divided by the
        product of their lengths: about 1.0 when the texts have the same
        word proportions, 0.0 when they share no words. Returns 0.0 when
        either text contains no words.
    """
    d1 = str_2_dict(s1)
    d2 = str_2_dict(s2)
    norm_product = vec_length(d1) * vec_length(d2)
    if norm_product == 0:
        # A text without words has a zero-length vector; the cosine is
        # undefined there, so report "no similarity" instead of letting
        # the division raise ZeroDivisionError.
        return 0.0
    return dot_product(d1, d2) / norm_product
# Demo: print the similarity score for several pairs of sample texts.
print(word_vec(s1, s2))  # texts sharing some words
print(word_vec(s2, s3))
print(word_vec(s1, s3))
print(word_vec(s1, s1 + " " + s2))  # a text against itself plus extra words
print(word_vec(s1, s1 + " " + s1))  # a text against itself repeated twice
print(word_vec(s1, s1))  # a text against itself
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment