Skip to content

Instantly share code, notes, and snippets.

@EdisonChendi
Last active April 9, 2018 15:02
Show Gist options
  • Save EdisonChendi/d54597cbbba3afce459b18ac7d529c9a to your computer and use it in GitHub Desktop.
Save EdisonChendi/d54597cbbba3afce459b18ac7d529c9a to your computer and use it in GitHub Desktop.
Calculate how different two docs are - word2vec
#coding:UTF-8
# Sample sentences that the demonstration calls in this script compare.
s1, s2, s3 = (
    "my name is Hulk",
    "my name is Iron man",
    "your job is save the word",
)
from collections import defaultdict
import re
import math
from string import whitespace, punctuation, ascii_uppercase
# Translation table applied to text before it is split into words:
# every whitespace and punctuation character becomes a single space, and
# every ASCII uppercase letter becomes its lowercase form so that "Name"
# and "name" are counted as the same word.
_SEPARATORS = whitespace + punctuation
table = str.maketrans(
    _SEPARATORS + ascii_uppercase,
    " " * len(_SEPARATORS) + ascii_uppercase.lower(),
)


def str_2_dict(s):
    """Return a mapping of each word in *s* to its number of occurrences.

    The text is normalised with ``table`` (separators become spaces, ASCII
    letters are lowercased) and then split on runs of spaces.

    Args:
        s: The text to tokenise.

    Returns:
        A ``defaultdict`` keyed by word whose values are occurrence counts.
        Looking up a word that was never seen yields 0. Text containing no
        words produces an empty mapping.
    """
    d = defaultdict(int)
    # split() with no argument collapses consecutive spaces and ignores
    # leading/trailing ones, so adjacent separators such as ", " never
    # produce an empty-string "word".
    for w in s.translate(table).split():
        d[w] += 1
    return d
def vec_length(d):
    """Return the Euclidean length of the vector held in the values of *d*.

    Args:
        d: A mapping whose values are the numeric vector components.

    Returns:
        The square root of the sum of the squared values, as a float.
    """
    squared_total = 0
    for component in d.values():
        squared_total += component ** 2
    return math.sqrt(squared_total)
def dot_product(d1, d2):
    """Return the dot product of two sparse vectors stored as mappings.

    Only keys present in both mappings contribute to the result.

    Args:
        d1: First mapping of key to numeric component.
        d2: Second mapping of key to numeric component.

    Returns:
        The sum of ``d1[k] * d2[k]`` over the keys shared by both mappings;
        0 when they share no key.
    """
    shared_keys = d1.keys() & d2.keys()
    total = 0
    for key in shared_keys:
        total += d1[key] * d2[key]
    return total
def word_vec(s1, s2):
    """Return the cosine similarity of the word-count vectors of two texts.

    Each text is converted to a word-count mapping with ``str_2_dict``; the
    result is the dot product of the two mappings divided by the product of
    their Euclidean lengths.

    Args:
        s1: The first text.
        s2: The second text.

    Returns:
        A float that is 0.0 when the texts share no word and approaches 1.0
        as their word distributions become proportional. 0.0 is also
        returned when either vector has zero length, instead of raising
        ``ZeroDivisionError``.
    """
    d1 = str_2_dict(s1)
    d2 = str_2_dict(s2)
    norm_product = vec_length(d1) * vec_length(d2)
    if norm_product == 0:
        # A zero-length vector has no direction, so the similarity is
        # undefined; report "nothing in common" rather than dividing by 0.
        return 0.0
    return dot_product(d1, d2) / norm_product
# Print the similarity score of each sample pairing, one result per line.
for doc_a, doc_b in (
    (s1, s2),
    (s2, s3),
    (s1, s3),
    (s1, s1 + " " + s2),
    (s1, s1 + " " + s1),
    (s1, s1),
):
    print(word_vec(doc_a, doc_b))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment