EdisonChendi · April 9, 2018 15:02
diff --git a/document_distance.py b/document_distance.py
 #coding:UTF-8

 # Sample documents; the demo calls at the bottom of the script compare them.
 s1, s2, s3 = (
     "my name is Hulk",
     "my name is Iron man",
     "your job is save the word",
 )

 from collections import defaultdict
 import re
 import math
 from string import whitespace, punctuation, ascii_uppercase

 # Translation table applied to text before it is split into words:
 #   * every whitespace and punctuation character becomes a space, and
 #   * ASCII uppercase letters are folded to lowercase, so "Hulk" and "hulk"
 #     are counted as the same word.
 table = str.maketrans(
     whitespace + punctuation + ascii_uppercase,
     " " * len(whitespace + punctuation) + ascii_uppercase.lower(),
 )


 def str_2_dict(s):
     """Return a mapping from each word in *s* to its number of occurrences.

     The text is normalised with ``table`` first, then split on whitespace.

     Args:
         s: The text to tokenise.

     Returns:
         A ``defaultdict`` whose missing keys read as 0. It is empty when
         *s* contains no words.
     """
     d = defaultdict(int)
     # split() without an argument collapses runs of separators, so adjacent
     # punctuation/whitespace never produces an empty-string "word".
     for w in s.translate(table).split():
         d[w] += 1
     return d


 def vec_length(d):
     """Return the Euclidean length of the vector formed by the values of *d*."""
     return math.sqrt(sum(v**2 for v in d.values()))


 def dot_product(d1, d2):
     """Return the dot product of two sparse vectors stored as mappings.

     Only keys present in both mappings contribute to the sum.
     """
     return sum(d1[k] * d2[k] for k in d1.keys() & d2.keys())


 def word_vec(s1, s2):
     """Return the cosine similarity of the word-count vectors of two strings.

     Args:
         s1: First text.
         s2: Second text.

     Returns:
         The dot product of the two word-count vectors divided by the product
         of their lengths, or 0.0 when either text contains no words (the
         ratio is undefined for a zero-length vector).
     """
     d1 = str_2_dict(s1)
     d2 = str_2_dict(s2)

     norm_product = vec_length(d1) * vec_length(d2)
     if norm_product == 0:
         # At least one text has no words; avoid dividing by zero.
         return 0.0

     return dot_product(d1, d2) / norm_product

 # Demo: print the similarity score for several pairings of the sample strings.
 for left, right in (
     (s1, s2),
     (s2, s3),
     (s1, s3),
     (s1, s1 + " " + s2),
     (s1, s1 + " " + s1),
     (s1, s1),
 ):
     print(word_vec(left, right))
	#coding:UTF-8

	# Sample documents; the demo calls at the bottom of the script compare them.
	s1, s2, s3 = (
	    "my name is Hulk",
	    "my name is Iron man",
	    "your job is save the word",
	)

	from collections import defaultdict
	import re
	import math
	from string import whitespace, punctuation, ascii_uppercase

	# Translation table applied to text before it is split into words:
	#   * every whitespace and punctuation character becomes a space, and
	#   * ASCII uppercase letters are folded to lowercase, so "Hulk" and "hulk"
	#     are counted as the same word.
	table = str.maketrans(
	    whitespace + punctuation + ascii_uppercase,
	    " " * len(whitespace + punctuation) + ascii_uppercase.lower(),
	)


	def str_2_dict(s):
	    """Return a mapping from each word in *s* to its number of occurrences.

	    The text is normalised with ``table`` first, then split on whitespace.

	    Args:
	        s: The text to tokenise.

	    Returns:
	        A ``defaultdict`` whose missing keys read as 0. It is empty when
	        *s* contains no words.
	    """
	    d = defaultdict(int)
	    # split() without an argument collapses runs of separators, so adjacent
	    # punctuation/whitespace never produces an empty-string "word".
	    for w in s.translate(table).split():
	        d[w] += 1
	    return d


	def vec_length(d):
	    """Return the Euclidean length of the vector formed by the values of *d*."""
	    return math.sqrt(sum(v**2 for v in d.values()))


	def dot_product(d1, d2):
	    """Return the dot product of two sparse vectors stored as mappings.

	    Only keys present in both mappings contribute to the sum.
	    """
	    return sum(d1[k] * d2[k] for k in d1.keys() & d2.keys())


	def word_vec(s1, s2):
	    """Return the cosine similarity of the word-count vectors of two strings.

	    Args:
	        s1: First text.
	        s2: Second text.

	    Returns:
	        The dot product of the two word-count vectors divided by the product
	        of their lengths, or 0.0 when either text contains no words (the
	        ratio is undefined for a zero-length vector).
	    """
	    d1 = str_2_dict(s1)
	    d2 = str_2_dict(s2)

	    norm_product = vec_length(d1) * vec_length(d2)
	    if norm_product == 0:
	        # At least one text has no words; avoid dividing by zero.
	        return 0.0

	    return dot_product(d1, d2) / norm_product

	# Demo: print the similarity score for several pairings of the sample strings.
	for left, right in (
	    (s1, s2),
	    (s2, s3),
	    (s1, s3),
	    (s1, s1 + " " + s2),
	    (s1, s1 + " " + s1),
	    (s1, s1),
	):
	    print(word_vec(left, right))