Skip to content

Instantly share code, notes, and snippets.

@behitek
Created June 9, 2017 02:58
Show Gist options
  • Select an option

  • Save behitek/87374b7492d1d62592b95b60c38235b2 to your computer and use it in GitHub Desktop.

Select an option

Save behitek/87374b7492d1d62592b95b60c38235b2 to your computer and use it in GitHub Desktop.
GensimExample.py
import sys
import gensim
from gensim.models import word2vec
def w2v(s1, s2, wordmodel):
    """Return the word-embedding similarity of two sentences.

    Both sentences are split on whitespace, words that are missing from the
    model's vocabulary are dropped, and the remaining word lists are compared
    with ``wordmodel.n_similarity``.

    Args:
        s1: First sentence.
        s2: Second sentence.
        wordmodel: Word-vector model exposing ``n_similarity(ws1, ws2)`` and a
            vocabulary container, either ``key_to_index`` (gensim 4) or
            ``vocab`` (older gensim releases).

    Returns:
        ``1.0`` when the two strings are identical; ``0.0`` when they share no
        word at all, or when either sentence has no in-vocabulary word left
        after filtering; otherwise the value of ``wordmodel.n_similarity``.
    """
    if s1 == s2:
        return 1.0

    s1words = s1.split()
    s2words = s2.split()

    # Sentences without a single common token are scored as unrelated.
    if set(s1words).isdisjoint(s2words):
        return 0.0

    # The vocabulary considered in the word embeddings.  gensim 4 renamed
    # ``vocab`` to ``key_to_index``; support both.
    vocab = getattr(wordmodel, "key_to_index", None)
    if vocab is None:
        vocab = wordmodel.vocab

    # Drop EVERY occurrence of out-of-vocabulary words.  (Calling
    # ``list.remove`` once per distinct word left duplicates behind, which
    # then made ``n_similarity`` fail with a KeyError.)
    s1words = [word for word in s1words if word in vocab]
    s2words = [word for word in s2words if word in vocab]

    # n_similarity cannot average an empty list of vectors.
    if not s1words or not s2words:
        return 0.0

    return wordmodel.n_similarity(s1words, s2words)
if __name__ == '__main__':
    # Location of the pretrained word2vec binary.  The first command-line
    # argument, when given, overrides the built-in default path.
    default_wordmodelfile = "C:\\Users\\Hieu Nguyen\\Desktop\\GoogleNews-vectors-negative300.bin.gz"
    wordmodelfile = sys.argv[1] if len(sys.argv) > 1 else default_wordmodelfile
    wordmodel = gensim.models.KeyedVectors.load_word2vec_format(wordmodelfile, binary=True)

    s1 = "As California Bounces Back , Governor Calls For Lofty Goals"
    s2 = "With California Rebounding, Governor Pushes Big Projects"
    # The print call and its arguments must be one statement; when they were
    # split over two lines the similarity was computed and thrown away.
    print("sim(s1,s2) = ", w2v(s1, s2, wordmodel), "/1.")

    s3 = "Special measures for Beijing polution"
    s4 = "Smog cloud blankets Beijing"
    print("sim(s3,s4) = ", w2v(s3, s4, wordmodel), "/1.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment