Created
June 9, 2017 02:58
-
-
Save behitek/87374b7492d1d62592b95b60c38235b2 to your computer and use it in GitHub Desktop.
GensimExample.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| import gensim | |
| from gensim.models import word2vec | |
def w2v(s1, s2, wordmodel):
    """Return a word-embedding similarity score for two sentences.

    Both sentences are split on whitespace, tokens that are missing from the
    model's vocabulary are dropped, and the remaining token lists are handed
    to ``wordmodel.n_similarity``.

    Args:
        s1: First sentence, as a whitespace-separated string.
        s2: Second sentence, as a whitespace-separated string.
        wordmodel: Object exposing ``n_similarity(words_a, words_b)`` and a
            vocabulary container, either ``key_to_index`` or ``vocab``, that
            supports ``in``.

    Returns:
        ``1.0`` when the two strings are identical; ``0.0`` when they share
        no token, or when either sentence has no in-vocabulary token left;
        otherwise whatever ``wordmodel.n_similarity`` returns.
    """
    if s1 == s2:
        return 1.0

    s1words = s1.split()
    s2words = s2.split()

    # NOTE(review): sentences with no literally shared token are scored 0.0
    # without consulting the embeddings. Kept as-is to preserve existing
    # results; confirm that this shortcut is intended.
    if not set(s1words) & set(s2words):
        return 0.0

    # Newer gensim releases expose the vocabulary as ``key_to_index``; older
    # ones use ``vocab``. Prefer the former and fall back to the latter.
    vocab = getattr(wordmodel, "key_to_index", None)
    if vocab is None:
        vocab = wordmodel.vocab

    # Filter by rebuilding the lists: ``list.remove`` only deletes the first
    # occurrence, which would leave repeated unknown words behind.
    s1words = [word for word in s1words if word in vocab]
    s2words = [word for word in s2words if word in vocab]

    # Nothing comparable is left, so do not call n_similarity with an empty list.
    if not s1words or not s2words:
        return 0.0

    return wordmodel.n_similarity(s1words, s2words)
if __name__ == '__main__':
    # Demo: load pretrained word2vec vectors and score two sentence pairs.
    # The model path may be given as the first command-line argument; without
    # one, the original hard-coded location is used.
    default_model_path = "C:\\Users\\Hieu Nguyen\\Desktop\\GoogleNews-vectors-negative300.bin.gz"
    wordmodelfile = sys.argv[1] if len(sys.argv) > 1 else default_model_path
    wordmodel = gensim.models.KeyedVectors.load_word2vec_format(wordmodelfile, binary=True)

    s1 = "As California Bounces Back , Governor Calls For Lofty Goals"
    s2 = "With California Rebounding, Governor Pushes Big Projects"
    # The original line was a bare tuple expression and printed nothing.
    print("sim(s1,s2) = %s /1." % w2v(s1, s2, wordmodel))

    # NOTE(review): "polution" is misspelled; kept unchanged because it is
    # input data for the demo.
    s3 = "Special measures for Beijing polution"
    s4 = "Smog cloud blankets Beijing"
    print("sim(s3,s4) = %s /1." % w2v(s3, s4, wordmodel))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment