import gensim
# Load Google's pre-trained Word2Vec model.
# Download: https://github.com/mmihaltz/word2vec-GoogleNews-vectors
import os  # '~' is not expanded by the loader, so expand it explicitly
model = gensim.models.Word2Vec.load_word2vec_format(
    os.path.expanduser('~/Documents/GoogleNews-vectors-negative300.bin'),
    binary=True)
/Users/jeff/Documents/jeffcode/pond5/seo/langenv/lib/python2.7/site-packages/gensim/utils.py:1015: UserWarning: Pattern library is not installed, lemmatization won't be available.
warnings.warn("Pattern library is not installed, lemmatization won't be available.")
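# Optional: the model holds 3M x 300 float32 vectors, so it takes several GB
# of RAM. If you will only query it (never retrain), one memory-saving step
# is to precompute the L2-normalized vectors in place -- a sketch, not
# required for anything below:
model.init_sims(replace=True)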
import pandas as pd
def get_similar_words(your_word, top_n=20):
    # Multiword entries in this vocabulary use underscores, not spaces.
    if " " in your_word:
        raise Exception("use underscores not spaces in: {}".format(your_word))
    try:
        gensim_output = model.similar_by_word(your_word, topn=top_n)
    except KeyError:
        raise Exception("That word wasn't in the vocabulary: {}".format(your_word))
    return {your_word: gensim_output}
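# Quick sanity check of the two failure modes handled above (the bad inputs
# are hypothetical examples):
get_similar_words("pit_bulls")     # fine: underscores, in the vocabulary
# get_similar_words("pit bulls")   # raises: spaces are not allowed
# get_similar_words("qwertyzzz")   # raises: not in the vocabulary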
words_of_interest = ["dogs", "city", "man", "woman", "trump", "clinton"]
data = map(get_similar_words, words_of_interest)
# Keep only the reasonably confident neighbors (cosine similarity > 0.55).
for result in data:
    for k, v in result.items():
        for recco_term, score in v:
            if score > 0.55:
                print(k, recco_term, score)
('dogs', u'dog', 0.8680489659309387)
('dogs', u'canines', 0.8181711435317993)
('dogs', u'cats', 0.7651764154434204)
('dogs', u'pit_bulls', 0.7548302412033081)
('dogs', u'pets', 0.7424418330192566)
('dogs', u'puppies', 0.7385992407798767)
('dogs', u'pooches', 0.7162367105484009)
('dogs', u'German_shepherds', 0.7071062922477722)
('dogs', u'animals', 0.6985694169998169)
('dogs', u'pit_bull', 0.6983615159988403)
('dogs', u'puppy', 0.69722580909729)
('dogs', u'Rottweilers', 0.6932708024978638)
('dogs', u'Labrador_Retrievers', 0.6893705129623413)
('dogs', u'pitbulls', 0.6863663196563721)
('dogs', u'Labrador_retrievers', 0.6796903610229492)
('dogs', u'German_Shepherds', 0.6734660863876343)
('dogs', u'pit_bulls_rottweilers', 0.6698372960090637)
('dogs', u'Shelties', 0.669748842716217)
('dogs', u'dachshunds', 0.6692028045654297)
('dogs', u'pit_bull_terriers', 0.6684573888778687)
('city', u'citys', 0.6803999543190002)
('city', u'mayor', 0.6751153469085693)
('city', u'town', 0.6723740100860596)
('city', u'municipality', 0.6530812382698059)
('city', u'municipal', 0.6222546696662903)
('city', u'downtown', 0.6198148727416992)
('city', u'thecity', 0.6003420948982239)
('city', u'Mayor', 0.5988517999649048)
('city', u'district', 0.5929595232009888)
('city', u'county', 0.5858184099197388)
('city', u'neighborhoods', 0.5759680271148682)
('city', u'City', 0.5731768608093262)
('city', u'metropolis', 0.5717256665229797)
('city', u'cities', 0.5715821981430054)
('city', u'citywide', 0.5690124034881592)
('city', u'neighborhood', 0.5595947504043579)
('city', u'council', 0.5540551543235779)
('city', u'township', 0.5518128275871277)
('man', u'woman', 0.7664012312889099)
('man', u'boy', 0.6824870705604553)
('man', u'teenager', 0.6586930155754089)
('man', u'teenage_girl', 0.6147903203964233)
('man', u'girl', 0.5921714305877686)
('man', u'suspected_purse_snatcher', 0.571636438369751)
('man', u'robber', 0.5585119724273682)
('man', u'Robbery_suspect', 0.5584409832954407)
('man', u'teen_ager', 0.5549197196960449)
('woman', u'man', 0.7664012312889099)
('woman', u'girl', 0.7494641542434692)
('woman', u'teenage_girl', 0.7336830496788025)
('woman', u'teenager', 0.6317086219787598)
('woman', u'lady', 0.6288787126541138)
('woman', u'teenaged_girl', 0.6141784191131592)
('woman', u'mother', 0.607630729675293)
('woman', u'policewoman', 0.6069462299346924)
('woman', u'boy', 0.5975908041000366)
('woman', u'Woman', 0.5770982503890991)
('woman', u'sexually_assualted', 0.5723769664764404)
('woman', u'she', 0.5641393661499023)
('trump', u'trumps', 0.7198435068130493)
('trump', u'trumping', 0.580585241317749)
('trump', u'supersede', 0.5600422620773315)
('clinton', u'obama', 0.6934448480606079)
('clinton', u'mccain', 0.6902835369110107)
('clinton', u'hillary', 0.6762183308601379)
('clinton', u'barack_obama', 0.6693141460418701)
('clinton', u'reagan', 0.6634905338287354)
('clinton', u'clintons', 0.6601782441139221)
('clinton', u'john_mccain', 0.6575263738632202)
('clinton', u'kerry', 0.6521391272544861)
('clinton', u'palin', 0.6504560708999634)
('clinton', u'hillary_clinton', 0.6483214497566223)
('clinton', u'biden', 0.6376449465751648)
('clinton', u'george_bush', 0.6313134431838989)
('clinton', u'dodd', 0.6177599430084229)
('clinton', u'dems', 0.6080954074859619)
('clinton', u'hilary', 0.6074385643005371)
('clinton', u'dick_cheney', 0.60042405128479)
('clinton', u'barack', 0.6001541614532471)
('clinton', u'cheney', 0.5996928215026855)
('clinton', u'kennedy', 0.5983184576034546)
('clinton', u'pelosi', 0.5955385565757751)
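# most_similar (listed among the model's methods below) also supports the
# classic analogy arithmetic; a minimal sketch:
model.most_similar(positive=["woman", "king"], negative=["man"], topn=3)
# Expect something like u'queen' near the top of the results.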
# Check which words are in the model's vocab.
print len(model.vocab.keys()) # 3 million terms
model.vocab.keys()[:10]
3000000
[u'Allanah_Munson',
u'WINDS_WILL',
u'nab_sexual_predators',
u'By_Alexandra_Barham',
u'Mayor_Noramie_Jasmin',
u'Chief_Executive_Glenn_Tilton',
u'Neil_Kinnock',
u'Makoto_Tamada_JPN_Konica',
u'abductor_muscle',
u'visit_www.availability.sungard.com']
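# The vocabulary is case-sensitive (note the mixed-case entries above), which
# is why lowercase "trump" matched the verb sense (trumps, trumping,
# supersede) rather than the person. model.vocab is a plain dict, so
# membership checks are cheap:
"trump" in model.vocab      # True
"Trump" in model.vocab      # presumably also True, with a different vector
"pit bulls" in model.vocab  # False: multiword entries use underscores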
# List the model's public methods and attributes.
methods = [x for x in dir(model) if not x.startswith("_")]
sorted(methods)
['accuracy',
'alpha',
'batch_words',
'build_vocab',
'cbow_mean',
'clear_sims',
'create_binary_tree',
'cum_table',
'doesnt_match',
'estimate_memory',
'finalize_vocab',
'hashfxn',
'hs',
'index2word',
'init_sims',
'intersect_word2vec_format',
'iter',
'layer1_size',
'load',
'load_word2vec_format',
'log_accuracy',
'make_cum_table',
'max_vocab_size',
'min_alpha',
'min_alpha_yet_reached',
'min_count',
'most_similar',
'most_similar_cosmul',
'n_similarity',
'negative',
'null_word',
'random',
'reset_from',
'reset_weights',
'sample',
'save',
'save_word2vec_format',
'scale_vocab',
'scan_vocab',
'score',
'seed',
'seeded_vector',
'sg',
'similar_by_vector',
'similar_by_word',
'similarity',
'sort_vocab',
'sorted_vocab',
'syn0',
'syn0norm',
'total_train_time',
'train',
'train_count',
'update_weights',
'vector_size',
'vocab',
'window',
'wmdistance',
'workers']
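# Two of the listed methods in one-liner form; similarity() should agree with
# the man/woman score printed earlier (~0.766):
model.doesnt_match("breakfast cereal dinner lunch".split())  # -> u'cereal'
model.similarity("woman", "man")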
# Check the signature of similar_by_word
help(model.similar_by_word)
Help on method similar_by_word in module gensim.models.word2vec:
similar_by_word(self, word, topn=10, restrict_vocab=None) method of gensim.models.word2vec.Word2Vec instance
Find the top-N most similar words.
If topn is False, similar_by_word returns the vector of similarity scores.
`restrict_vocab` is an optional integer which limits the range of vectors which
are searched for most-similar values. For example, restrict_vocab=10000 would
only check the first 10000 word vectors in the vocabulary order. (This may be
meaningful if you've sorted the vocabulary by descending frequency.)
Example::
>>> trained_model.similar_by_word('graph')
[('user', 0.9999163150787354), ...]
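# Per the signature above, restrict_vocab limits the search to the first N
# vectors; the GoogleNews file is (roughly) sorted by descending frequency,
# so this trades recall of rare terms for speed. A minimal sketch:
model.similar_by_word('dogs', topn=5, restrict_vocab=100000)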