import gensim
# Load Google's pre-trained Word2Vec model.
# Download: https://github.com/mmihaltz/word2vec-GoogleNews-vectors
import os  # '~' is not expanded by the loader, so expand it explicitly
model = gensim.models.Word2Vec.load_word2vec_format(
    os.path.expanduser('~/Documents/GoogleNews-vectors-negative300.bin'),
    binary=True)
/Users/jeff/Documents/jeffcode/pond5/seo/langenv/lib/python2.7/site-packages/gensim/utils.py:1015: UserWarning: Pattern library is not installed, lemmatization won't be available.
warnings.warn("Pattern library is not installed, lemmatization won't be available.")
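# Optional: the model holds 3M x 300 float32 vectors, so it takes several GB
# of RAM. If you will only query it (never retrain), one memory-saving step
# is to precompute the L2-normalized vectors in place -- a sketch, not
# required for anything below:
model.init_sims(replace=True)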
import pandas as pd
def get_similar_words(your_word, top_n=20):
    # Multiword entries in this vocabulary use underscores, not spaces.
    if " " in your_word:
        raise Exception("use underscores not spaces in: {}".format(your_word))
    try:
        gensim_output = model.similar_by_word(your_word, topn=top_n)
    except KeyError:
        raise Exception("That word wasn't in the vocabulary: {}".format(your_word))
    return {your_word: gensim_output}
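# Quick sanity check of the two failure modes handled above (the bad inputs
# are hypothetical examples):
get_similar_words("pit_bulls")     # fine: underscores, in the vocabulary
# get_similar_words("pit bulls")   # raises: spaces are not allowed
# get_similar_words("qwertyzzz")   # raises: not in the vocabulary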
words_of_interest = ["dogs", "city", "man", "woman", "trump", "clinton"]
data = map(get_similar_words, words_of_interest)
# Keep only the reasonably confident neighbors (cosine similarity > 0.55).
for result in data:
    for k, v in result.items():
        for recco_term, score in v:
            if score > 0.55:
                print(k, recco_term, score)
('dogs', u'dog', 0.8680489659309387)
('dogs', u'canines', 0.8181711435317993)
('dogs', u'cats', 0.7651764154434204)
('dogs', u'pit_bulls', 0.7548302412033081)
('dogs', u'pets', 0.7424418330192566)
('dogs', u'puppies', 0.7385992407798767)
('dogs', u'pooches', 0.7162367105484009)
('dogs', u'German_shepherds', 0.7071062922477722)
('dogs', u'animals', 0.6985694169998169)
('dogs', u'pit_bull', 0.6983615159988403)
('dogs', u'puppy', 0.69722580909729)
('dogs', u'Rottweilers', 0.6932708024978638)
('dogs', u'Labrador_Retrievers', 0.6893705129623413)
('dogs', u'pitbulls', 0.6863663196563721)
('dogs', u'Labrador_retrievers', 0.6796903610229492)
('dogs', u'German_Shepherds', 0.6734660863876343)
('dogs', u'pit_bulls_rottweilers', 0.6698372960090637)
('dogs', u'Shelties', 0.669748842716217)
('dogs', u'dachshunds', 0.6692028045654297)
('dogs', u'pit_bull_terriers', 0.6684573888778687)
('city', u'citys', 0.6803999543190002)
('city', u'mayor', 0.6751153469085693)
('city', u'town', 0.6723740100860596)
('city', u'municipality', 0.6530812382698059)
('city', u'municipal', 0.6222546696662903)
('city', u'downtown', 0.6198148727416992)
('city', u'thecity', 0.6003420948982239)
('city', u'Mayor', 0.5988517999649048)
('city', u'district', 0.5929595232009888)
('city', u'county', 0.5858184099197388)
('city', u'neighborhoods', 0.5759680271148682)
('city', u'City', 0.5731768608093262)
('city', u'metropolis', 0.5717256665229797)
('city', u'cities', 0.5715821981430054)
('city', u'citywide', 0.5690124034881592)
('city', u'neighborhood', 0.5595947504043579)
('city', u'council', 0.5540551543235779)
('city', u'township', 0.5518128275871277)
('man', u'woman', 0.7664012312889099)
('man', u'boy', 0.6824870705604553)
('man', u'teenager', 0.6586930155754089)
('man', u'teenage_girl', 0.6147903203964233)
('man', u'girl', 0.5921714305877686)
('man', u'suspected_purse_snatcher', 0.571636438369751)
('man', u'robber', 0.5585119724273682)
('man', u'Robbery_suspect', 0.5584409832954407)
('man', u'teen_ager', 0.5549197196960449)
('woman', u'man', 0.7664012312889099)
('woman', u'girl', 0.7494641542434692)
('woman', u'teenage_girl', 0.7336830496788025)
('woman', u'teenager', 0.6317086219787598)
('woman', u'lady', 0.6288787126541138)
('woman', u'teenaged_girl', 0.6141784191131592)
('woman', u'mother', 0.607630729675293)
('woman', u'policewoman', 0.6069462299346924)
('woman', u'boy', 0.5975908041000366)
('woman', u'Woman', 0.5770982503890991)
('woman', u'sexually_assualted', 0.5723769664764404)
('woman', u'she', 0.5641393661499023)
('trump', u'trumps', 0.7198435068130493)
('trump', u'trumping', 0.580585241317749)
('trump', u'supersede', 0.5600422620773315)
('clinton', u'obama', 0.6934448480606079)
('clinton', u'mccain', 0.6902835369110107)
('clinton', u'hillary', 0.6762183308601379)
('clinton', u'barack_obama', 0.6693141460418701)
('clinton', u'reagan', 0.6634905338287354)
('clinton', u'clintons', 0.6601782441139221)
('clinton', u'john_mccain', 0.6575263738632202)
('clinton', u'kerry', 0.6521391272544861)
('clinton', u'palin', 0.6504560708999634)
('clinton', u'hillary_clinton', 0.6483214497566223)
('clinton', u'biden', 0.6376449465751648)
('clinton', u'george_bush', 0.6313134431838989)
('clinton', u'dodd', 0.6177599430084229)
('clinton', u'dems', 0.6080954074859619)
('clinton', u'hilary', 0.6074385643005371)
('clinton', u'dick_cheney', 0.60042405128479)
('clinton', u'barack', 0.6001541614532471)
('clinton', u'cheney', 0.5996928215026855)
('clinton', u'kennedy', 0.5983184576034546)
('clinton', u'pelosi', 0.5955385565757751)
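# most_similar (listed among the model's methods below) also supports the
# classic analogy arithmetic; a minimal sketch:
model.most_similar(positive=["woman", "king"], negative=["man"], topn=3)
# Expect something like u'queen' near the top of the results.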
# Check which words are in the model's vocab.
print len(model.vocab.keys()) # 3 million terms
model.vocab.keys()[:10]
3000000
[u'Allanah_Munson',
u'WINDS_WILL',
u'nab_sexual_predators',
u'By_Alexandra_Barham',
u'Mayor_Noramie_Jasmin',
u'Chief_Executive_Glenn_Tilton',
u'Neil_Kinnock',
u'Makoto_Tamada_JPN_Konica',
u'abductor_muscle',
u'visit_www.availability.sungard.com']
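# The vocabulary is case-sensitive (note the mixed-case entries above), which
# is why lowercase "trump" matched the verb sense (trumps, trumping,
# supersede) rather than the person. model.vocab is a plain dict, so
# membership checks are cheap:
"trump" in model.vocab      # True
"Trump" in model.vocab      # presumably also True, with a different vector
"pit bulls" in model.vocab  # False: multiword entries use underscores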
# List the model's public methods and attributes.
methods = [x for x in dir(model) if not x.startswith("_")]
sorted(methods)
['accuracy',
'alpha',
'batch_words',
'build_vocab',
'cbow_mean',
'clear_sims',
'create_binary_tree',
'cum_table',
'doesnt_match',
'estimate_memory',
'finalize_vocab',
'hashfxn',
'hs',
'index2word',
'init_sims',
'intersect_word2vec_format',
'iter',
'layer1_size',
'load',
'load_word2vec_format',
'log_accuracy',
'make_cum_table',
'max_vocab_size',
'min_alpha',
'min_alpha_yet_reached',
'min_count',
'most_similar',
'most_similar_cosmul',
'n_similarity',
'negative',
'null_word',
'random',
'reset_from',
'reset_weights',
'sample',
'save',
'save_word2vec_format',
'scale_vocab',
'scan_vocab',
'score',
'seed',
'seeded_vector',
'sg',
'similar_by_vector',
'similar_by_word',
'similarity',
'sort_vocab',
'sorted_vocab',
'syn0',
'syn0norm',
'total_train_time',
'train',
'train_count',
'update_weights',
'vector_size',
'vocab',
'window',
'wmdistance',
'workers']
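# Two of the listed methods in one-liner form; similarity() should agree with
# the man/woman score printed earlier (~0.766):
model.doesnt_match("breakfast cereal dinner lunch".split())  # -> u'cereal'
model.similarity("woman", "man")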
# Check the signature of similar_by_word
help(model.similar_by_word)
Help on method similar_by_word in module gensim.models.word2vec:
similar_by_word(self, word, topn=10, restrict_vocab=None) method of gensim.models.word2vec.Word2Vec instance
Find the top-N most similar words.
If topn is False, similar_by_word returns the vector of similarity scores.
`restrict_vocab` is an optional integer which limits the range of vectors which
are searched for most-similar values. For example, restrict_vocab=10000 would
only check the first 10000 word vectors in the vocabulary order. (This may be
meaningful if you've sorted the vocabulary by descending frequency.)
Example::
>>> trained_model.similar_by_word('graph')
[('user', 0.9999163150787354), ...]
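# Per the signature above, restrict_vocab limits the search to the first N
# vectors; the GoogleNews file is (roughly) sorted by descending frequency,
# so this trades recall of rare terms for speed. A minimal sketch:
model.similar_by_word('dogs', topn=5, restrict_vocab=100000)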