Created
April 23, 2017 11:21
-
-
Save Mohamedemad4/621c28d0feca4f64c3f4496857bb6fc9 to your computer and use it in GitHub Desktop.
This script is a quick intro to the Python wrapper for Google's word2vec
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# This script is a quick intro to the Python wrapper for Google's word2vec, found at https://github.com/danielfrg/word2vec
# You can download text8 from: http://mattmahoney.net/dc/text8.zip
#word2vec description :https://code.google.com/archive/p/word2vec/ | |
import time | |
import word2vec | |
from math import sqrt | |
def euclidean_distance(x, y):
    """Return the Euclidean distance between two vectors.

    Args:
        x: Iterable of numbers (first vector).
        y: Iterable of numbers (second vector).

    Returns:
        The square root of the sum of squared component differences, as a
        float. Two empty vectors give 0.0.

    Note:
        Components are paired with ``zip``, so if the inputs differ in
        length the extra trailing components of the longer one are ignored.
    """
    return sqrt(sum(pow(a - b, 2) for a, b in zip(x, y)))
# Train word2vec on the 'text8' corpus and save the resulting vectors to
# 'text8.bin' (100-dimensional vectors, verbose progress output).
word2vec.word2vec('text8', 'text8.bin', size=100, verbose=True)

# Load the saved model and report how long loading took.
t2 = time.time()
model = word2vec.load('text8.bin')
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Model Loaded in {0}S '.format(round(time.time() - t2, 3)))

# Look up the vector for a word, then pass that vector back to get_word
# (presumably returns the matching word -- confirm against the wrapper docs).
eng_vec = model.get_vector('english')
eng = model.get_word(eng_vec)

# NOTE: the values printed below are Euclidean distances, so a SMALLER number
# means the two words are closer together in the vector space.
print("Similarity of 'irish' and 'english' {0}".format(
    euclidean_distance(model.get_vector('irish'), eng_vec)))
print("Similarity of 'french' and 'english' {0}".format(
    euclidean_distance(model.get_vector('french'), eng_vec)))
# The label now names 'hello', the word whose vector is actually compared
# (it previously said 'stanza' while the code looked up 'hello').
print("Similarity of 'hello' and 'english' {0}".format(
    euclidean_distance(model.get_vector('hello'), eng_vec)))
print("Similarity of 'play' and 'hello' {0}".format(
    euclidean_distance(model.get_vector('play'), model.get_vector('hello'))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment