Last active
December 18, 2017 21:29
-
-
Save jojonki/d70657f5ad014fc3f3d07dc8eda1c7a8 to your computer and use it in GitHub Desktop.
load_wowrd2vec_binray_gensim.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import torch
from gensim.models.keyedvectors import KeyedVectors
from torch import nn
# Location of the pretrained word2vec binary file.
model_path = './data/GoogleNews-vectors-negative300.bin'
# binary=True: the file is read as word2vec binary format rather than text.
model = KeyedVectors.load_word2vec_format(model_path, binary=True)
# Index the KeyedVectors object directly. Its `.wv` alias was deprecated in
# gensim 3.x and removed in gensim 4.0, whereas plain indexing works in both.
model['computer']  # array([ 1.07421875e-01, -2.01171875e-01, 1.23046875e-01, ...
# You can use these weights like this:
def load_embd_weights(word2vec, vocab_size, embd_size, w2i):
    """Build an embedding weight matrix from pretrained word vectors.

    Args:
        word2vec: Source of pretrained vectors. Either an object exposing
            the vectors through a ``.wv`` attribute, or the vector lookup
            itself (anything supporting ``word in obj`` and ``obj[word]``).
        vocab_size: Number of rows in the returned matrix.
        embd_size: Number of columns in the returned matrix; each
            pretrained vector must be assignable to a row of this length.
        w2i: Mapping from word to its row index in the returned matrix.

    Returns:
        A ``torch.FloatTensor`` of shape ``(vocab_size, embd_size)``. Rows
        for words missing from the pretrained vectors are left all-zero.

    Raises:
        IndexError: If an index in ``w2i`` is outside ``vocab_size`` rows.
    """
    # Accept both a full model (vectors under `.wv`) and a bare vector
    # lookup that has no `.wv` attribute of its own.
    vectors = getattr(word2vec, 'wv', word2vec)

    # Allocate as float32 up front so torch.from_numpy() yields a
    # FloatTensor directly, without a float64 intermediate copy.
    embedding_matrix = np.zeros((vocab_size, embd_size), dtype=np.float32)
    print('embed_matrix.shape', embedding_matrix.shape)

    found_ct = 0
    for word, idx in w2i.items():
        # Words not found in the pretrained vectors stay all-zeros.
        if word in vectors:
            embedding_matrix[idx] = vectors[word]
            found_ct += 1
    print(found_ct, 'words are found in word2vec. vocab_size is', vocab_size)

    return torch.from_numpy(embedding_matrix).type(torch.FloatTensor)
# `vocab_size`, `embd_size`, `w2i` and `is_train_embd` are not defined in this
# snippet and must be supplied by the surrounding program.
pre_embd_w = load_embd_weights(model, vocab_size, embd_size, w2i)
embd = nn.Embedding(vocab_size, embd_size)
# Replace the layer's initial weights with the pretrained matrix;
# requires_grad controls whether the embeddings receive gradients.
embd.weight = nn.Parameter(pre_embd_w, requires_grad=is_train_embd)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment