Skip to content

Instantly share code, notes, and snippets.

@jojonki
Last active December 18, 2017 21:29
Show Gist options
  • Select an option

  • Save jojonki/d70657f5ad014fc3f3d07dc8eda1c7a8 to your computer and use it in GitHub Desktop.

Select an option

Save jojonki/d70657f5ad014fc3f3d07dc8eda1c7a8 to your computer and use it in GitHub Desktop.
load_wowrd2vec_binray_gensim.py
import numpy as np
import torch
import torch.nn as nn
from gensim.models.keyedvectors import KeyedVectors
# Location of the pre-trained Google News word2vec vectors on disk.
model_path = './data/GoogleNews-vectors-negative300.bin'
# Reuse model_path instead of repeating the literal; binary=True selects the
# binary word2vec file format.
model = KeyedVectors.load_word2vec_format(model_path, binary=True)
# load_word2vec_format returns a KeyedVectors object, which is indexed by word
# directly (newer gensim releases no longer expose `.wv` on KeyedVectors).
model['computer']  # array([ 1.07421875e-01, -2.01171875e-01, 1.23046875e-01,
# You can use these weights like this:
def load_embd_weights(word2vec, vocab_size, embd_size, w2i):
    """Build an embedding weight tensor from pre-trained word vectors.

    Args:
        word2vec: Source of the pre-trained vectors. Either an object that
            exposes its vectors as ``.wv`` (a full model) or the keyed-vector
            lookup itself; the lookup must support ``word in obj`` and
            ``obj[word]``.
        vocab_size: Number of rows of the returned matrix.
        embd_size: Number of columns of the returned matrix (presumably the
            dimensionality of the pre-trained vectors -- confirm with caller).
        w2i: Mapping from word to its row index in the matrix.

    Returns:
        A float32 ``torch`` tensor of shape ``(vocab_size, embd_size)``.
        Rows for words that are missing from ``word2vec`` stay all-zero.
    """
    # Accept both a full model (vectors live under `.wv`) and a bare
    # keyed-vector object (which is the lookup itself), so the function works
    # regardless of whether the object passed in has a `.wv` attribute.
    vectors = getattr(word2vec, 'wv', word2vec)

    # Allocate as float32 up front: the result is returned as a float tensor
    # anyway, so this avoids a temporary float64 copy of the whole matrix.
    embedding_matrix = np.zeros((vocab_size, embd_size), dtype=np.float32)
    print('embed_matrix.shape', embedding_matrix.shape)

    found_ct = 0
    for word, idx in w2i.items():
        # Words not found in the pre-trained vectors keep their all-zero row.
        if word in vectors:
            embedding_matrix[idx] = vectors[word]
            found_ct += 1
    print(found_ct, 'words are found in word2vec. vocab_size is', vocab_size)

    return torch.from_numpy(embedding_matrix)
# Build the pre-trained weight tensor. vocab_size, embd_size and w2i are not
# defined in this snippet and must be provided by the surrounding code.
# (Fixed the misspelled call `load_embdf_weights`, which raised NameError.)
pre_embd_w = load_embd_weights(model, vocab_size, embd_size, w2i)
embd = nn.Embedding(vocab_size, embd_size)
# Replace the randomly initialised weights with the pre-trained ones;
# is_train_embd (defined elsewhere) decides whether they receive gradients.
embd.weight = nn.Parameter(pre_embd_w, requires_grad=is_train_embd)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment