import numpy as np


class Embedding(object):
    def __init__(self, vocab_file, vectors_file):
        # vocab_file: one entry per line; the word is the first
        # whitespace-separated token (any counts after it are ignored)
        with open(vocab_file, 'r') as f:
            words = [x.rstrip().split(' ')[0] for x in f.readlines()]
        # vectors_file: one word per line followed by its vector components
        with open(vectors_file, 'r') as f:
            vectors = {}
            for line in f:
                vals = line.rstrip().split(' ')
                vectors[vals[0]] = [float(x) for x in vals[1:]]
        vocab_size = len(words)
        vocab = {w: idx for idx, w in enumerate(words)}
        ivocab = {idx: w for idx, w in enumerate(words)}
        vector_dim = len(vectors[ivocab[0]])
        # stack the vectors into a (vocab_size, vector_dim) matrix,
        # one row per word, in vocabulary order
        W = np.zeros((vocab_size, vector_dim))
        for word, v in vectors.items():
            if word == '<unk>':
                continue
            W[vocab[word], :] = v
        # normalize each word vector to unit length (L2 norm)
        d = np.sum(W ** 2, 1) ** 0.5
        W_norm = (W.T / d).T
        self.W = W_norm
        self.vocab = vocab
        self.ivocab = ivocab
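
A minimal usage sketch, assuming GloVe-style input files (the names vocab.txt and vectors.txt, and the query word 'king', are hypothetical). Because the rows of W are unit-length, cosine similarity reduces to a dot product, which makes nearest-neighbour lookup a single matrix-vector product:

    # hypothetical file names; any GloVe-style vocab/vectors pair works
    emb = Embedding('vocab.txt', 'vectors.txt')

    # cosine similarity of every word against the query vector
    query = emb.W[emb.vocab['king']]
    scores = emb.W.dot(query)

    # top 5 most similar words (the query itself will rank first)
    nearest = [emb.ivocab[i] for i in np.argsort(-scores)[:5]]
    print(nearest)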