w2v_training
import numpy as np

# Training
w2v.train(training_data)

class word2vec():
    def train(self, training_data):
        # Initialise weight matrices
        # Both w1 and w2 should be randomly initialised, but for this demo we pre-determine the arrays (getW1 and getW2)
        # getW1 - shape (9x10) and getW2 - shape (10x9)
        self.w1 = np.array(getW1)
        self.w2 = np.array(getW2)
        # self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
        # self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))

        # Cycle through each epoch
        for i in range(self.epochs):
            # Initialise loss to 0
            self.loss = 0

            # Cycle through each training sample
            # w_t = one-hot vector for the target word, w_c = one-hot vectors for the context words
            for w_t, w_c in training_data:
                # Forward pass - pass in the vector for the target word (w_t) to get:
                # 1. prediction using softmax (y_pred) 2. hidden layer (h) 3. output layer before softmax (u)
                y_pred, h, u = self.forward_pass(w_t)

                # Calculate error
                # 1. For the target word, calculate the difference between y_pred and each of the context word vectors
                # 2. Sum up the differences using np.sum to give the error for this particular target word
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                # Backpropagation
                # We use SGD to backpropagate the errors - compute the gradients and update the weights
                self.backprop(EI, h, w_t)

                # Calculate loss
                # There are 2 parts to the loss function:
                # Part 1: negative sum of the values of the output layer (u) at the positions of the context words
                # Part 2: number of context words * log of the sum of exp(u) over all elements of the output layer before softmax
                # Note: word.index(1) returns the index of the element with value 1 in the one-hot context word vector
                # Note: u[word.index(1)] returns the value of the output layer (before softmax) at that index
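                # As an equation, the loss accumulated on the next line is the usual skip-gram loss for one sample:
                #   E = -sum over context words c of u[j*_c]  +  len(w_c) * log(sum over j of exp(u[j]))
                # where j*_c is the vocabulary index of context word c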
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
            print('Epoch:', i, "Loss:", self.loss)

    def forward_pass(self, x):
        # x is the one-hot vector for the target word, shape 9x1
        # Run it through the first matrix (w1) to get the hidden layer - 10x9 dot 9x1 gives us 10x1
        h = np.dot(self.w1.T, x)
        # Dot product of the hidden layer with the second matrix (w2) - 9x10 dot 10x1 gives us 9x1
        u = np.dot(self.w2.T, h)
        # Run u (9x1) through softmax to force each element into the range [0, 1] - result is 9x1
        y_c = self.softmax(u)
        return y_c, h, u

    def softmax(self, x):
        # Subtract the max before exponentiating for numerical stability
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def backprop(self, e, h, x):
        # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.outer.html
        # The error vector e (EI) is the sum of the prediction errors across each context word for the current target word
        # Going backwards, take the derivative of the loss with respect to w2, then w1
        # h - shape 10x1, e - shape 9x1, dl_dw2 - shape 10x9
        dl_dw2 = np.outer(h, e)
        # x - shape 9x1, w2 - shape 10x9, np.dot(self.w2, e.T) - shape 10x1, dl_dw1 - shape 9x10
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
        # Update weights with the learning rate
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)
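
For context, here is a minimal sketch of how this training routine might be driven end to end. It assumes the rest of the class from the accompanying tutorial, namely an __init__ that reads the hyperparameters from a settings dict and a generate_training_data method that builds the one-hot (target, context) pairs; those names are assumptions here, not part of the snippet above.

# Hypothetical driver script - __init__ and generate_training_data are assumed from the full tutorial class
settings = {
    'window_size': 2,      # context words taken on each side of the target
    'n': 10,               # size of the hidden layer / word embedding dimension
    'epochs': 50,          # passes over the training data
    'learning_rate': 0.01  # step size used in backprop
}

text = "natural language processing and machine learning is fun and exciting"
corpus = [[word.lower() for word in text.split()]]   # 9 unique words, matching the 9x10 shapes above

w2v = word2vec()                                                # assumed to pick up `settings` in __init__
training_data = w2v.generate_training_data(settings, corpus)    # assumed helper: one-hot (w_t, w_c) pairs
w2v.train(training_data)

With a 9-word vocabulary and n = 10, the shapes line up with the 9x10 (w1) and 10x9 (w2) comments in the code.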