import tensorflow as tf
import numpy as np

corpus_raw = 'He is the king . The king is royal . She is the royal queen '

# convert to lower case
corpus_raw = corpus_raw.lower()

words = []
for word in corpus_raw.split():
    if word != '.': # because we don't want to treat . as a word
        words.append(word)

words = set(words) # so that all duplicate words are removed
word2int = {}
int2word = {}
vocab_size = len(words) # gives the total number of unique words

for i, word in enumerate(words):
    word2int[word] = i
    int2word[i] = word
# raw_sentences is a list of sentences.
raw_sentences = corpus_raw.split('.')
sentences = []
for sentence in raw_sentences:
    sentences.append(sentence.split())

WINDOW_SIZE = 2

data = []
for sentence in sentences:
    for word_index, word in enumerate(sentence):
        for nb_word in sentence[max(word_index - WINDOW_SIZE, 0) : min(word_index + WINDOW_SIZE, len(sentence)) + 1]:
            if nb_word != word:
                data.append([word, nb_word])
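# Optional sanity check: for this toy corpus the first sentence is 'he is the king',
# so the first few skip-gram pairs should look like
# [['he', 'is'], ['he', 'the'], ['is', 'he'], ['is', 'the'], ['is', 'king']]
print(data[:5])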
# function to convert numbers to one hot vectors
def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp

x_train = [] # input word
y_train = [] # output word

for data_word in data:
    x_train.append(to_one_hot(word2int[data_word[0]], vocab_size))
    y_train.append(to_one_hot(word2int[data_word[1]], vocab_size))

# convert them to numpy arrays
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
# making placeholders for x_train and y_train
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

EMBEDDING_DIM = 5 # you can choose your own number

W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM])) # bias
hidden_representation = tf.add(tf.matmul(x, W1), b1)
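# Note: because each row of x is a one-hot vector, tf.matmul(x, W1) simply picks
# out one row of W1, so row i of (W1 + b1) ends up being the learned embedding
# of the word int2word[i].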
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_representation, W2), b2))

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init) # make sure you do this!

# define the loss function:
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))

# define the training step:
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)

n_iters = 10000
# train for n_iters iterations
for _ in range(n_iters):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print('loss is : ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))
vectors = sess.run(W1 + b1)

def euclidean_dist(vec1, vec2):
    return np.sqrt(np.sum((vec1 - vec2) ** 2))

def find_closest(word_index, vectors):
    min_dist = 10000 # to act like positive infinity
    min_index = -1
    query_vector = vectors[word_index]
    for index, vector in enumerate(vectors):
        if euclidean_dist(vector, query_vector) < min_dist and not np.array_equal(vector, query_vector):
            min_dist = euclidean_dist(vector, query_vector)
            min_index = index
    return min_index
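# Example usage (a quick illustrative check; 'queen' is just an arbitrary word
# from this toy vocabulary): print the word whose embedding lies closest to
# the embedding of 'queen'.
print(int2word[find_closest(word2int['queen'], vectors)])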
from sklearn.manifold import TSNE

model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
vectors = model.fit_transform(vectors)

from sklearn import preprocessing

normalizer = preprocessing.Normalizer(norm='l2') # the norm goes here; fit_transform's second argument is y
vectors = normalizer.fit_transform(vectors)
print(vectors)

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
print(words)
for word in words:
    print(word, vectors[word2int[word]][1])
    ax.annotate(word, (vectors[word2int[word]][0], vectors[word2int[word]][1]))

plt.show()
I enjoyed it as well. I added the code above and created a Jupyter notebook for it; it's a great learning tutorial.
Very good notebook and tutorial.
Though I get the reference to 42, the line print(int2word[42]) in your guide is a bit confusing. You may want to change it to an index that already exists in the dictionary, say [5], instead of 42.
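For example, a minimal sketch using the word2int / int2word dictionaries built above ('queen' is just an arbitrary word from the toy corpus):

# look up an index that is guaranteed to exist, rather than the out-of-range 42
# (this toy vocabulary only has 7 entries)
idx = word2int['queen']
print(idx, int2word[idx])  # round-trips back to 'queen'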
On Line 35:
if nb_word != word:
I believe this will generate incorrect neighbour pairs, since there are instances where a word might be its own neighbour, e.g. in "I think I want to go to the park". Both "I" and "to" have neighbour pairs containing themselves.
Will it though? Since we are taking a window of size 2, won't it just take [i, think] and then [think, i]?
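The skipped case does show up whenever the same word lands twice inside one window: in "i think i want to go to the park", the "i" at position 0 has the "i" at position 2 inside its window of 2, and the string comparison drops that pair. A minimal sketch that compares positions instead of strings (variable names here are only illustrative):

sentence = "i think i want to go to the park".split()
WINDOW_SIZE = 2
data = []
for word_index, word in enumerate(sentence):
    start = max(word_index - WINDOW_SIZE, 0)
    window = sentence[start : word_index + WINDOW_SIZE + 1]
    for nb_index, nb_word in enumerate(window, start=start):
        if nb_index != word_index:  # compare positions, not strings
            data.append([word, nb_word])

# self-pairs such as ['i', 'i'] and ['to', 'to'] are kept here;
# with the original `if nb_word != word` comparison they would all be dropped
print([pair for pair in data if pair[0] == pair[1]])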
Hey! Love the tutorial. I had to add this to line 123 (before plt.show())