import tensorflow as tf
import numpy as np

corpus_raw = 'He is the king . The king is royal . She is the royal queen '

# convert to lower case
corpus_raw = corpus_raw.lower()

words = []
for word in corpus_raw.split():
    if word != '.':  # because we don't want to treat . as a word
        words.append(word)

words = set(words)  # so that all duplicate words are removed

word2int = {}
int2word = {}
vocab_size = len(words)  # gives the total number of unique words

for i, word in enumerate(words):
    word2int[word] = i
    int2word[i] = word

# raw_sentences is a list of sentences.
raw_sentences = corpus_raw.split('.')
sentences = []
for sentence in raw_sentences:
    sentences.append(sentence.split())

WINDOW_SIZE = 2

data = []
for sentence in sentences:
    for word_index, word in enumerate(sentence):
        for nb_word in sentence[max(word_index - WINDOW_SIZE, 0) : min(word_index + WINDOW_SIZE, len(sentence)) + 1]:
            if nb_word != word:
                data.append([word, nb_word])

# function to convert numbers to one hot vectors
def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp

x_train = []  # input word
y_train = []  # output word

for data_word in data:
    x_train.append(to_one_hot(word2int[data_word[0]], vocab_size))
    y_train.append(to_one_hot(word2int[data_word[1]], vocab_size))

# convert them to numpy arrays
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

# making placeholders for x_train and y_train
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

EMBEDDING_DIM = 5  # you can choose your own number

W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM]))  # bias
hidden_representation = tf.add(tf.matmul(x, W1), b1)

W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_representation, W2), b2))

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)  # make sure you do this!

# define the loss function:
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))

# define the training step:
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)

n_iters = 10000
# train for n_iters iterations
for _ in range(n_iters):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print('loss is : ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))

vectors = sess.run(W1 + b1)

def euclidean_dist(vec1, vec2):
    return np.sqrt(np.sum((vec1 - vec2) ** 2))

def find_closest(word_index, vectors):
    min_dist = 10000  # to act like positive infinity
    min_index = -1
    query_vector = vectors[word_index]
    for index, vector in enumerate(vectors):
        if euclidean_dist(vector, query_vector) < min_dist and not np.array_equal(vector, query_vector):
            min_dist = euclidean_dist(vector, query_vector)
            min_index = index
    return min_index

from sklearn.manifold import TSNE
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
vectors = model.fit_transform(vectors)

from sklearn import preprocessing
normalizer = preprocessing.Normalizer()
vectors = normalizer.fit_transform(vectors, 'l2')
print(vectors)

import matplotlib.pyplot as plt
fig, ax = plt.subplots()
print(words)
for word in words:
    print(word, vectors[word2int[word]][1])
    ax.annotate(word, (vectors[word2int[word]][0], vectors[word2int[word]][1]))
plt.show()
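The gist defines find_closest but never calls it. A minimal usage sketch, assuming the definitions above (word2int, int2word, vectors) are in scope and that it is run before vectors is overwritten by the 2-D t-SNE projection; the query words are just examples taken from the corpus:

import numpy as np  # already imported above

# For each query word, print the word whose embedding is nearest
# (by euclidean distance) in the learned EMBEDDING_DIM-dimensional space.
for query in ['king', 'queen', 'royal']:
    closest = find_closest(word2int[query], vectors)
    print(query, '->', int2word[closest])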
On Line 35:
if nb_word != word:
I believe this will generate incorrect neighbour pairs, since there are instances where a word might be its own neighbour. e.g. "I think I want to go to the park". Both "I" and "to" have neighbour pairs containing themselves.
Will it though? Since we are taking a window of size 2, won't it just take [i, think] and then [think, i]?
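One way to settle this is to rerun the gist's window loop on the example sentence from the comment above and inspect the generated pairs. A minimal, standalone sketch (the sentence is just the commenter's example):

# Reproduce the gist's pair-generation loop on "I think I want to go to the park".
sentence = "i think i want to go to the park".split()
WINDOW_SIZE = 2

pairs = []
for word_index, word in enumerate(sentence):
    window = sentence[max(word_index - WINDOW_SIZE, 0) : min(word_index + WINDOW_SIZE, len(sentence)) + 1]
    for nb_word in window:
        if nb_word != word:  # compares string values, not token positions
            pairs.append([word, nb_word])

print(pairs)
# Because the check compares values rather than positions, pairs such as
# ['i', 'i'] (the first and third tokens) and ['to', 'to'] are skipped,
# even though those tokens really are neighbours of each other.

So the value-based comparison does drop pairs between repeated words inside a window; comparing token positions instead (e.g. enumerating the window with indices) would keep them while still excluding each token paired with itself.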
@Leothorn, you're suggesting this change because there would be only 5 unique words in the set now? Seems like a good catch...
Let's link to the guide.