Mateusz Bednarski (mbednarski)
import sys
sys.path.append('src')  # make project modules under src/ importable before local imports

import click

from data.preprocess import read_processed_data
from random_forest import RandomForestModel


@click.command()
def main():
    # hypothetical stub: the original snippet is truncated right after the
    # decorator, so a minimal body is added only to keep it syntactically valid
    pass

import numpy as np
import torch
import torch.nn.functional as F
from torch.autograd import Variable

corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]
def tokenize_corpus(corpus):
    # split each sentence on whitespace
    tokens = [x.split() for x in corpus]
    return tokens
tokenized_corpus = tokenize_corpus(corpus)

vocabulary = []
for sentence in tokenized_corpus:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}
vocabulary_size = len(vocabulary)
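# Quick sanity check of the mappings (illustration only: the exact indices
# follow first-occurrence order in the corpus above, so 'he' maps to 0):
print(word2idx['he'])   # 0
print(idx2word[0])      # 'he'
print(vocabulary_size)  # 15 distinct tokens in this toy corpus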
window_size = 2
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    # for each word, treated as center word
    for center_word_pos in range(len(indices)):
        # for each window position
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            # skip positions outside the sentence and the center word itself
            if context_word_pos < 0 or context_word_pos >= len(indices) \
                    or center_word_pos == context_word_pos:
                continue
            idx_pairs.append((indices[center_word_pos], indices[context_word_pos]))

idx_pairs = np.array(idx_pairs)  # each row is a (center, context) index pair
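# Illustration: in the first sentence 'he is a king', the first center word
# 'he' can only look right, producing the pairs (he, is) and (he, a), i.e.
# index pairs (0, 1) and (0, 2) given the mapping built above:
print(idx_pairs[:2])  # [[0 1] [0 2]]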
def get_input_layer(word_idx):
    # one-hot encode a single word index as a float vector
    x = torch.zeros(vocabulary_size).float()
    x[word_idx] = 1.0
    return x
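# Example (again assuming 'he' got index 0): its one-hot vector is all zeros
# except for a single 1.0 at position 0:
print(get_input_layer(word2idx['he']))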
embedding_dims = 5
# W1 projects a one-hot word vector down to its dense embedding;
# W2 projects the embedding back up to vocabulary-sized scores.
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)

# a single forward pass, using one example word ('he') as the one-hot input
x = get_input_layer(word2idx['he'])
z1 = torch.matmul(W1, x)   # hidden layer: the word's embedding
z2 = torch.matmul(W2, z1)  # output scores over the whole vocabulary
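# To read z2 as a distribution over context words, apply a softmax; the log
# variant below is the form that pairs with the NLL loss in the training loop:
log_probs = F.log_softmax(z2, dim=0)  # log P(context word | center word 'he')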
num_epochs = 100
learning_rate = 0.001

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        # forward pass through both layers, then log-softmax over the vocabulary
        log_softmax = F.log_softmax(torch.matmul(W2, torch.matmul(W1, x)), dim=0)
        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        loss_val += loss.item()

        # backpropagate, apply a manual SGD step, and reset the gradients
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data
        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 10 == 0:
        print(f'Loss at epoch {epo}: {loss_val / len(idx_pairs)}')
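# A minimal sketch for inspecting what was learned (assumption: column i of
# W1 is treated as the embedding of word i), via cosine similarity:
def similarity(word_a, word_b):
    v_a = W1[:, word2idx[word_a]]
    v_b = W1[:, word2idx[word_b]]
    return (torch.dot(v_a, v_b) / (torch.norm(v_a) * torch.norm(v_b))).item()

print(similarity('he', 'she'))      # related words should tend to score higher...
print(similarity('he', 'capital'))  # ...than unrelated ones (not guaranteed on a toy corpus)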