Skip to content

Instantly share code, notes, and snippets.

@farizrahman4u
Created February 19, 2019 11:56
Show Gist options
  • Save farizrahman4u/e4e7ad935dc85e7c43b9d21eacb1f42d to your computer and use it in GitHub Desktop.
Save farizrahman4u/e4e7ad935dc85e7c43b9d21eacb1f42d to your computer and use it in GitHub Desktop.
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import json
import random
# Module-level lookup tables. They start out empty; the get_*_idx helpers
# in this file add entries to them when called with add_new=True.
word2idx, idx2word = {}, {}  # word -> index and index -> word
tag2idx, idx2tag = {}, {}  # tag -> index and index -> tag
def get_word(idx):
    """Return the word stored for ``idx``.

    Index 0 is reported as the placeholder ``"<UNK>"``; every other index is
    looked up in ``idx2word`` and raises ``KeyError`` if it was never
    assigned.
    """
    return "<UNK>" if idx == 0 else idx2word[idx]
def get_word_idx(word, add_new=False):
    """Return the integer index of ``word``.

    A word already present in ``word2idx`` yields its stored index. For an
    unseen word the result depends on ``add_new``: when false, 0 is returned
    and nothing is recorded; when true, the word is registered in both
    ``word2idx`` and ``idx2word`` under the next free index and that index is
    returned. New indices are ``len(word2idx) + 2``, so the first registered
    word gets index 2.
    """
    if word in word2idx:
        return word2idx[word]
    if not add_new:
        return 0
    new_idx = len(word2idx) + 2
    word2idx[word] = new_idx
    idx2word[new_idx] = word
    return new_idx
def get_tag(idx):
    """Return the tag registered under ``idx`` in ``idx2tag``.

    Raises ``KeyError`` when ``idx`` has no entry.
    """
    tag = idx2tag[idx]
    return tag
def get_tag_idx(tag, add_new=False):
    """Return the integer index of ``tag``.

    A tag already present in ``tag2idx`` yields its stored index. For an
    unseen tag the result depends on ``add_new``: when false, 0 is returned
    and nothing is recorded; when true, the tag is registered in both
    ``tag2idx`` and ``idx2tag`` under the next free index and that index is
    returned. New indices are ``len(tag2idx) + 1``, so the first registered
    tag gets index 1.
    """
    if tag in tag2idx:
        return tag2idx[tag]
    if not add_new:
        return 0
    new_idx = len(tag2idx) + 1
    tag2idx[tag] = new_idx
    idx2tag[new_idx] = tag
    return new_idx
def vectorize_sentence(sentence):
    '''
    Map every word of a sentence to its integer index, registering words
    that have not been seen before.

    # Arguments:
        sentence: list of string

    # Returns:
        list of int, one index per word, in the same order
    '''
    indices = []
    for token in sentence:
        indices.append(get_word_idx(token, True))
    return indices
def vectorize_tags(tags):
    '''
    Map every tag of a sequence to its integer index, registering tags
    that have not been seen before.

    # Arguments:
        tags: list of string

    # Returns:
        list of int, one index per tag, in the same order
    '''
    indices = []
    for label in tags:
        indices.append(get_tag_idx(label, True))
    return indices
def get_data(test_split=0.1):
    """Load ``data/data.json``, vectorize it and optionally split train/test.

    Every row of the JSON file must unpack into exactly three items: the
    sentence, a second item that is ignored here, and the tags. Sentences and
    tags are converted to integer indices (unseen words/tags are registered
    in the module-level lookup tables), the four lookup tables are written to
    ``idexes.json``, and the samples are shuffled and passed through
    ``pad_sequences(..., padding='post')``.

    # Arguments:
        test_split: fraction of the samples to hold out for testing, or
            None to skip the split.

    # Returns:
        ``(X, Y)`` when ``test_split`` is None, otherwise
        ``((X_train, Y_train), (X_test, Y_test))``.
    """
    with open('data/data.json', 'r') as f:
        rows = json.load(f)
    X = []
    Y = []
    for sentence, _unused, tags in rows:
        X.append(vectorize_sentence(sentence))
        Y.append(vectorize_tags(tags))
    # Save indexes for later use. All four tables are written, in the order
    # word2idx, idx2word, tag2idx, idx2tag (previously idx2tag was dumped
    # twice and idx2word was lost). JSON object keys are always strings, so
    # the integer keys of the idx2* tables come back as str when reloaded.
    with open('idexes.json', 'w') as f:
        json.dump([word2idx, idx2word, tag2idx, idx2tag], f)
    # Shuffle X and Y with one shared permutation so pairs stay aligned.
    idxs = list(range(len(X)))
    random.shuffle(idxs)
    X = [X[i] for i in idxs]
    Y = [Y[i] for i in idxs]
    # padding
    X = pad_sequences(X, padding='post')
    Y = pad_sequences(Y, padding='post')
    if test_split is None:
        return X, Y
    num_test_samples = int(len(X) * test_split)
    # Split on an explicit boundary instead of slicing with
    # ``-num_test_samples``: when that count is 0, ``X[:-0]`` is empty and
    # ``X[-0:]`` is everything, which would swap the roles of the two sets.
    # The clamp keeps an oversized split from producing a negative boundary.
    split_at = max(len(X) - num_test_samples, 0)
    train_data = X[:split_at], Y[:split_at]
    test_data = X[split_at:], Y[split_at:]
    return train_data, test_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment