import ast
import codecs

import numpy as np
from keras.layers import Dense, InputLayer, Embedding, Activation
from keras.models import Sequential
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Each line of data.txt holds one tagged sentence as a Python-literal
# list of (word, tag) pairs, parsed below with ast.literal_eval.
tagged_sentences = codecs.open("../data/data.txt", encoding="utf-8").readlines()
# tagged_sentences = codecs.open("../data/data_lstm.txt", encoding="utf-8").readlines()
print(tagged_sentences[0])
def logits_to_tokens(sequences, index):
    """Map per-token probability vectors back to tag strings via argmax."""
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
        token_sequences.append(token_sequence)
    return token_sequences
def to_categorical(sequences, categories):
    """One-hot encode padded tag-index sequences into (samples, timesteps, categories)."""
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*ast.literal_eval(tagged_sentence))
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))
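# Hold out 20% of the sentences for evaluation.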
(train_sentences,
 test_sentences,
 train_tags,
 test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)
words, tags = set([]), set([])
for s in train_sentences:
    for w in s:
        words.add(w.lower())

for ts in train_tags:
    for t in ts:
        tags.add(t)
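# Reserve index 0 for padding and 1 for out-of-vocabulary words, hence i + 2.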
word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used for padding
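# Encode every sentence as a sequence of word indices and every tag sequence
# as tag indices; words unseen during training map to -OOV-.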
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []
for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    train_sentences_X.append(s_int)

for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_sentences_X.append(s_int)

for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])

for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])
MAX_LENGTH = len(max(train_sentences_X, key=len))
# print(MAX_LENGTH)
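# Pad everything (post) to the longest training sentence so batches are rectangular.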
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
# train_tags_y = keras.utils.to_categorical(train_tags_, len(tag2index))
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')
# test_tags_y = keras.utils.to_categorical(test_tags_, len(tag2index))
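# A simple per-token tagger: embedding -> dense -> softmax over the tag set.
# Dense layers on 3D input act on the last axis, i.e. independently at each
# of the MAX_LENGTH timesteps.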
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH,)))
model.add(Embedding(len(word2index), 128))
model.add(Dense(128))
model.add(Dense(len(tag2index)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])
model.summary()
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=32, epochs=10, validation_split=0.2)
print("Original data from test samples")
print(test_sentences[0])
print(test_tags[0])
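# Note: accuracy is computed over every timestep, including -PAD- positions,
# so the reported score is inflated relative to accuracy on real tokens only.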
scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}") # acc: 98.39311069478103
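# Run the trained model on one held-out sentence end to end.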
test_samples = [
    test_sentences[0]
]
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
predictions = model.predict(test_samples_X)
print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}))
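# A minimal sketch (not in the original gist): line the sample's words up with
# the predicted tags, dropping the -PAD- positions. zip() stops at the shorter
# sequence, so only the sentence's real tokens are printed.
predicted_tags = logits_to_tokens(predictions, {i: t for t, i in tag2index.items()})[0]
for word, tag in zip(test_samples[0], predicted_tags):
    print(word, tag)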