import os

import numpy as np
from keras.layers import Conv1D, Dense, Embedding, Flatten, Input, MaxPooling1D
from keras.models import Model


def get_embedding_layer(vocab_index, learned_embeddings, max_words, embedding_dim, max_seq_len, do_train):
"""" | |
Creates the Embedding layer using learned embedding and word indesx map of all | |
unique words in the text corpus. | |
||Params|| | |
vocab_index : Map of all unique words in your text corpus as keys and their index as values | |
learned_embeddings: Learned embedding representation from GLove/Word2Vec. | |
max_words: Max words to be used from the vocab | |
embedding_dim: Size of the learned embedding (100/300/600) | |
max_seq_len: max length of input text used for training/validation. | |
do_train: Boolean flag to indicate whether the embedding vectors has to be trained/altered. | |
||Return|| | |
embedding_layer: Embedding Layer | |
""" | |
    vocab_size = min(max_words, len(vocab_index))
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in vocab_index.items():
        # Skip indices that fall outside the embedding matrix.
        if i >= vocab_size:
            continue
        embedding_vector = learned_embeddings.get(word)
        if embedding_vector is not None:
            # Words not found in the learned embeddings remain all-zeros.
            embedding_matrix[i] = embedding_vector
    embedding_layer = Embedding(vocab_size,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_seq_len,
                                trainable=do_train)
    return embedding_layer
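
# A minimal usage sketch for get_embedding_layer, assuming pre-trained 100-d
# GloVe vectors and the Keras Tokenizer. The corpus `texts` and the local file
# path "glove.6B.100d.txt" are hypothetical placeholders.
#
# from keras.preprocessing.text import Tokenizer
#
# texts = ["first document ...", "second document ..."]  # hypothetical corpus
# tokenizer = Tokenizer(num_words=20000)
# tokenizer.fit_on_texts(texts)
# vocab_index = tokenizer.word_index  # word -> integer index (1-based)
#
# learned_embeddings = {}
# with open("glove.6B.100d.txt") as f:  # hypothetical path to GloVe vectors
#     for line in f:
#         values = line.split()
#         learned_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
#
# embedding_layer = get_embedding_layer(vocab_index, learned_embeddings,
#                                       max_words=20000, embedding_dim=100,
#                                       max_seq_len=1000, do_train=False)
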
def model_nlp_classify_cnn(max_seq_len, embedding_layer, num_labels):
    """
    Model for NLP classification using a CNN.
    ||Params||
    max_seq_len: Maximum length of the input text used for training/validation.
    embedding_layer: Keras Embedding layer.
    num_labels: Number of labels/categories in the output.
    ||Return||
    model: NLP classification model using a CNN.
    """
    sequence_input = Input(shape=(max_seq_len,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = Conv1D(128, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(35)(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    predictions = Dense(num_labels, activation='softmax')(x)
    model = Model(sequence_input, predictions)
    return model
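
# Quick sketch of building the classifier; the numbers are hypothetical. With
# max_seq_len=1000, the three conv/pool stages (pool sizes 5, 5, 35) reduce the
# sequence to length 1 before Flatten, so shorter inputs need a smaller final pool.
#
# model = model_nlp_classify_cnn(max_seq_len=1000,
#                                embedding_layer=embedding_layer,
#                                num_labels=20)  # 20 is a hypothetical label count
# model.summary()
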
def train_save_model(model, x_train, y_train, x_val, y_val, optimizer='rmsprop', save_path="./"):
    """
    Trains the model and saves the trained model.
    ||Params||
    model: Model for NLP classification.
    x_train, y_train: Training inputs and one-hot encoded labels.
    x_val, y_val: Validation inputs and one-hot encoded labels.
    optimizer: Optimizer algorithm to be used for training the network.
               Defaults to rmsprop; SGD and Adam are a few other options.
    save_path: Path for saving the trained model. Defaults to the current directory.
    """
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['acc'])
    model.fit(x_train, y_train,
              batch_size=128,
              epochs=10,
              validation_data=(x_val, y_val))
    # Save the full model (architecture + weights) to an HDF5 file;
    # the file name 'nlp_cnn_model.h5' is an arbitrary choice.
    model.save(os.path.join(save_path, 'nlp_cnn_model.h5'))
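
# End-to-end sketch, reusing the hypothetical `texts`/`tokenizer` from above and
# a hypothetical list of integer class ids `labels`. pad_sequences and
# to_categorical are the standard Keras helpers for shaping inputs and targets.
#
# from keras.preprocessing.sequence import pad_sequences
# from keras.utils import to_categorical
#
# sequences = tokenizer.texts_to_sequences(texts)
# data = pad_sequences(sequences, maxlen=1000)
# targets = to_categorical(labels)
#
# split = int(0.8 * len(data))
# x_train, y_train = data[:split], targets[:split]
# x_val, y_val = data[split:], targets[split:]
#
# model = model_nlp_classify_cnn(1000, embedding_layer, targets.shape[1])
# train_save_model(model, x_train, y_train, x_val, y_val)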