import os

import numpy as np
from keras.layers import Conv1D, Dense, Embedding, Flatten, Input, MaxPooling1D
from keras.models import Model


def get_embedding_layer(vocab_index, learned_embeddings, max_words, embedding_dim, max_seq_len, do_train):
"""" | |
Creates the Embedding layer using learned embedding and word indesx map of all | |
unique words in the text corpus. | |
||Params|| | |
vocab_index : Map of all unique words in your text corpus as keys and their index as values | |
learned_embeddings: Learned embedding representation from GLove/Word2Vec. | |
max_words: Max words to be used from the vocab | |
embedding_dim: Size of the learned embedding (100/300/600) | |
max_seq_len: max length of input text used for training/validation. | |
do_train: Boolean flag to indicate whether the embedding vectors has to be trained/altered. | |
||Return|| | |
embedding_layer: Embedding Layer | |
""" | |
    vocab_size = min(max_words, len(vocab_index))
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in vocab_index.items():
        # Skip indices that fall outside the embedding matrix.
        if i >= vocab_size:
            continue
        embedding_vector = learned_embeddings.get(word)
        if embedding_vector is not None:
            # Words not found in the learned embeddings remain all-zeros.
            embedding_matrix[i] = embedding_vector
    embedding_layer = Embedding(vocab_size,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_seq_len,
                                trainable=do_train)
    return embedding_layer
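
# A minimal usage sketch for get_embedding_layer, assuming pre-trained 100-d
# GloVe vectors and the Keras Tokenizer. The corpus `texts` and the local file
# path "glove.6B.100d.txt" are hypothetical placeholders.
#
# from keras.preprocessing.text import Tokenizer
#
# texts = ["first document ...", "second document ..."]  # hypothetical corpus
# tokenizer = Tokenizer(num_words=20000)
# tokenizer.fit_on_texts(texts)
# vocab_index = tokenizer.word_index  # word -> integer index (1-based)
#
# learned_embeddings = {}
# with open("glove.6B.100d.txt") as f:  # hypothetical path to GloVe vectors
#     for line in f:
#         values = line.split()
#         learned_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
#
# embedding_layer = get_embedding_layer(vocab_index, learned_embeddings,
#                                       max_words=20000, embedding_dim=100,
#                                       max_seq_len=1000, do_train=False)
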
def model_nlp_classify_cnn(max_seq_len, embedding_layer, num_labels):
    """
    Model for NLP classification using a CNN.
    ||Params||
    max_seq_len: Maximum length of the input text used for training/validation.
    embedding_layer: Keras Embedding layer.
    num_labels: Number of labels/categories in the output.
    ||Return||
    model: NLP classification model using a CNN.
    """
    sequence_input = Input(shape=(max_seq_len,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = Conv1D(128, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(35)(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    predictions = Dense(num_labels, activation='softmax')(x)
    model = Model(sequence_input, predictions)
    return model
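
# Quick sketch of building the classifier; the numbers are hypothetical. With
# max_seq_len=1000, the three conv/pool stages (pool sizes 5, 5, 35) reduce the
# sequence to length 1 before Flatten, so shorter inputs need a smaller final pool.
#
# model = model_nlp_classify_cnn(max_seq_len=1000,
#                                embedding_layer=embedding_layer,
#                                num_labels=20)  # 20 is a hypothetical label count
# model.summary()
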
def train_save_model(model, x_train, y_train, x_val, y_val, optimizer='rmsprop', save_path="./"):
    """
    Trains the model and saves the trained model.
    ||Params||
    model: Model for NLP classification.
    x_train, y_train: Training inputs and one-hot encoded labels.
    x_val, y_val: Validation inputs and one-hot encoded labels.
    optimizer: Optimizer algorithm to be used for training the network.
               Defaults to rmsprop; SGD and Adam are a few other options.
    save_path: Path for saving the trained model. Defaults to the current directory.
    """
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['acc'])
    model.fit(x_train, y_train,
              batch_size=128,
              epochs=10,
              validation_data=(x_val, y_val))
    # Save the full model (architecture + weights) to an HDF5 file;
    # the file name 'nlp_cnn_model.h5' is an arbitrary choice.
    model.save(os.path.join(save_path, 'nlp_cnn_model.h5'))
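
# End-to-end sketch, reusing the hypothetical `texts`/`tokenizer` from above and
# a hypothetical list of integer class ids `labels`. pad_sequences and
# to_categorical are the standard Keras helpers for shaping inputs and targets.
#
# from keras.preprocessing.sequence import pad_sequences
# from keras.utils import to_categorical
#
# sequences = tokenizer.texts_to_sequences(texts)
# data = pad_sequences(sequences, maxlen=1000)
# targets = to_categorical(labels)
#
# split = int(0.8 * len(data))
# x_train, y_train = data[:split], targets[:split]
# x_val, y_val = data[split:], targets[split:]
#
# model = model_nlp_classify_cnn(1000, embedding_layer, targets.shape[1])
# train_save_model(model, x_train, y_train, x_val, y_val)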