import os
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Project-local helpers: config holds the model save path, utils the report/plot functions.
from bad_content import config
from bad_content.utils import show_plot_confusion_matrix, show_classification_report

warnings.filterwarnings("ignore")  # We're outlaws!


def create_embedding_matrix(filepath, word_index, embedding_dim):
    print('Creating embedding matrix from the GloVe vectors.')
    vocab_size = len(word_index) + 1  # Add 1 for the reserved 0 (padding) index.
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix
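# Note on the GloVe file format: each line is a token followed by its vector
# components, e.g. "the 0.418 0.24968 -0.41242 ...", so line.split() above yields
# the word and its floats; words absent from GloVe keep an all-zero embedding row.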


def train(classification_report: bool = False, plot_confusion_matrix_report: bool = False) -> None:
    """For better results while training, play https://www.youtube.com/watch?v=_YYmfM2TfUA as loud as possible."""
    df = pd.read_csv('data/bad_content_clean.csv', encoding='utf-8')
    print(df.head())

    data = df.copy()  # Make a copy of the data.
    print(f'Value Count: {data.spam.value_counts()}')
    # sns.countplot(data['spam'])
    # plt.show()

    X = data['content'].values
    y = data['spam'].values

    X_train: np.ndarray
    X_test: np.ndarray
    y_train: np.ndarray
    y_test: np.ndarray
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    # Prepare the tokenizer.
    t = Tokenizer()
    t.fit_on_texts(X_train)

    # Integer-encode the documents.
    encoded_train = t.texts_to_sequences(X_train)
    encoded_test = t.texts_to_sequences(X_test)
    print(f'encoded_train[0:2]: {encoded_train[0:2]}')

    # Pad documents to a max length of 50 words.
    max_length = 50
    padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
    padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')
    print(f'padded_train: {padded_train}')
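    # padded_train / padded_test are integer arrays of shape (num_samples, max_length);
    # sequences shorter than 50 tokens are zero-padded at the end ('post').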

    vocab_size = len(t.word_index) + 1
    embedding_dim = max_length
    embedding_matrix = create_embedding_matrix(
        f'data/glove.6B/glove.6B.{embedding_dim}d.txt',
        t.word_index,
        embedding_dim,
    )
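    # With embedding_dim tied to max_length (50), this loads data/glove.6B/glove.6B.50d.txt,
    # so every vocabulary word gets a 50-dimensional pretrained vector.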

    def my_model():
        # Define the model as Sequential.
        model = Sequential()

        # The model trains for a number of epochs and stops once it is no longer improving.
        # This is handled by the early stopping callback (https://keras.io/api/callbacks/early_stopping/).
        # Training might run for about 11 or 12 epochs; this varies because of the stochastic
        # nature of the model and of the data splitting
        # (https://machinelearningmastery.com/stochastic-in-machine-learning/).
        model.add(Embedding(vocab_size, embedding_dim, input_length=max_length,
                            weights=[embedding_matrix], trainable=False))
        # model.add(Flatten())
        model.add(GlobalAveragePooling1D())

        # Dense layer sizes are derived from the number of training samples (units must be ints).
        model.add(Dense(X_train.shape[0] // 4, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(X_train.shape[0] // 6, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(X_train.shape[0] // 8, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(X_train.shape[0] // 10, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))

        # Compile the model.
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        # Summarize the model.
        model.summary()

        early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
        # checkpoint = ModelCheckpoint(
        #     'models/weights.{epoch:02d}-{val_loss:.2f}.hdf5',
        #     monitor='val_accuracy',
        #     save_best_only=True,
        #     verbose=1,
        # )

        # Fit the model.
        model.fit(
            x=padded_train,
            y=y_train,
            epochs=100,
            # batch_size=20,
            validation_data=(padded_test, y_test),
            verbose=1,
            # callbacks=[checkpoint, early_stop],
            callbacks=[early_stop],
            use_multiprocessing=True,
        )
        return model
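
    # A recurrent alternative: trainable embeddings (no pretrained GloVe weights)
    # feeding stacked LSTM layers.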
    def lstm_model():
        # LSTM hyperparameters.
        n_lstm = 20
        drop_lstm = 0.2

        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
        model.add(LSTM(n_lstm, dropout=drop_lstm, return_sequences=True))
        # The final LSTM must not return sequences, so the classifier below
        # receives a single vector per sample rather than one per timestep.
        model.add(LSTM(n_lstm, dropout=drop_lstm, return_sequences=False))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Summarize the model.
        model.summary()

        num_epochs = 30
        early_stop = EarlyStopping(monitor='val_loss', patience=2)
        model.fit(
            padded_train,
            y_train,
            epochs=num_epochs,
            validation_data=(padded_test, y_test),
            callbacks=[early_stop],
            verbose=1,
        )
        return model

    def bi_lstm_model():
        # LSTM hyperparameters.
        n_lstm = 20
        drop_lstm = 0.2

        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
        # The bidirectional LSTM is the only recurrent layer, so it must output a
        # single vector per sample (return_sequences=False) for the sigmoid head.
        model.add(Bidirectional(LSTM(n_lstm, dropout=drop_lstm, return_sequences=False)))
        # model.add(Bidirectional(CuDNNLSTM(
        #     units=n_lstm,
        #     dropout=drop_lstm,
        #     return_sequences=True,
        #     recurrent_activation='sigmoid',
        # )))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Summarize the model.
        model.summary()

        num_epochs = 30
        early_stop = EarlyStopping(monitor='val_loss', patience=2)
        model.fit(
            padded_train, y_train, epochs=num_epochs,
            validation_data=(padded_test, y_test),
            callbacks=[early_stop],
            verbose=1,
            use_multiprocessing=True,
        )
        return model
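
    # Only the bidirectional model is trained below; swap in my_model() or
    # lstm_model() above to compare the other architectures.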
    model = bi_lstm_model()
    preds = (model.predict(padded_test) > 0.5).astype("int32")

    if classification_report:
        show_classification_report(y_test, preds)
    if plot_confusion_matrix_report:
        show_plot_confusion_matrix(y_test, preds)

    if not os.path.exists(config.__MODEL_SAVE_PATH):
        os.makedirs(config.__MODEL_SAVE_PATH)

    print(f'Saving model to {config.__MODEL_SAVE_PATH}')
    model.save(config.__MODEL_SAVE_PATH)

    with open(f'{config.__MODEL_SAVE_PATH}/tokenizer.pkl', 'wb') as output:
        pickle.dump(t, output, pickle.HIGHEST_PROTOCOL)
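

# --- Illustrative usage (not part of the original gist) ---
# A minimal sketch of how the model and tokenizer saved by train() might be loaded
# for inference. `predict_spam` is a hypothetical helper name; the save path,
# max_length=50 and the 0.5 decision threshold are taken from the training code above.
from tensorflow.keras.models import load_model


def predict_spam(texts, model_dir=config.__MODEL_SAVE_PATH, max_length=50):
    """Return a 0/1 spam prediction for each text in `texts`."""
    model = load_model(model_dir)
    with open(f'{model_dir}/tokenizer.pkl', 'rb') as fh:
        tokenizer = pickle.load(fh)
    # Apply the same preprocessing as train(): tokenize, then pad to max_length.
    encoded = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return (model.predict(padded) > 0.5).astype("int32")

# Example: predict_spam(['free money, click here!!!'])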
Link to the context behind this gist so I can bookmark it all in one place: https://news.ycombinator.com/item?id=28684764
Thanks for sharing!