#######################################
### -------- Load libraries ------- ###
# Load Huggingface transformers
from transformers import TFBertModel, BertConfig, BertTokenizerFast
# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
# And pandas for data import + sklearn, because you always need sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
#######################################
### --------- Import data --------- ###
# Import data from csv
data = pd.read_csv('dev/Fun with BERT/complaints.csv')
# Select required columns
data = data[['Consumer complaint narrative', 'Product', 'Issue']]
# Remove a row if any of the three remaining columns are missing
data = data.dropna()
# Remove rows where the label appears only once (can't be stratified in the split)
data = data.groupby('Issue').filter(lambda x : len(x) > 1)
data = data.groupby('Product').filter(lambda x : len(x) > 1)
# Set your model output as categorical and save in new label col
data['Issue_label'] = pd.Categorical(data['Issue'])
data['Product_label'] = pd.Categorical(data['Product'])
# Transform your output to numeric
data['Issue'] = data['Issue_label'].cat.codes
data['Product'] = data['Product_label'].cat.codes
# Split into train and test - stratify over Issue
data, data_test = train_test_split(data, test_size = 0.2, stratify = data[['Issue']])
#######################################
### --------- Setup BERT ---------- ###
# Name of the BERT model to use
model_name = 'bert-base-uncased'
# Max length of tokens
max_length = 100
# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False
# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
# Load the Transformers BERT model
transformer_model = TFBertModel.from_pretrained(model_name, config = config)
#######################################
### ------- Build the model ------- ###
# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model
# Load the MainLayer
bert = transformer_model.layers[0]
# Build your model input
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
# attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
# inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
inputs = {'input_ids': input_ids}
# Load the Transformers BERT model as a layer in a Keras model
# ([1] selects the pooled output, i.e. the [CLS] token representation)
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)
# Then build your model output
issue = Dense(units=len(data.Issue_label.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='issue')(pooled_output)
product = Dense(units=len(data.Product_label.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='product')(pooled_output)
outputs = {'issue': issue, 'product': product}
# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')
# Take a look at the model
model.summary()
#######################################
### ------- Train the model ------- ###
# Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# Set loss and metrics
loss = {'issue': CategoricalCrossentropy(from_logits = True), 'product': CategoricalCrossentropy(from_logits = True)}
metric = {'issue': CategoricalAccuracy('accuracy'), 'product': CategoricalAccuracy('accuracy')}
# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss,
    metrics = metric)
# Ready output data for the model
y_issue = to_categorical(data['Issue'])
y_product = to_categorical(data['Product'])
# Tokenize the input (takes some time)
x = tokenizer(
    text=data['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)
# Fit the model
history = model.fit(
    # x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
    x={'input_ids': x['input_ids']},
    y={'issue': y_issue, 'product': y_product},
    validation_split=0.2,
    batch_size=64,
    epochs=10)
#######################################
### ----- Evaluate the model ------ ###
# Ready test data
test_y_issue = to_categorical(data_test['Issue'])
test_y_product = to_categorical(data_test['Product'])
test_x = tokenizer(
    text=data_test['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)
# Run evaluation
model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids']},
    y={'issue': test_y_issue, 'product': test_y_product}
)
Can you upload the model.predict() part? Or show how to apply the model.predict() function for this model to a dataframe of narratives?
Predict is the same as evaluate, but without target data (y) - see: https://www.tensorflow.org/api_docs/python/tf/keras/Model#predict
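For example, a minimal sketch of what that could look like (assuming the tokenizer, model, and max_length from the code above; new_data is just an illustrative name for a dataframe of narratives):

# Tokenize the new narratives exactly like the training data
new_x = tokenizer(
    text=new_data['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True)
# predict returns a dict with one array of logits per named output
predictions = model.predict(x={'input_ids': new_x['input_ids']})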
I think your tutorial is great. But, like other people who asked you to post the prediction code on the towardsdatascience article,
I too could not get predictions, a confusion matrix, or a classification report working.
Below is my code. I appreciate any help; I am a beginner.
predicted_raw = model.predict({'input_ids':x_test['input_ids']})
y_predicted = numpy.argmax(predicted_raw, axis = 1)
The error is here: y_predicted = numpy.argmax(predicted_raw, axis = 1). The error message says "axis 1 is out of bounds for array of dimension 1". When I change axis to zero, the new error message is "Singleton array 0 cannot be considered a valid collection." I think what the axis=0 error says is that y_predicted is null; I double-checked it with an if statement.
Haven't looked at this for a while, so it's not really fresh in my memory, sorry. But maybe have a look at your data filtering process; it looks like you might need to remove some missing values?
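For anyone hitting the same argmax error: because the model above has two named outputs, model.predict() returns a dict ({'issue': ..., 'product': ...}) rather than a single array, so calling numpy.argmax on the dict itself is what raises the dimension error. A minimal sketch of a likely fix (assuming x_test was tokenized like test_x in the code above):

import numpy
predicted_raw = model.predict({'input_ids': x_test['input_ids']})
# Take the argmax per named output, not over the dict
y_predicted_issue = numpy.argmax(predicted_raw['issue'], axis=1)
y_predicted_product = numpy.argmax(predicted_raw['product'], axis=1)
# Map the numeric codes back to the original category names
issue_names = data['Issue_label'].cat.categories[y_predicted_issue]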