#######################################
### -------- Load libraries ------- ###
# Load Huggingface transformers
from transformers import TFBertModel, BertConfig, BertTokenizerFast
# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
# And pandas for data import + sklearn, because you always need sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
#######################################
### --------- Import data --------- ###
# Import data from csv
data = pd.read_csv('dev/Fun with BERT/complaints.csv')
# Select required columns
data = data[['Consumer complaint narrative', 'Product', 'Issue']]
# Remove a row if any of the three remaining columns are missing
data = data.dropna()
# Remove rows where the label appears only once (they can't be split)
data = data.groupby('Issue').filter(lambda x: len(x) > 1)
data = data.groupby('Product').filter(lambda x: len(x) > 1)
# Set your model output as categorical and save in new label col
data['Issue_label'] = pd.Categorical(data['Issue'])
data['Product_label'] = pd.Categorical(data['Product'])
# Transform your output to numeric
data['Issue'] = data['Issue_label'].cat.codes
data['Product'] = data['Product_label'].cat.codes
# Split into train and test - stratify over Issue
data, data_test = train_test_split(data, test_size=0.2, stratify=data[['Issue']])
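# Note (not in the original gist): only Issue is stratified here, so rare
# Product classes can still land entirely in one split - worth keeping in
# mind for the to_categorical calls further down.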
#######################################
### --------- Setup BERT ---------- ###
# Name of the BERT model to use
model_name = 'bert-base-uncased'
# Max length of tokens
max_length = 100
# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False
# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path=model_name, config=config)
# Load the Transformers BERT model
transformer_model = TFBertModel.from_pretrained(model_name, config=config)
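# Optional sanity check (my addition, not part of the original gist):
# tokenize one sample string to confirm the shapes the model will receive.
# encoded = tokenizer(text='This is a test complaint', max_length=max_length,
#                     truncation=True, padding='max_length', return_tensors='tf')
# print(encoded['input_ids'].shape)  # expected: (1, 100)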
#######################################
### ------- Build the model ------- ###
# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model
# Load the MainLayer
bert = transformer_model.layers[0]
# Build your model input
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
# attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
# inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
inputs = {'input_ids': input_ids}
# Load the Transformers BERT model as a layer in a Keras model
# (index 1 of the BERT output is the pooled [CLS] representation)
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)
# Then build your model output - one Dense classification head per label
issue = Dense(units=len(data.Issue_label.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='issue')(pooled_output)
product = Dense(units=len(data.Product_label.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='product')(pooled_output)
outputs = {'issue': issue, 'product': product}
# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')
# Take a look at the model
model.summary()
#######################################
### ------- Train the model ------- ###
# Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# Set loss and metrics
loss = {'issue': CategoricalCrossentropy(from_logits=True), 'product': CategoricalCrossentropy(from_logits=True)}
metric = {'issue': CategoricalAccuracy('accuracy'), 'product': CategoricalAccuracy('accuracy')}
# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric)
# Ready output data for the model
y_issue = to_categorical(data['Issue'])
y_product = to_categorical(data['Product'])
# Tokenize the input (takes some time)
x = tokenizer(
    text=data['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)
# Fit the model
history = model.fit(
    # x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
    x={'input_ids': x['input_ids']},
    y={'issue': y_issue, 'product': y_product},
    validation_split=0.2,
    batch_size=64,
    epochs=10)
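# Note: EarlyStopping is imported above but never used in the gist. A minimal
# way to wire it in (my assumption, not the author's tested setup) would be:
# early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
# and then pass callbacks=[early_stop] to model.fit(...) above.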
#######################################
### ----- Evaluate the model ------ ###
# Ready test data
test_y_issue = to_categorical(data_test['Issue'])
test_y_product = to_categorical(data_test['Product'])
test_x = tokenizer(
    text=data_test['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True)
# Run evaluation
model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids']},
    y={'issue': test_y_issue, 'product': test_y_product})
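# model.evaluate returns a list of scalars: the total loss, then the
# per-output losses and per-output accuracies (see model.metrics_names).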
Wonderful project @emillykkejensen and I appreciate the ease of explanation. I do have a quick question: since we are dealing with a multi-label, multi-class problem here, it is possible that for the issue and product labels the targets and the output layers do not end up with the same number of classes. Following the same pattern as described above, I ran into an issue where the shapes of the labels do not match. Like you, I have not hardcoded the shape of the inputs, so do you have any suggestions to resolve this?
ValueError: Shapes (None, 263) and (None, 265) are incompatible
Haven't looked at this for a while, so it's not really fresh in memory, sorry. But maybe have a look at your data filtering process - it looks like you might need to remove some missing values?
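One other thing worth checking (a guess, I haven't reproduced your error): to_categorical sizes its output from the largest label code it happens to see, so if a split is missing the highest codes, the label matrix comes out narrower than the Dense layer built from value_counts(). Pinning num_classes to the full category count keeps the shapes aligned:
y_issue = to_categorical(data['Issue'], num_classes=len(data['Issue_label'].cat.categories))
y_product = to_categorical(data['Product'], num_classes=len(data['Product_label'].cat.categories))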
Can you upload the model.predict() part? That is, how would you apply model.predict() with this model to a dataframe of narratives?
Predict is the same as evaluate, but without target data (y) - see: https://www.tensorflow.org/api_docs/python/tf/keras/Model#predict
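For example, something along these lines (a sketch reusing the variable names from the gist above, not tested code, and assuming df is your dataframe with a 'Consumer complaint narrative' column). With named outputs, predict should return a dict keyed by output name, so you take the argmax of each output separately and map the codes back to the original string labels:
new_x = tokenizer(
    text=df['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True,
    return_tensors='tf')
predicted = model.predict({'input_ids': new_x['input_ids']})
df['Issue_pred'] = data['Issue_label'].cat.categories[predicted['issue'].argmax(axis=1)]
df['Product_pred'] = data['Product_label'].cat.categories[predicted['product'].argmax(axis=1)]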
I think your tutorial is great. But, like the other people asking you to post the prediction code on the towardsdatascience website, I too could not get predictions, a confusion matrix, or a classification report working. Below is my code. I appreciate any help. I am a beginner.
predicted_raw = model.predict({'input_ids':x_test['input_ids']})
y_predicted = numpy.argmax(predicted_raw, axis = 1)
The error is on this line: y_predicted = numpy.argmax(predicted_raw, axis = 1). The error message says "axis 1 is out of bounds for array of dimension 1". When I change axis to zero, the new error message is "Singleton array 0 cannot be considered a valid collection." I think the axis=0 error means that y_predicted is null; I double-checked that with an if statement.
Predict is the same as evaluate, but without target data (y) - see: https://www.tensorflow.org/api_docs/python/tf/keras/Model#predict
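Also, since this model has two named outputs, model.predict() should return a dict (one array per output) rather than a single array, which would explain the argmax error. A sketch of the fix (untested against your data):
predicted_raw = model.predict({'input_ids': x_test['input_ids']})
y_predicted_issue = numpy.argmax(predicted_raw['issue'], axis=1)
y_predicted_product = numpy.argmax(predicted_raw['product'], axis=1)
From there you can run sklearn's confusion_matrix or classification_report per output against the integer labels.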