Standard Finetuning Steps
# Standard steps to follow for fine-tuning BERT:
# 1. Load the pre-trained model
# 2. Tokenize the input
# 3. Convert the tokens to their index numbers in the BERT vocabulary
# 4. Set all of the model's parameter gradients to zero
# 5. Run the forward pass, calculate the loss, and perform a backward pass to calculate the gradients
# 6. Clip the gradients to 1.0 to help prevent the exploding-gradient problem
# 7. Update the model's parameters
# 8. Update the learning rate
# 9. Clear the calculated gradients

import json, os, time
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences   # padding helper used below
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef, accuracy_score, classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
# 1. Load the pre-trained model
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load pre-trained model (weights)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",           # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2,                # The number of output labels--2 for binary classification.
                                   # You can increase this for multi-class tasks.
    output_attentions = False,     # Whether the model returns attention weights.
    output_hidden_states = False,  # Whether the model returns all hidden-states.
)
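
# NOTE (assumption, not in the original gist): `sentences`, `labels`, `MAX_LEN`, and
# `device` are used below but never defined. A minimal sketch of that setup -- the
# file name and column names ("train.csv", `text`, `label`) are placeholders:
df_train = pd.read_csv("train.csv")            # hypothetical training file
sentences = df_train.text.values               # raw input sentences
labels = df_train.label.values                 # integer class labels (0/1)
MAX_LEN = 64                                   # maximum sequence length after padding (assumption)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")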
# 2. Tokenize the input
# Print the original sentence.
print(' Original: ', sentences[0])
# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))
# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))
# 3. Convert the tokens to their index numbers in the BERT vocabulary
# Mark each of the tokens as belonging to sentence "1".
segments_ids = [1] * len(tokenizer.tokenize(sentences[0]))
# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))
# 4. Set all of the model's parameter gradients to zero
# Load pre-trained model (weights)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
# Tell pytorch to run this model on the GPU.
model.cuda()
# 5. Run the forward pass, calculate the loss, and perform a backward pass to calculate the gradients
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()
# Copy the model to the GPU.
model.to(device)
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
        sent,                      # Sentence to encode.
        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
    )
    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)
# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                          dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)
# Use train_test_split to split our data into train and validation sets for training
# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, random_state=2018, test_size=0.1)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=2018, test_size=0.1)
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
batch_size = 32
# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop,
# with an iterator the entire dataset does not need to be loaded into memory
# The DataLoader needs to know our batch size for training, so we specify it here.
# For fine-tuning BERT on a specific task, the authors recommend between 2 and 4 epochs.
# We chose to run for 4, but we'll see later that this may be over-fitting the training data.
# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
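
# NOTE (assumption, not in the original gist): `optimizer`, `scheduler`, `epochs`, and
# `args` are used below but never defined. A minimal sketch following the usual BERT
# fine-tuning recipe -- the hyperparameter values and the `args` fields are assumptions:
from argparse import Namespace
from transformers import get_linear_schedule_with_warmup

epochs = 4                                       # authors recommend 2-4 epochs
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=2e-5,           # learning rate in the range suggested by the BERT authors
                              eps=1e-8)
total_steps = len(train_dataloader) * epochs     # total number of optimizer steps
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)
args = Namespace(max_seq_length=MAX_LEN,         # used by the prediction/padding code below
                 num_train_epochs=epochs)        # used by the plotting/saving code below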
# 6. Clip the gradients to 1.0. This helps prevent the exploding-gradient problem.
# This training code is based on the `run_glue.py` script from the Hugging Face
# `transformers` examples.
# We'll store a number of quantities such as training and validation loss, validation accuracy, and timings.
training_stats = []
# Measure the total training time for the whole run.
total_t0 = time.time()
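
# Helper functions used below but not defined in the original gist -- these are the
# standard versions used in common BERT fine-tuning tutorials (a minimal sketch):
import datetime

def format_time(elapsed):
    """Format a duration in seconds as an hh:mm:ss string."""
    return str(datetime.timedelta(seconds=int(round(elapsed))))

def flat_accuracy(preds, labels):
    """Compute accuracy from logits and integer labels."""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    # cast to a plain float so the value stays JSON-serializable later
    return float(np.sum(pred_flat == labels_flat) / len(labels_flat))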
# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    # Reset the total loss for this epoch.
    total_train_loss = 0
    # Put the model into training mode. Don't be misled--the call to `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training vs. test.
    model.train()
    model.to(device)

    # For each batch of training data...
    for batch in train_dataloader:
        # Unpack this training batch from our dataloader.
        # As we unpack the batch, we'll also copy each tensor to the GPU using the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we have provided the `labels`.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        # Accumulate the training loss over all of the batches so that we can calculate the average loss at the end.
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0 to prevent "exploding gradients".
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently during evaluation.
    model.eval()
    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Unpack this validation batch from our dataloader.
        # As we unpack the batch, we'll also copy each tensor to the GPU using the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Tell pytorch not to bother with constructing the compute graph during the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            (loss, logits) = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        # Accumulate the validation loss.
        total_eval_loss += loss.item()
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
# Display floats with two decimal places.
pd.set_option('display.precision', 2)
# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)
# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')
# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
# Display the table.
print(df_stats)
# Save the trained model and the associated configuration
# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model
# If we save using the predefined names, we can load using `from_pretrained`
output_dir = './model_save/'
# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
print("Saving model to %s" % output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Good practice: save your training arguments together with the trained model
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
# Load a trained model and vocabulary that you have fine-tuned
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)
# Copy the model to the GPU.
model.to(device)
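
# NOTE (assumption, not in the original gist): `test_dataloader` is used below but never
# constructed. A minimal sketch, assuming `test_inputs`, `test_masks`, and `test_labels`
# were prepared from a held-out test set the same way as the training tensors above:
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)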
# ========================================
# Testing
# ========================================
# After the completion of each training epoch, measure our performance on our test set.
print("")
print("Running Testing...")
t0 = time.time()
# Put the model in evaluation mode--the dropout layers behave differently during evaluation.
model.eval()
# Tracking variables
predictions , true_labels = [], []
# Predict
for batch in test_dataloader:
    # Unpack this test batch from our dataloader.
    # As we unpack the batch, we'll also copy each tensor to the GPU using the `to` method.
    #
    # `batch` contains three pytorch tensors:
    #   [0]: input ids
    #   [1]: attention masks
    #   [2]: labels
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    # Tell pytorch not to bother with constructing the compute graph during the forward pass, since this is only needed for backprop (training).
    with torch.no_grad():
        # Forward pass, calculate logit predictions.
        # token_type_ids is the same as the "segment ids", which
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        (loss, logits) = model(b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask,
                               labels=b_labels)
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)
print(' DONE.')
print("Testing took {:}".format(format_time(time.time() - t0)))
# Combine the results across all batches.
flat_predictions = np.concatenate(predictions, axis=0)
# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)
# Calculate the MCC
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)
print('MCC: {:.3f}'.format(mcc))
# Calculate the accuracy.
accuracy = accuracy_score(flat_true_labels, flat_predictions)
print('Accuracy: {:.3f}'.format(accuracy))
# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(len(flat_true_labels)))
# Create a classification report.
report = classification_report(flat_true_labels, flat_predictions, labels=[0,1], target_names=['negative', 'positive'], output_dict=True)
print(report)
# Save the report
with open('report.json', 'w') as f:
    json.dump(report, f)
# Save the predictions
with open('predictions.json', 'w') as f:
    json.dump(flat_predictions.tolist(), f)
# Save the true labels
with open('true_labels.json', 'w') as f:
    json.dump(flat_true_labels.tolist(), f)
# Save the MCC
with open('mcc.json', 'w') as f:
    json.dump(float(mcc), f)          # cast to a plain float so it is JSON-serializable
# Save the accuracy
with open('accuracy.json', 'w') as f:
    json.dump(float(accuracy), f)     # cast to a plain float so it is JSON-serializable
# Save the training stats
with open('training_stats.json', 'w') as f:
    json.dump(training_stats, f)
# Save the training arguments
with open('training_args.json', 'w') as f:
    json.dump(vars(args), f)          # vars() turns the args namespace into a JSON-serializable dict
# Save the model
model.save_pretrained('model_save')
# Save the tokenizer
tokenizer.save_pretrained('model_save')
# Save the training arguments
torch.save(args, os.path.join('model_save', 'training_args.bin'))
# ========================================
# Plotting
# ========================================
# Plot the training loss and accuracy curves for training and validation
# Set the style
plt.style.use(['seaborn-whitegrid'])
# Increase the plot size and font size.
plt.rcParams['figure.figsize'] = [12, 4]
# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.title("Training loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([i for i in range(0, args.num_train_epochs)])
plt.show()
# Plot the learning curve.
plt.plot(df_stats['Valid. Loss'], 'b-o', label="Validation")
plt.title("Validation loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([i for i in range(0, args.num_train_epochs)])
plt.show()
# Plot the learning curve.
plt.plot(df_stats['Valid. Accur.'], 'b-o', label="Validation")
plt.title("Validation accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.xticks([i for i in range(0, args.num_train_epochs)])
plt.show()
# ========================================
# Prediction
# ========================================
# Load the dataset into a pandas dataframe.
df = pd.read_csv("test.csv")
# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(df.shape[0]))
# Create sentence and label lists
sentences = df.text.values
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
        sent,                      # Sentence to encode.
        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
    )
    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=args.max_seq_length, dtype="long",
                          value=0, truncating="post", padding="post")
# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)
# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
# Set the batch size.
batch_size = 32
# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
# Prediction on test set
print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))
# Put model in evaluation mode
model.eval()
# Tracking variables
predictions , true_labels = [], []
# Predict
for batch in prediction_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask = batch
    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions.
        # This will return the logits rather than the loss because we have
        # not provided labels.
        # token_type_ids is the same as the "segment ids", which
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)
        logits = outputs[0]
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    # Store predictions and true labels
    predictions.append(logits)
print(' DONE.')
print('Positive samples: %d of %d (%.2f%%)' % (df.label.sum(), len(df.label), (df.label.sum() / len(df.label) * 100.0)))
# Combine the results across all batches.
flat_predictions = np.concatenate(predictions, axis=0)
# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
# Save the predictions
with open('predictions.json', 'w') as f:
    json.dump(flat_predictions.tolist(), f)