Standard Finetuning Steps
# Standard steps to follow for fine-tuning BERT:
# 1. Load the pre-trained model
# 2. Tokenize the input
# 3. Convert the tokens to their index numbers in the BERT vocabulary
# 4. Set all of the model's parameter gradients to zero
# 5. Run the forward pass, calculate the loss, and perform a backward pass to calculate the gradients
# 6. Clip the gradients to 1.0 to help prevent the exploding-gradient problem
# 7. Update the model's parameters
# 8. Update the learning rate
# 9. Clear the calculated gradients

import json, os, time
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences   # padding helper used below
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef, accuracy_score, classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
# 1. Load the pre-trained model
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load pre-trained model (weights)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",           # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2,                # The number of output labels--2 for binary classification.
                                   # You can increase this for multi-class tasks.
    output_attentions = False,     # Whether the model returns attention weights.
    output_hidden_states = False,  # Whether the model returns all hidden-states.
)
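
# NOTE (assumption, not in the original gist): `sentences`, `labels`, `MAX_LEN`, and
# `device` are used below but never defined. A minimal sketch of that setup -- the
# file name and column names ("train.csv", `text`, `label`) are placeholders:
df_train = pd.read_csv("train.csv")            # hypothetical training file
sentences = df_train.text.values               # raw input sentences
labels = df_train.label.values                 # integer class labels (0/1)
MAX_LEN = 64                                   # maximum sequence length after padding (assumption)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")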
# 2. Tokenize the input
# Print the original sentence.
print(' Original: ', sentences[0])
# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))
# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))
# 3. Convert the tokens to their index numbers in the BERT vocabulary
# Mark each of the tokens as belonging to sentence "1".
segments_ids = [1] * len(tokenizer.tokenize(sentences[0]))
# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))
# 4. Set all of the model's parameter gradients to zero
# Load pre-trained model (weights)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
# Tell pytorch to run this model on the GPU.
model.cuda()
# 5. Run the forward pass, calculate the loss, and perform a backward pass to calculate the gradients
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()
# Copy the model to the GPU.
model.to(device)
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
        sent,                      # Sentence to encode.
        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
    )
    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)
# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                          dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)
# Use train_test_split to split our data into train and validation sets for training
# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, random_state=2018, test_size=0.1)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=2018, test_size=0.1)
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
batch_size = 32
# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop,
# with an iterator the entire dataset does not need to be loaded into memory
# The DataLoader needs to know our batch size for training, so we specify it here.
# For fine-tuning BERT on a specific task, the authors recommend between 2 and 4 epochs.
# We chose to run for 4, but we'll see later that this may be over-fitting the training data.
# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
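
# NOTE (assumption, not in the original gist): `optimizer`, `scheduler`, `epochs`, and
# `args` are used below but never defined. A minimal sketch following the usual BERT
# fine-tuning recipe -- the hyperparameter values and the `args` fields are assumptions:
from argparse import Namespace
from transformers import get_linear_schedule_with_warmup

epochs = 4                                       # authors recommend 2-4 epochs
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=2e-5,           # learning rate in the range suggested by the BERT authors
                              eps=1e-8)
total_steps = len(train_dataloader) * epochs     # total number of optimizer steps
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)
args = Namespace(max_seq_length=MAX_LEN,         # used by the prediction/padding code below
                 num_train_epochs=epochs)        # used by the plotting/saving code below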
# 6. Clip the gradients to 1.0. This helps prevent the exploding-gradient problem.
# This training code is based on the `run_glue.py` script from the Hugging Face
# `transformers` examples.
# We'll store a number of quantities such as training and validation loss, validation accuracy, and timings.
training_stats = []
# Measure the total training time for the whole run.
total_t0 = time.time()
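
# Helper functions used below but not defined in the original gist -- these are the
# standard versions used in common BERT fine-tuning tutorials (a minimal sketch):
import datetime

def format_time(elapsed):
    """Format a duration in seconds as an hh:mm:ss string."""
    return str(datetime.timedelta(seconds=int(round(elapsed))))

def flat_accuracy(preds, labels):
    """Compute accuracy from logits and integer labels."""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    # cast to a plain float so the value stays JSON-serializable later
    return float(np.sum(pred_flat == labels_flat) / len(labels_flat))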
# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    # Reset the total loss for this epoch.
    total_train_loss = 0
    # Put the model into training mode. Don't be misled--the call to `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training vs. test.
    model.train()
    model.to(device)

    # For each batch of training data...
    for batch in train_dataloader:
        # Unpack this training batch from our dataloader.
        # As we unpack the batch, we'll also copy each tensor to the GPU using the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we have provided the `labels`.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        # Accumulate the training loss over all of the batches so that we can calculate the average loss at the end.
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0 to prevent "exploding gradients".
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently during evaluation.
    model.eval()
    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Unpack this validation batch from our dataloader.
        # As we unpack the batch, we'll also copy each tensor to the GPU using the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Tell pytorch not to bother with constructing the compute graph during the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            (loss, logits) = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        # Accumulate the validation loss.
        total_eval_loss += loss.item()
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
# Display floats with two decimal places.
pd.set_option('display.precision', 2)
# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)
# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')
# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
# Display the table.
print(df_stats)
# Save the trained model and the associated configuration
# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model
# If we save using the predefined names, we can load using `from_pretrained`
output_dir = './model_save/'
# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
print("Saving model to %s" % output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Good practice: save your training arguments together with the trained model
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
# Load a trained model and vocabulary that you have fine-tuned
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)
# Copy the model to the GPU.
model.to(device)
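
# NOTE (assumption, not in the original gist): `test_dataloader` is used below but never
# constructed. A minimal sketch, assuming `test_inputs`, `test_masks`, and `test_labels`
# were prepared from a held-out test set the same way as the training tensors above:
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)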
# ========================================
# Testing
# ========================================
# After the completion of each training epoch, measure our performance on our test set.
print("")
print("Running Testing...")
t0 = time.time()
# Put the model in evaluation mode--the dropout layers behave differently during evaluation.
model.eval()
# Tracking variables
predictions , true_labels = [], []
# Predict
for batch in test_dataloader:
    # Unpack this test batch from our dataloader.
    # As we unpack the batch, we'll also copy each tensor to the GPU using the `to` method.
    #
    # `batch` contains three pytorch tensors:
    #   [0]: input ids
    #   [1]: attention masks
    #   [2]: labels
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    # Tell pytorch not to bother with constructing the compute graph during the forward pass, since this is only needed for backprop (training).
    with torch.no_grad():
        # Forward pass, calculate logit predictions.
        # token_type_ids is the same as the "segment ids", which
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        (loss, logits) = model(b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask,
                               labels=b_labels)
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)
print(' DONE.')
print("Testing took {:}".format(format_time(time.time() - t0)))
# Combine the results across all batches.
flat_predictions = np.concatenate(predictions, axis=0)
# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)
# Calculate the MCC
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)
print('MCC: {:.3f}'.format(mcc))
# Calculate the accuracy.
accuracy = accuracy_score(flat_true_labels, flat_predictions)
print('Accuracy: {:.3f}'.format(accuracy))
# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(len(flat_true_labels)))
# Create a classification report.
report = classification_report(flat_true_labels, flat_predictions, labels=[0,1], target_names=['negative', 'positive'], output_dict=True)
print(report)
# Save the report
with open('report.json', 'w') as f:
    json.dump(report, f)
# Save the predictions
with open('predictions.json', 'w') as f:
    json.dump(flat_predictions.tolist(), f)
# Save the true labels
with open('true_labels.json', 'w') as f:
    json.dump(flat_true_labels.tolist(), f)
# Save the MCC
with open('mcc.json', 'w') as f:
    json.dump(float(mcc), f)          # cast to a plain float so it is JSON-serializable
# Save the accuracy
with open('accuracy.json', 'w') as f:
    json.dump(float(accuracy), f)     # cast to a plain float so it is JSON-serializable
# Save the training stats
with open('training_stats.json', 'w') as f:
    json.dump(training_stats, f)
# Save the training arguments
with open('training_args.json', 'w') as f:
    json.dump(vars(args), f)          # vars() turns the args namespace into a JSON-serializable dict
# Save the model
model.save_pretrained('model_save')
# Save the tokenizer
tokenizer.save_pretrained('model_save')
# Save the training arguments
torch.save(args, os.path.join('model_save', 'training_args.bin'))
# ========================================
# Plotting
# ========================================
# Plot the training loss and accuracy curves for training and validation
# Set the style
plt.style.use(['seaborn-whitegrid'])
# Increase the plot size and font size.
plt.rcParams['figure.figsize'] = [12, 4]
# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.title("Training loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([i for i in range(0, args.num_train_epochs)])
plt.show()
# Plot the learning curve.
plt.plot(df_stats['Valid. Loss'], 'b-o', label="Validation")
plt.title("Validation loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([i for i in range(0, args.num_train_epochs)])
plt.show()
# Plot the learning curve.
plt.plot(df_stats['Valid. Accur.'], 'b-o', label="Validation")
plt.title("Validation accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.xticks([i for i in range(0, args.num_train_epochs)])
plt.show()
# ========================================
# Prediction
# ========================================
# Load the dataset into a pandas dataframe.
df = pd.read_csv("test.csv")
# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(df.shape[0]))
# Create sentence and label lists
sentences = df.text.values
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
        sent,                      # Sentence to encode.
        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
    )
    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=args.max_seq_length, dtype="long",
                          value=0, truncating="post", padding="post")
# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)
# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
# Set the batch size.
batch_size = 32
# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
# Prediction on test set
print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))
# Put model in evaluation mode
model.eval()
# Tracking variables
predictions , true_labels = [], []
# Predict
for batch in prediction_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask = batch
    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions.
        # This will return the logits rather than the loss because we have
        # not provided labels.
        # token_type_ids is the same as the "segment ids", which
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)
        logits = outputs[0]
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    # Store predictions and true labels
    predictions.append(logits)
print(' DONE.')
print('Positive samples: %d of %d (%.2f%%)' % (df.label.sum(), len(df.label), (df.label.sum() / len(df.label) * 100.0)))
# Combine the results across all batches.
flat_predictions = np.concatenate(predictions, axis=0)
# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
# Save the predictions
with open('predictions.json', 'w') as f:
    json.dump(flat_predictions.tolist(), f)