import pandas as pd
import numpy as np

# Create labels and features
def features_labels(df):
    y = list(df["gender"].astype("category").cat.codes)
    X = list(df["general_twitter_text"])
    return y, X

train_labels, train_text = features_labels(df=expanded_train_df)
validation_labels, validation_text = features_labels(df=expanded_validation_df)
test_labels, test_text = features_labels(df=expanded_test_df)
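
# Optional sanity check (a sketch, assuming expanded_train_df is already in memory):
# cat.codes assigns integer labels in alphabetical category order, so printing the
# mapping confirms which integer code corresponds to which gender class.
label_map = dict(enumerate(expanded_train_df["gender"].astype("category").cat.categories))
print(label_map)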
# PyTorch Dataset class
import torch

class TwitterDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Wrap each tokenizer output (input_ids, attention_mask, ...) in a tensor
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
# Model fine-tuning
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification

# Set model and tokenizer
MODEL = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Tokenize each split (truncate long tweets, pad to the longest sequence)
train_encodings = tokenizer(train_text, truncation=True, padding=True)
val_encodings = tokenizer(validation_text, truncation=True, padding=True)
test_encodings = tokenizer(test_text, truncation=True, padding=True)

# Create PyTorch Datasets
train_dataset = TwitterDataset(train_encodings, train_labels)
validation_dataset = TwitterDataset(val_encodings, validation_labels)
test_dataset = TwitterDataset(test_encodings, test_labels)
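
# Quick sanity check (a sketch): one item should contain input_ids, attention_mask
# and a label as tensors, and the dataset length should match the label list.
print(len(train_dataset), train_dataset[0].keys())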
# Fine-tune the model
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=3)

output_dir = r"C:\Users\johna\anaconda3\envs\twitter-env-2\models\albert-base-v2\results"
logging_dir = r"C:\Users\johna\anaconda3\envs\twitter-env-2\models\albert-base-v2\logs"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=6,              # total number of training epochs
    fp16=True,                       # mixed-precision training to save GPU memory
    gradient_accumulation_steps=4,   # accumulate gradients over 4 steps (effective train batch size 64)
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=logging_dir,         # directory for storing logs
    logging_steps=25,
    evaluation_strategy="steps",     # evaluate on the validation set every logging_steps steps
    load_best_model_at_end=True,     # reload the best checkpoint (by eval loss) after training
)
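
# Optional (a sketch, not part of the original run): by default the "best" checkpoint
# is picked by eval loss. To also track accuracy during evaluation, a metrics function
# like the one below could be passed to the Trainer via compute_metrics=compute_metrics.
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}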
trainer = Trainer(
    model=model,                      # the instantiated Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=validation_dataset,  # evaluation dataset
)

trainer.train()
trainer.save_model(r"C:\Users\johna\anaconda3\envs\twitter-env-2\models\albert-base-v2")
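
# Possible follow-up (a sketch, assuming the held-out split should be scored once):
# test_dataset is built above but never used, so run the fine-tuned model over it
# and inspect the metrics and the first few predicted class codes.
test_output = trainer.predict(test_dataset)
test_preds = np.argmax(test_output.predictions, axis=-1)
print(test_output.metrics)   # test loss (plus any compute_metrics values, if supplied)
print(test_preds[:10])       # predicted gender codes for the first ten tweets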