import pandas as pd
import numpy as np
# Create labels and features
def features_labels(df):
    # Encode gender as integer category codes; keep tweet text as a list of strings
    y = list(df["gender"].astype("category").cat.codes)
    X = list(df["general_twitter_text"])
    return y, X
train_labels, train_text = features_labels(df=expanded_train_df)
validation_labels, validation_text = features_labels(df=expanded_validation_df)
test_labels, test_text = features_labels(df=expanded_test_df)
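# Optional sanity check (not in the original gist): cat.codes assigns integers in
# the sorted order of the categories, so printing the mapping once makes clear
# which integer corresponds to which gender class.
label_names = list(expanded_train_df["gender"].astype("category").cat.categories)
print(dict(enumerate(label_names)))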
# PyTorch Dataset class
import torch

class TwitterDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Convert each tokenizer output (input_ids, attention_mask, ...) to a tensor
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
# Model fine-tuning
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
# Set model and tokenizer
MODEL = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Create PyTorch Datasets
train_encodings = tokenizer(train_text, truncation=True, padding=True)
val_encodings = tokenizer(validation_text, truncation=True, padding=True)
test_encodings = tokenizer(test_text, truncation=True, padding=True)
train_dataset = TwitterDataset(train_encodings, train_labels)
validation_dataset = TwitterDataset(val_encodings, validation_labels)
test_dataset = TwitterDataset(test_encodings, test_labels)
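# Optional sanity check (not in the original gist): inspect one encoded example to
# confirm the Dataset returns tensors keyed by input_ids / attention_mask / labels.
sample = train_dataset[0]
print({k: v.shape for k, v in sample.items() if k != "labels"}, sample["labels"])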
# Fine-tune the model: num_labels=3 gives one output per gender category
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=3)
output_dir = r"C:\Users\johna\anaconda3\envs\twitter-env-2\models\albert-base-v2\results"
logging_dir = r"C:\Users\johna\anaconda3\envs\twitter-env-2\models\albert-base-v2\logs"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=6,              # total number of training epochs
    fp16=True,                       # mixed precision to save GPU memory
    gradient_accumulation_steps=4,   # accumulate gradients over 4 steps per optimizer update
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=logging_dir,         # directory for storing logs
    logging_steps=25,
    evaluation_strategy="steps",     # evaluate every eval_steps (defaults to logging_steps)
    load_best_model_at_end=True,     # reload the best checkpoint when training finishes
)
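# Optional addition (not in the original gist): a compute_metrics function can be
# passed to Trainer (via compute_metrics=compute_metrics) so accuracy is reported
# at each evaluation step — a minimal sketch using the np import above.
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}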
trainer = Trainer(
    model=model,                     # the instantiated Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    eval_dataset=validation_dataset, # evaluation dataset
)
trainer.train()
trainer.save_model(r"C:\Users\johna\anaconda3\envs\twitter-env-2\models\albert-base-v2")
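# Optional follow-up (not in the original gist): a minimal sketch of loading the
# saved model for inference with the Transformers pipeline API; the example tweet
# is hypothetical.
from transformers import pipeline

saved_dir = r"C:\Users\johna\anaconda3\envs\twitter-env-2\models\albert-base-v2"
classifier = pipeline("text-classification", model=saved_dir, tokenizer=tokenizer)
print(classifier("just watched the game last night, what a finish!"))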