Custom Dataset in PyTorch from a Pandas DataFrame
import torch
from torch.utils.data import Dataset


class CustomTrainDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # get the row of the DataFrame corresponding to this index
        item = self.df.iloc[idx]
        text = item['text']
        label = item['label']
        # encode the text, padding/truncating to a fixed length of 128 tokens
        encoding = self.tokenizer(text, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
        # remove the batch dimension which the tokenizer automatically adds
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        # add the label under the key "labels", which is what the model's forward pass expects
        encoding["labels"] = torch.tensor(label)
        return encoding
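# Note: torch.tensor(label) above assumes the 'label' column already contains integer
# class ids. If your labels are strings, a minimal sketch (an assumption, not part of
# the original gist) for mapping them to ids before building the dataset:
#
#     label2id = {name: i for i, name in enumerate(sorted(df['label'].unique()))}
#     df['label'] = df['label'].map(label2id)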
from transformers import BertTokenizer
import pandas as pd

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
df = pd.read_csv("path_to_your_csv")
train_dataset = CustomTrainDataset(df=df, tokenizer=tokenizer)

from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
## CHECK
batch = next(iter(train_dataloader))
for k, v in batch.items():
    print(k, v.shape)
# decode the input_ids of the first example of the batch
print(tokenizer.decode(batch['input_ids'][0].tolist()))
import torch
from transformers import BertForSequenceClassification

# Instantiate a pre-trained BERT model with a randomly initialized classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# I almost always use a learning rate of 5e-5 when fine-tuning Transformer-based models
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

# put model on GPU, if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
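# The loop below also references an `eval_dataloader`, which the original snippet never
# defines. A minimal sketch, assuming a held-out validation DataFrame with the same
# 'text' and 'label' columns (loaded from a hypothetical separate CSV, or split off from df):
val_df = pd.read_csv("path_to_your_validation_csv")
eval_dataset = CustomTrainDataset(df=val_df, tokenizer=tokenizer)
eval_dataloader = DataLoader(eval_dataset, batch_size=4, shuffle=False)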
epochs = 3  # number of fine-tuning epochs (not set in the original snippet; adjust as needed)

for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for batch in train_dataloader:
        # put batch on device
        batch = {k: v.to(device) for k, v in batch.items()}
        # forward pass (the model computes the loss itself because the batch contains "labels")
        outputs = model(**batch)
        loss = outputs.loss
        train_loss += loss.item()
        # backward pass and parameter update
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Loss after epoch {epoch}:", train_loss / len(train_dataloader))
    # validation at the end of each epoch
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in eval_dataloader:
            # put batch on device
            batch = {k: v.to(device) for k, v in batch.items()}
            # forward pass
            outputs = model(**batch)
            loss = outputs.loss
            val_loss += loss.item()
    print(f"Validation loss after epoch {epoch}:", val_loss / len(eval_dataloader))