# Source: GitHub Gist by @davidgilbertson, created January 5, 2025 23:42.
# Gist: davidgilbertson/88ec9d89d132e53c657d099274d36ea0
import pandas as pd
from datasets import Dataset
# Requires transformers 4.48
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
DataCollatorWithPadding,
)
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Read the raw messages, then build the train/test splits in one chain:
# the "Target" column becomes a class-encoded "label" column, and the last
# 25% of rows (no shuffling) are held out as the test split.
df = pd.read_csv("messages.csv")
dataset = (
    Dataset.from_pandas(df)
    .rename_column("Target", "label")
    .class_encode_column("label")
    .train_test_split(test_size=0.25, shuffle=False)
)
# Checkpoint that both the tokenizer and the classifier are loaded from.
model_name = "answerdotai/ModernBERT-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the checkpoint with a two-label sequence-classification head and
# move it to the CUDA device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
).to("cuda")
def tokenize_function(examples):
    """Tokenize a batch of rows from the message dataset.

    Args:
        examples: A batch mapping column names to lists of values, as passed
            by ``Dataset.map(..., batched=True)``. Only the ``"Message"``
            column is read.

    Returns:
        The tokenizer output for the batch, with every sequence truncated to
        at most 512 tokens and left unpadded.
    """
    # Padding is deliberately not applied here. The Trainer is given a
    # DataCollatorWithPadding, which pads each batch to its own longest
    # sequence. Padding every row to 512 tokens up front would make that
    # collator a no-op and spend compute on pad tokens for short messages.
    return tokenizer(
        examples["Message"],
        truncation=True,
        max_length=512,
    )


# Tokenize both splits, one batch of rows at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the argmax of the logits over the last axis.

    Returns:
        A dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``; the last three use binary averaging.
    """
    scores, y_true = eval_pred
    y_pred = np.argmax(scores, axis=-1)

    # The fourth returned value (support) is not reported.
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f_score,
        "precision": prec,
        "recall": rec,
    }
# Fine-tuning setup: train for three epochs and evaluate on the held-out
# split at the end of each epoch, writing outputs under ./results.
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./results",
        # `eval_strategy` is the current name of this option; the older
        # `evaluation_strategy` spelling is deprecated.
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    ),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument of
    # Trainer; it is available in the transformers version this script
    # requires (4.48).
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
# Directory that receives both the fine-tuned model and the tokenizer files.
model_save_path = "./saved_model"

# Fine-tune, then run one more evaluation pass and print its metrics.
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)

# Save the model and the tokenizer side by side in the same directory.
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
# (GitHub page footer: sign up or sign in on GitHub to comment on this gist.)