|
# 1. Install dependencies |
|
# ------------------------------------------------------------------------------ |
|
# pip install datasets pandas transformers torch sentencepiece
|
|
|
# 2. Import libraries and modules |
|
# ------------------------------------------------------------------------------ |
|
from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
|
|
|
# 3. Set model, tokenizer, and data_collator variables |
|
# ------------------------------------------------------------------------------ |
|
# See options besides t5-base here: |
|
# https://huggingface.co/docs/transformers/model_doc/t5 |
|
# ------------------------------------------------------------------------------ |
|
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
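
# Optional sanity check (illustrative): round-trip a short string through the
# tokenizer to confirm everything loaded correctly.
sample_ids = tokenizer("translate English to German: hello").input_ids
print(tokenizer.decode(sample_ids, skip_special_tokens=True))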
|
|
|
# 4. Get data and divide into train, eval, and test sets |
|
# ------------------------------------------------------------------------------ |
|
# Replace the dataframe with your own data. Ensure you use "source_text" and |
|
# "target_text" as column names or you'll need to change the code below. |
|
# Also note that we use 80% of the data for training, 10% for evaluation,
|
# and 10% for testing. You can modify this as well. |
|
# ------------------------------------------------------------------------------ |
|
df = pd.DataFrame({"source_text": [], "target_text": []}) |
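# To run this with real data, replace the empty frame above, for example
# (hypothetical file name; the CSV must already contain matching columns):
# df = pd.read_csv("my_data.csv")[["source_text", "target_text"]]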
|
train_df = df.sample(frac=0.8)  # pass random_state=... for a reproducible split
eval_df = df.drop(train_df.index).sample(frac=0.5)
test_df = df.drop(train_df.index).drop(eval_df.index)
|
|
|
# 5. Create a dataset dict from the dataframes |
|
# ------------------------------------------------------------------------------ |
|
dataset = DatasetDict({
    # preserve_index=False stops the sampled pandas index from being carried
    # along as an extra "__index_level_0__" column.
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "eval": Dataset.from_pandas(eval_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False),
})
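
# Optional: print the DatasetDict to confirm the three splits and their sizes.
print(dataset)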
|
|
|
# 6. Tokenize the dataset |
|
# ------------------------------------------------------------------------------ |
|
# You can change the max_length to whatever makes sense for your data. |
|
# ------------------------------------------------------------------------------ |
|
def tokenize(source_texts, target_texts):
    model_inputs = tokenizer(text=source_texts, max_length=512, truncation=True)
    labels = tokenizer(text_target=target_texts, max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(
    tokenize,
    batched=True,  # tokenize whole batches of rows at once for speed
    input_columns=["source_text", "target_text"],
    remove_columns=["source_text", "target_text"],
)
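
# Optional sanity check (illustrative): the mapped splits should now expose
# input_ids, attention_mask, and labels columns instead of the raw text.
print(tokenized_dataset)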
|
|
|
# 7. Set training arguments |
|
# ------------------------------------------------------------------------------ |
|
# Change "output_directory" to your desired output directory. You can also change the |
|
# batch_size, learning_rate, num_train_epochs and other parameters here. See the |
|
# documentation for more details: |
|
# https://huggingface.co/docs/transformers/v4.21.3/en/main_classes/trainer#transformers.TrainingArguments |
|
# ------------------------------------------------------------------------------ |
|
training_arguments = Seq2SeqTrainingArguments(
    output_dir="output_directory",
    learning_rate=0.0001,
    weight_decay=0.01,
    fp16=True,  # mixed precision needs a CUDA GPU; set to False when on CPU
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=20,
    evaluation_strategy="epoch",
    report_to="all",
)
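
# Note: with per_device_train_batch_size=4 and gradient_accumulation_steps=2,
# the effective train batch size is 4 * 2 = 8 per device.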
|
|
|
# 8. Create a trainer |
|
# ------------------------------------------------------------------------------ |
|
trainer = Seq2SeqTrainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
|
|
|
# 9. Train the model |
|
# ------------------------------------------------------------------------------ |
|
trainer.train() |
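
# Optional: score the held-out test split once training finishes. The
# metric_key_prefix argument only renames the logged keys (e.g. "test_loss").
test_metrics = trainer.evaluate(tokenized_dataset["test"], metric_key_prefix="test")
print(test_metrics)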
|
|
|
# 10. Save the tokenizer and model |
|
# ------------------------------------------------------------------------------ |
|
# Change "output_directory" to your desired output directory. Note that you can |
|
# also run evaluation before or after saving the tokenizer and model. |
|
# ------------------------------------------------------------------------------ |
|
tokenizer.save_pretrained("output_directory")
model.save_pretrained("output_directory")
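
# Optional: reload the saved artifacts and generate a prediction. The input
# string is a placeholder; use text shaped like your "source_text" column.
loaded_tokenizer = T5Tokenizer.from_pretrained("output_directory")
loaded_model = T5ForConditionalGeneration.from_pretrained("output_directory")
inputs = loaded_tokenizer("your source text here", return_tensors="pt")
outputs = loaded_model.generate(**inputs, max_length=512)
print(loaded_tokenizer.decode(outputs[0], skip_special_tokens=True))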