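"""Fine-tune a causal language model with the Hugging Face Trainer.

Training runs with PyTorch FSDP (full sharding, auto-wrapped GPT-J blocks),
8-bit AdamW from bitsandbytes, and optional bf16 and gradient checkpointing.
The script expects a pre-tokenized dataset saved with `datasets.Dataset.save_to_disk`
so that `default_data_collator` can batch it directly.
"""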
import argparse
import os

import torch
from datasets import load_from_disk
from huggingface_hub import HfFolder
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)


def parse_args():
    """Parse the command-line arguments."""
    parser = argparse.ArgumentParser()
    # add model id and dataset path argument
    parser.add_argument(
        "--model_id",
        type=str,
        # a decoder-only checkpoint is assumed; the FSDP config below wraps GPTJBlock layers
        default="EleutherAI/gpt-j-6b",
        help="Model id to use for training (must be a causal LM).",
    )
    parser.add_argument("--dataset_path", type=str, default="lm_dataset", help="Path to dataset.")
    # add training hyperparameters for epochs, batch size, learning rate, and seed
    parser.add_argument(
        "--epochs", type=int, default=3, help="Number of epochs to train for."
    )
    parser.add_argument(
        "--max_train_samples", type=int, default=None, help="Number of samples to train on."
    )
    parser.add_argument(
        "--per_device_train_batch_size",
        type=int,
        default=8,
        help="Batch size to use for training.",
    )
    parser.add_argument(
        "--lr", type=float, default=5e-5, help="Learning rate to use for training."
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Seed to use for training."
    )
    parser.add_argument(
        "--gradient_checkpointing",
        # argparse's type=bool treats every non-empty string (including "False") as True,
        # so parse the flag value explicitly
        type=lambda x: str(x).lower() in ("true", "1", "yes"),
        default=True,
        help="Whether to use gradient checkpointing.",
    )
    parser.add_argument(
        "--bf16",
        type=lambda x: str(x).lower() in ("true", "1", "yes"),
        # default to bf16 on Ampere or newer GPUs (compute capability >= 8)
        default=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8,
        help="Whether to use bf16.",
    )
    parser.add_argument(
        "--hf_token",
        type=str,
        default=HfFolder.get_token(),
        help="Token to use for uploading models to Hugging Face Hub.",
    )
    # parse_known_args tolerates unrecognized extra arguments (e.g. ones injected by a launcher)
    args, _ = parser.parse_known_args()
    return args


def training_function(args):
    # set seed
    set_seed(args.seed)

    # load the pre-tokenized dataset from disk
    dataset = load_from_disk(args.dataset_path)
    if args.max_train_samples is not None:
        dataset = dataset.select(range(args.max_train_samples))

    # load tokenizer and model from the hub
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
    model = AutoModelForCausalLM.from_pretrained(
        args.model_id,
        use_cache=not args.gradient_checkpointing,  # the KV cache is incompatible with gradient checkpointing
    )

    # Define training args
    # output_dir = args.repository_id if args.repository_id else args.model_id.split("/")[-1]
    output_dir = args.model_id.split("/")[-1]
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        per_device_train_batch_size=args.per_device_train_batch_size,
        bf16=args.bf16,  # use bf16 if available
        learning_rate=args.lr,
        num_train_epochs=args.epochs,
        gradient_checkpointing=args.gradient_checkpointing,
        gradient_accumulation_steps=1,
        # logging strategies
        logging_dir=f"{output_dir}/logs",
        logging_strategy="steps",
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=2,
        # optim="adafactor",
        optim="adamw_bnb_8bit",
        report_to="tensorboard",
        # fsdp parameters
        fsdp="full_shard auto_wrap",
        fsdp_config={"fsdp_transformer_layer_cls_to_wrap": ["GPTJBlock"]},
    )

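    # "full_shard auto_wrap" shards parameters, gradients, and optimizer states across
    # ranks (ZeRO-3 style) and wraps each GPTJBlock in its own FSDP unit, while
    # optim="adamw_bnb_8bit" keeps optimizer states in 8 bits via bitsandbytes to
    # further reduce per-GPU memory.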
    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=default_data_collator,
        # optimizers=(adam_bnb_optim, None),
    )

    # Start training
    trainer.train()

    # Save the final model and the tokenizer
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)


def main():
    args = parse_args()
    training_function(args)


if __name__ == "__main__":
    main()
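# Example launch (assumptions: this file is saved as run_clm.py and 8 GPUs are available
# on a single node; FSDP needs a distributed launcher such as torchrun):
#
#   torchrun --nproc_per_node=8 run_clm.py \
#       --model_id EleutherAI/gpt-j-6b \
#       --dataset_path lm_dataset \
#       --per_device_train_batch_size 8 \
#       --epochs 3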