Hugging Face Transformers FSDP & bitsandbytes 8-bit AdamW optimizer

process dataset

python process_dataset.py

run training

torchrun --nproc_per_node=4 run_clm_bnb8.py \
  --model_id philschmid/gpt-j-6B-fp16-sharded \
  --dataset_path lm_dataset \
  --max_train_samples 1000 \
  --per_device_train_batch_size 1

Note: using Adafactor, the config above fits on 4x A10G (24 GB) GPUs.
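
For reference, the parts of run_clm_bnb8.py that enable FSDP sharding and select the 8-bit optimizer are the fsdp/fsdp_config and optim arguments of TrainingArguments. A minimal sketch of just those knobs, with values taken from the script below (switch optim to "adafactor" for the variant mentioned in the note):

from transformers import TrainingArguments

# Sketch only: the FSDP- and optimizer-related arguments used in run_clm_bnb8.py.
# bf16 assumes an Ampere-class GPU such as the A10G.
training_args = TrainingArguments(
    output_dir="gpt-j-6B-fp16-sharded",  # the script derives this from the model id
    per_device_train_batch_size=1,
    bf16=True,
    gradient_checkpointing=True,
    optim="adamw_bnb_8bit",  # bitsandbytes 8-bit AdamW; "adafactor" is the alternative noted above
    fsdp="full_shard auto_wrap",  # shard parameters, gradients and optimizer states across GPUs
    fsdp_config={"fsdp_transformer_layer_cls_to_wrap": ["GPTJBlock"]},
)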

process_dataset.py

from datasets import load_dataset
from transformers import AutoTokenizer
from itertools import chain
from functools import partial
# Load Tokenizer
model_id = "philschmid/gpt-j-6B-fp16-sharded"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load dataset from huggingface.co
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
# empty list to save remainder from batches to use in next batch
remainder = {"input_ids": [], "attention_mask": []}
def chunk(sample, chunk_length=2048):
    # define global remainder variable to save remainder from batches to use in next batch
    global remainder
    # Concatenate all texts and add remainder from previous batch
    concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}
    concatenated_examples = {k: remainder[k] + concatenated_examples[k] for k in concatenated_examples.keys()}
    # get total number of tokens for batch
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # get the largest multiple of chunk_length that fits in the batch
    # (integer division yields 0 full chunks for short batches, so everything is carried over)
    batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

    # Split by chunks of chunk_length.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # add remainder to global variable for next batch
    remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result


# tokenize and chunk dataset
lm_dataset = dataset.map(
    lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset.features)
).map(
    partial(chunk, chunk_length=2048),
    batched=True,
)
# Print total number of samples
print(f"Total number of samples: {len(lm_dataset)}")
lm_dataset.save_to_disk("lm_dataset")
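
The chunk function above concatenates all tokenized texts in a batch, cuts the token stream into fixed-length blocks, and carries any leftover tokens over to the next batch through the global remainder. A tiny self-contained sketch of that idea with made-up token ids and a chunk length of 4 (illustrative values only, not part of the gist):

from itertools import chain

remainder = []  # tokens left over from the previous batch

def toy_chunk(batch_of_token_lists, chunk_length=4):
    # concatenate the batch and prepend the previous remainder
    global remainder
    stream = remainder + list(chain(*batch_of_token_lists))
    # keep only full chunks; the tail becomes the next batch's remainder
    usable = (len(stream) // chunk_length) * chunk_length
    chunks = [stream[i : i + chunk_length] for i in range(0, usable, chunk_length)]
    remainder = stream[usable:]
    return chunks

print(toy_chunk([[1, 2, 3], [4, 5, 6, 7, 8, 9]]))  # [[1, 2, 3, 4], [5, 6, 7, 8]]
print(remainder)  # [9] -> prepended to the next batch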
run_clm_bnb8.py

import os
import argparse
import numpy as np
import torch
from datasets import load_from_disk
from huggingface_hub import HfFolder
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
def parse_args():
    """Parse the arguments."""
    parser = argparse.ArgumentParser()
    # add model id and dataset path argument
    parser.add_argument(
        "--model_id",
        type=str,
        default="google/flan-t5-xl",
        help="Model id to use for training.",
    )
    parser.add_argument("--dataset_path", type=str, default="lm_dataset", help="Path to dataset.")
    # add training hyperparameters for epochs, batch size, learning rate, and seed
    parser.add_argument(
        "--epochs", type=int, default=3, help="Number of epochs to train for."
    )
    parser.add_argument(
        "--max_train_samples", type=int, default=None, help="Number of samples to train on."
    )
    parser.add_argument(
        "--per_device_train_batch_size",
        type=int,
        default=8,
        help="Batch size to use for training.",
    )
    parser.add_argument(
        "--lr", type=float, default=5e-5, help="Learning rate to use for training."
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Seed to use for training."
    )
    parser.add_argument(
        "--gradient_checkpointing",
        type=bool,
        default=True,
        help="Whether to use gradient checkpointing.",
    )
    parser.add_argument(
        "--bf16",
        type=bool,
        default=True if torch.cuda.get_device_capability()[0] == 8 else False,
        help="Whether to use bf16.",
    )
    parser.add_argument(
        "--hf_token",
        type=str,
        default=HfFolder.get_token(),
        help="Token to use for uploading models to Hugging Face Hub.",
    )
    args = parser.parse_known_args()
    return args
def training_function(args):
    # set seed
    set_seed(args.seed)

    # load dataset from disk and tokenizer
    dataset = load_from_disk(args.dataset_path)
    if args.max_train_samples is not None:
        dataset = dataset.select(range(args.max_train_samples))
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)

    # load model from the hub
    model = AutoModelForCausalLM.from_pretrained(
        args.model_id,
        use_cache=False if args.gradient_checkpointing else True,  # this is needed for gradient checkpointing
    )

    # Define training args
    output_dir = args.model_id.split("/")[-1]
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        per_device_train_batch_size=args.per_device_train_batch_size,
        bf16=args.bf16,  # Use BF16 if available
        learning_rate=args.lr,
        num_train_epochs=args.epochs,
        gradient_checkpointing=args.gradient_checkpointing,
        gradient_accumulation_steps=1,
        # logging strategies
        logging_dir=f"{output_dir}/logs",
        logging_strategy="steps",
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=2,
        # optim="adafactor",
        optim="adamw_bnb_8bit",
        report_to="tensorboard",
        # fsdp parameters
        fsdp="full_shard auto_wrap",
        fsdp_config={"fsdp_transformer_layer_cls_to_wrap": ["GPTJBlock"]},
    )

    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=default_data_collator,
        # optimizers=(adam_bnb_optim, None),
    )

    # Start training
    trainer.train()

    # Save the tokenizer (model checkpoints are saved by the Trainer via save_strategy)
    tokenizer.save_pretrained(output_dir)
def main():
    args, _ = parse_args()
    training_function(args)


if __name__ == "__main__":
    main()
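
The commented-out optimizers=(adam_bnb_optim, None) line in the Trainer call hints at an alternative to optim="adamw_bnb_8bit": building the bitsandbytes 8-bit AdamW by hand and handing it to the Trainer. A hedged sketch of that variant, assuming bitsandbytes is installed and reusing model, dataset, and training_args from training_function above (a manually created optimizer may not compose with FSDP sharding the same way the built-in optim flag does, which is presumably why the line is commented out):

import bitsandbytes as bnb
from transformers import Trainer, default_data_collator

# 8-bit AdamW over the (unwrapped) model parameters; passing None as the second
# tuple element lets the Trainer create its own learning-rate scheduler.
adam_bnb_optim = bnb.optim.AdamW8bit(model.parameters(), lr=training_args.learning_rate)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=default_data_collator,
    optimizers=(adam_bnb_optim, None),
)
trainer.train()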