# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import deepspeed
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
from accelerate import PartialState
MAX_GPU_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 32
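
# DeepSpeed validates that train_batch_size == train_micro_batch_size_per_gpu
# * gradient_accumulation_steps * world_size, so the configs below
# (32 == 16 * 1 * 2) assume a 2-GPU run.
# ZeRO-2 config: shards optimizer states and gradients across ranks and
# offloads the optimizer state to CPU (hence the `DeepSpeedCPUAdam` swap later on).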
zero2_ds_config = {
    "bf16": {
        "enabled": True
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": True
        },
        "allgather_partitions": True,
        "allgather_bucket_size": 2e8,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": "auto",
        "contiguous_gradients": True
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": 32,
    "train_micro_batch_size_per_gpu": 16,
    "wall_clock_breakdown": False
}
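
# ZeRO-3 config: additionally shards the parameters themselves, and offloads
# both parameters and optimizer state to CPU.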
zero3_ds_config = {
    "bf16": {
        "enabled": True
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": True
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": True
        },
        "overlap_comm": True,
        "contiguous_gradients": True,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": "auto",
        "memory_efficient_linear": False
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "wall_clock_breakdown": False,
    "train_batch_size": 32,
    "train_micro_batch_size_per_gpu": 16
}

def get_dataloaders(batch_size: int = 16, model_name: str = "bert-base-cased"):
    """
    Creates a set of `DataLoader`s for the `glue` dataset.

    Args:
        batch_size (`int`, *optional*):
            The batch size for the train DataLoader (the validation DataLoader uses `EVAL_BATCH_SIZE`).
        model_name (`str`, *optional*):
            The name or path of the pretrained model whose tokenizer is used.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    datasets = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
        return outputs

    # Apply the method we just defined to all the examples in all the splits of the dataset
    tokenized_datasets = datasets.map(
        tokenize_function, batched=True, remove_columns=["idx", "sentence1", "sentence2"], load_from_cache_file=False
    )
    # We also rename the 'label' column to 'labels', which is the column name the models of the
    # transformers library expect
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    def collate_fn(examples):
        return tokenizer.pad(examples, padding="longest", return_tensors="pt")

    # Instantiate dataloaders.
    train_dataloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
    )
    return train_dataloader, eval_dataloader
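
# Note: each batch yielded by these dataloaders is a dict-like `BatchEncoding` of padded
# tensors (`input_ids`, `token_type_ids`, `attention_mask`, `labels`) that can be unpacked
# straight into the model with `model(**batch)`.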

def multiple_model_training(model_name_or_path):
    # This will essentially be like a k-fold setup, but one model uses ZeRO-2 and the other ZeRO-3
    num_epochs = 2
    lr = 2e-5
    batch_size = 16
    zero2_model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
    train_dataloader, _ = get_dataloaders(batch_size=batch_size, model_name=model_name_or_path)
    max_training_steps = len(train_dataloader) * num_epochs
    zero2_optimizer = AdamW(zero2_model.parameters(), lr=lr)
    # Then swap in DeepSpeed's CPU Adam, since the optimizer state is offloaded to CPU
    from deepspeed.ops.adam import DeepSpeedCPUAdam

    defaults = {k: v for k, v in zero2_optimizer.defaults.items() if k in ["lr", "weight_decay"]}
    zero2_optimizer = DeepSpeedCPUAdam(zero2_model.parameters(), **defaults)
    zero2_lr_scheduler = get_linear_schedule_with_warmup(
        zero2_optimizer, num_warmup_steps=0, num_training_steps=max_training_steps
    )
    zero2_model, zero2_optimizer, _, zero2_lr_scheduler = deepspeed.initialize(
        model=zero2_model,
        model_parameters=zero2_model.parameters(),
        config=zero2_ds_config,
        optimizer=zero2_optimizer,
        lr_scheduler=zero2_lr_scheduler,
    )
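    # Note: `deepspeed.initialize` returns `(engine, optimizer, dataloader, lr_scheduler)`;
    # the returned engine wraps the model, and its `backward()`/`step()` methods replace the
    # usual `loss.backward()` / `optimizer.step()` calls in the training loop further below.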
    # Now that we have DS, get the device
    state = PartialState()
    device = state.device
    # By default the second engine reuses the ZeRO-2 config; uncomment the lines below to
    # use ZeRO-3 instead
    zero3_config = zero2_ds_config
    # Manually enable ZeRO-3 in the env (requires `from transformers.integrations import HfDeepSpeedConfig`)
    # zero3_config = HfDeepSpeedConfig(zero3_ds_config)
    # zero3_config.config["train_micro_batch_size_per_gpu"] = zero2_ds_config["train_micro_batch_size_per_gpu"]
    zero3_model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
    zero3_optimizer = AdamW(zero3_model.parameters(), lr=lr)
    defaults = {k: v for k, v in zero3_optimizer.defaults.items() if k in ["lr", "weight_decay"]}
    zero3_optimizer = DeepSpeedCPUAdam(zero3_model.parameters(), **defaults)
    zero3_lr_scheduler = get_linear_schedule_with_warmup(
        zero3_optimizer, num_warmup_steps=0, num_training_steps=max_training_steps
    )
    zero3_model, zero3_optimizer, _, zero3_lr_scheduler = deepspeed.initialize(
        model=zero3_model,
        model_parameters=zero3_model.parameters(),
        config=zero3_config,
        optimizer=zero3_optimizer,
        lr_scheduler=zero3_lr_scheduler,
    )
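    # Both engines are driven by the same averaged loss below: the first `backward` call
    # passes `retain_graph=True` because the same loss tensor is backpropagated a second
    # time by the other engine, and each engine then runs its own `step()`.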
    for epoch in range(num_epochs):
        zero2_model.train()
        zero3_model.train()
        for step, batch in enumerate(train_dataloader):
            batch = batch.to(device)
            outputs_1 = zero2_model(**batch)
            outputs_2 = zero3_model(**batch)
            # Average the two losses (parentheses matter: without them this would be
            # `loss_1 + loss_2 / 2` rather than the mean)
            loss = (outputs_1.loss + outputs_2.loss) / 2
            zero2_model.backward(loss, retain_graph=True)
            zero3_model.backward(loss)
            zero2_model.step()
            zero3_model.step()

def main():
    parser = argparse.ArgumentParser(
        description="Simple example of training two DeepSpeed-wrapped models side by side."
    )
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        default="bert-base-cased",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=False,
    )
    args = parser.parse_args()
    multiple_model_training(args.model_name_or_path)


if __name__ == "__main__":
    main()
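
# Minimal launch sketch (assuming a single 2-GPU node, to match the configs'
# `train_batch_size=32` with `train_micro_batch_size_per_gpu=16`; the filename is a
# placeholder for wherever this script is saved):
#   deepspeed --num_gpus 2 multi_model_deepspeed.py --model_name_or_path bert-base-cased
# Launching with `accelerate launch --num_processes 2 multi_model_deepspeed.py` should
# also work, since `deepspeed.initialize` picks up the distributed env vars it sets.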