Forked from AmgadHasan/gist:72b06cb8adc2d2217cca8c6790858685
Created August 14, 2023 21:53
import os
import time
import argparse

start = time.time()
os.system("nvidia-smi")

# import ML libraries
import torch
import transformers
from datasets import load_dataset, Dataset
from trl import SFTTrainer
import pandas as pd
def training_function(args):
    lr = args.lr
    num_epochs = args.num_epochs
    seed = args.seed
    transformers.set_seed(seed)

    # log the per-GPU memory budget (free VRAM minus a 2 GB safety margin);
    # this value is only printed here, not passed to the model loader
    free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)
    max_memory = f"{free_in_GB-2}GB"
    n_gpus = torch.cuda.device_count()
    max_memory = {i: max_memory for i in range(n_gpus)}
    print("Available VRAM per GPU: ", max_memory)

    # load the CSV dataset, shuffle it, and use the first 140 rows for training
    pandas_dataset_stable_diffusion = pd.read_csv(args.dataset)
    pandas_dataset_stable_diffusion = pandas_dataset_stable_diffusion.sample(frac=1)
    dataset_stable_diffusion_train = Dataset.from_pandas(
        pandas_dataset_stable_diffusion.iloc[0:140, :]
    )
    # keep only the "prompt" and "response" columns
    dataset_stable_diffusion_train = dataset_stable_diffusion_train.remove_columns(
        [
            col
            for col in dataset_stable_diffusion_train.column_names
            if col not in ["prompt", "response"]
        ]
    )
    print("Example from the train dataset:")
    print(dataset_stable_diffusion_train)
    print(dataset_stable_diffusion_train[0])

    print("Final train dataset:")
    train_dataset = dataset_stable_diffusion_train.shuffle(seed=43)
    print(train_dataset)
    print(train_dataset[0])
    print(train_dataset[-1])

    # the remaining rows (index 140 onward) become the eval split
    dataset_stable_diffusion_eval = Dataset.from_pandas(
        pandas_dataset_stable_diffusion.iloc[140:, :]
    )
    # keep only the "prompt" and "response" columns
    dataset_stable_diffusion_eval = dataset_stable_diffusion_eval.remove_columns(
        [
            col
            for col in dataset_stable_diffusion_eval.column_names
            if col not in ["prompt", "response"]
        ]
    )
    print("Example from the eval dataset:")
    print(dataset_stable_diffusion_eval)
    print(dataset_stable_diffusion_eval[0])

    print("Final eval dataset:")
    eval_dataset = dataset_stable_diffusion_eval.shuffle(seed=43)
    print(eval_dataset)
    print(eval_dataset[0])
    print(eval_dataset[-1])

    # format the dataset for instruction fine-tuning: each example becomes a single
    # string of the prompt followed by the response on the next line
    def formatting_prompts_func(dataset):
        instructions = []
        for i in range(len(dataset["prompt"])):
            text = f"{dataset['prompt'][i]}\n{dataset['response'][i]}"
            instructions.append(text)
        return instructions
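    # For illustration, a row such as {"prompt": "a castle at sunset", "response": "a majestic castle ..."}
    # (hypothetical values) would be formatted as the single training string:
    #   "a castle at sunset\na majestic castle ..."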
""" | |
## Loading the model | |
In this section we will load the [MPT-7B model](https://huggingface.co/mosaicml/mpt-7b). | |
""" | |
# load assets | |
model_id = args.model | |
print(type(model_id)) | |
# mpt tokenizer load | |
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) | |
# set mpt tokenizer padding token to eos token | |
tokenizer.pad_token = tokenizer.eos_token | |
tokenizer.pad_token_id = tokenizer.eos_token_id | |
print(f"{model_id} tokenizer eos_token: ", tokenizer.eos_token) | |
print(f"{model_id} tokenizer pad_token: ", tokenizer.pad_token) | |
print(f"{model_id} tokenizer model_max_length: ", tokenizer.model_max_length) | |
model = transformers.AutoModelForCausalLM.from_pretrained( | |
model_id, | |
torch_dtype=torch.bfloat16, | |
trust_remote_code=True, | |
device_map="auto", | |
) | |
free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3) | |
max_memory = f"{free_in_GB-2}GB" | |
n_gpus = torch.cuda.device_count() | |
max_memory = {i: max_memory for i in range(n_gpus)} | |
print("Max VRAM (GB): ", max_memory) | |
""" | |
## Loading the trainer | |
Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets. Let's first load the training arguments below. | |
from transformers import TrainingArguments | |
# see https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments | |
""" | |
output_dir = "./results" | |
num_train_epochs = num_epochs | |
auto_find_batch_size = False | |
per_device_train_batch_size=1 | |
per_device_eval_batch_size=1 | |
gradient_accumulation_steps = 1 | |
save_strategy = "epoch" | |
learning_rate = lr | |
lr_scheduler_type = "linear" | |
warmup_ratio = 0.03 | |
logging_strategy = "steps" | |
logging_steps = 50 | |
do_eval = True | |
evaluation_strategy = "steps" | |
prediction_loss_only = True | |
eval_steps = 0.2 | |
training_arguments = transformers.TrainingArguments( | |
output_dir=output_dir, | |
num_train_epochs=num_train_epochs, | |
auto_find_batch_size=auto_find_batch_size, | |
gradient_accumulation_steps=gradient_accumulation_steps, | |
save_strategy=save_strategy, | |
learning_rate=learning_rate, | |
lr_scheduler_type=lr_scheduler_type, | |
warmup_ratio=warmup_ratio, | |
logging_strategy=logging_strategy, | |
logging_steps=logging_steps, | |
do_eval=do_eval, | |
evaluation_strategy=evaluation_strategy, | |
prediction_loss_only=prediction_loss_only, | |
eval_steps=eval_steps, | |
) | |
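    # Effective batch size per optimizer step is
    # per_device_train_batch_size * gradient_accumulation_steps (times the number of
    # data-parallel workers, if any) = 1 * 1 here.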
""" | |
Then finally pass everything to the trainer | |
""" | |
max_seq_length = tokenizer.model_max_length | |
trainer = SFTTrainer( | |
model=model, | |
train_dataset=train_dataset, | |
eval_dataset=eval_dataset, | |
formatting_func=formatting_prompts_func, | |
max_seq_length=max_seq_length, | |
tokenizer=tokenizer, | |
args=training_arguments, | |
) | |
""" | |
## Train the model | |
Now let's train the model! Simply call `trainer.train()` | |
""" | |
trainer.train() | |
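    # Optionally, the final weights and tokenizer could also be persisted explicitly
    # (a sketch; save_strategy="epoch" already writes checkpoints to output_dir):
    # trainer.save_model(output_dir)
    # tokenizer.save_pretrained(output_dir)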
    # finished: print remaining GPU memory and total wall-clock time
    free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)
    max_memory = f"{free_in_GB-2}GB"
    n_gpus = torch.cuda.device_count()
    max_memory = {i: max_memory for i in range(n_gpus)}
    print("Available VRAM per GPU: ", max_memory)
    end = time.time()
    print("Total time (sec): ", end - start)
def main():
    parser = argparse.ArgumentParser(
        description="Simple example of a single-GPU training script."
    )
    parser.add_argument(
        "--model",
        type=str,
        help="Path to a local model folder or a Hugging Face repository id.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        help="Path to the CSV dataset with 'prompt' and 'response' columns.",
    )
    parser.add_argument(
        "--lr",
        type=float,
        default=2e-5,
        help="Learning rate for training.",
    )
    parser.add_argument(
        "--num_epochs",
        type=int,
        default=1,
        help="Number of training epochs.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=43,
        help="Random seed.",
    )
    args = parser.parse_args()
    print(f"Training args: {args}")
    training_function(args)


if __name__ == "__main__":
    main()
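# Example invocation (the script filename and dataset path are placeholders):
#   python train_sft.py --model mosaicml/mpt-7b --dataset prompts.csv --lr 2e-5 --num_epochs 1 --seed 43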