Last active
September 22, 2023 11:54
-
-
Save acalatrava/1ff41115c2372668dc1cda41ee78f3af to your computer and use it in GitHub Desktop.
QLORA FineTuning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This source code is licensed under the MIT license found in the | |
# LICENSE file in the root directory of this source tree. | |
# | |
# Original file https://github.com/jzhang38/TinyLlama/blob/main/sft/finetune.py | |
# | |
""" | |
pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu | |
pip install accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 | |
pip install scipy evaluate tqdm pandas packaging | |
Usage: | |
CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch --multi_gpu --num_processes 4 --main_process_port 1234 finetune.py \ | |
--model_name_or_path PY007/TinyLlama-1.1B-intermediate-step-240k-503b \ | |
--output_dir ./output/503B_FT_lr1e-5_ep5_top1_2023-08-25 \ | |
--logging_steps 10 \ | |
--save_strategy steps \ | |
--data_seed 42 \ | |
--save_total_limit 6 \ | |
--evaluation_strategy epoch \ | |
--eval_dataset_size 512 \ | |
--max_eval_samples 1000 \ | |
--per_device_eval_batch_size 1 \ | |
--max_new_tokens 32 \ | |
--dataloader_num_workers 3 \ | |
--group_by_length=False \ | |
--logging_strategy steps \ | |
--remove_unused_columns False \ | |
--do_train \ | |
--do_eval \ | |
--warmup_ratio 0.05 \ | |
--lr_scheduler_type constant \ | |
--dataset OpenAssistant/oasst_top1_2023-08-25 \ | |
--dataset_format oasst1 \ | |
--source_max_len 16 \ | |
--target_max_len 512 \ | |
--per_device_train_batch_size 4 \ | |
--max_steps 0 \ | |
--num_train_epochs 5 \ | |
--learning_rate 1e-5 \ | |
--adam_beta2 0.999 \ | |
--max_grad_norm 1.0 \ | |
--weight_decay 0.0 \ | |
--seed 0 \ | |
--trust_remote_code \ | |
--report_to wandb | |
""" | |
from collections import defaultdict | |
import copy | |
import json | |
import os | |
from os.path import exists, join, isdir | |
from dataclasses import dataclass, field | |
import sys | |
from typing import Optional, Dict, Sequence | |
import numpy as np | |
from tqdm import tqdm | |
import logging | |
import pandas as pd | |
import importlib | |
from packaging import version | |
from packaging.version import parse | |
import torch | |
import transformers | |
from torch.nn.utils.rnn import pad_sequence | |
import argparse | |
from transformers import ( | |
AutoTokenizer, | |
AutoModelForCausalLM, | |
set_seed, | |
Seq2SeqTrainer, | |
BitsAndBytesConfig, | |
LlamaTokenizer | |
) | |
from datasets import load_dataset, Dataset | |
import evaluate | |
from trl import SFTTrainer | |
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR | |
# Switch on TF32 matmul kernels, but only when a CUDA device is present.
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True

logger = logging.getLogger(__name__)

# Label id the collator writes over source tokens and label padding.
IGNORE_INDEX = -100
# Pad token string registered on tokenizers that ship without one.
DEFAULT_PAD_TOKEN = "[PAD]"
@dataclass
class ModelArguments:
    """Command-line options selecting which pretrained model is loaded."""

    # Hub id or local path passed to `from_pretrained`.
    model_name_or_path: Optional[str] = field(default="EleutherAI/pythia-12b")
    # Forwarded as `trust_remote_code` to the model and tokenizer loaders.
    trust_remote_code: Optional[bool] = field(
        metadata={
            "help": "Enable unpickling of arbitrary code in AutoModelForCausalLM#from_pretrained."
        },
        default=False,
    )
@dataclass
class DataArguments:
    """Command-line options describing the dataset and how it is truncated.

    The `dataset_format` help text enumerates every format that
    `format_dataset` (inside `make_data_module`) actually handles.
    """

    eval_dataset_size: int = field(
        default=1024, metadata={"help": "Size of validation dataset."}
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    source_max_len: int = field(
        default=1024,
        metadata={"help": "Maximum source sequence length. Sequences will be right padded (and possibly truncated)."},
    )
    target_max_len: int = field(
        default=256,
        metadata={"help": "Maximum target sequence length. Sequences will be right padded (and possibly truncated)."},
    )
    dataset: str = field(
        default='alpaca',
        metadata={"help": "Which dataset to finetune on. See datamodule for options."}
    )
    dataset_format: Optional[str] = field(
        default=None,
        metadata={
            "help": "Which dataset format is used. "
            "[alpaca|alpaca-clean|chip2|self-instruct|hh-rlhf|oasst1|"
            "squad_es_chatml|squad_en_chatml|input-output]"
        }
    )
@dataclass
class TrainingArguments(transformers.Seq2SeqTrainingArguments):
    """Training options layered on top of `transformers.Seq2SeqTrainingArguments`.

    Adds `train_on_source` and `use_4_bit`, and re-declares a number of
    inherited fields with this script's own defaults.
    """

    train_on_source: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to train on the input in addition to the target text."}
    )
    report_to: str = field(
        default='none',
        metadata={"help": "To use wandb or something else for reporting."}
    )
    output_dir: str = field(default='./output', metadata={"help": 'The output dir for logs and checkpoints'})
    optim: str = field(default='adamw_torch', metadata={"help": 'The optimizer to be used'})
    per_device_train_batch_size: int = field(default=16, metadata={"help": 'The training batch size per GPU. Increase for better speed.'})
    gradient_accumulation_steps: int = field(default=1, metadata={"help": 'How many gradients to accumulate before performing an optimizer step'})
    max_steps: int = field(default=10000, metadata={"help": 'How many optimizer update steps to take'})
    weight_decay: float = field(default=0.0, metadata={"help": 'The L2 weight decay rate of AdamW'})
    learning_rate: float = field(default=0.0002, metadata={"help": 'The learning rate'})
    remove_unused_columns: bool = field(default=False, metadata={"help": 'Removed unused columns. Needed to make this codebase work.'})
    max_grad_norm: float = field(default=0.3, metadata={"help": 'Gradient clipping max norm. This is tuned and works well for all models tested.'})
    gradient_checkpointing: bool = field(default=True, metadata={"help": 'Use gradient checkpointing. You want to use this.'})
    do_train: bool = field(default=True, metadata={"help": 'To train or not to train, that is the question?'})
    lr_scheduler_type: str = field(default='constant', metadata={"help": 'Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis'})
    warmup_ratio: float = field(default=0.03, metadata={"help": 'Fraction of steps to do a warmup for'})
    logging_steps: int = field(default=10, metadata={"help": 'The frequency of update steps after which to log the loss'})
    group_by_length: bool = field(default=True, metadata={"help": 'Group sequences into batches with same length. Saves memory and speeds up training considerably.'})
    save_strategy: str = field(default='steps', metadata={"help": 'When to save checkpoints'})
    save_steps: int = field(default=250, metadata={"help": 'How often to save a model'})
    save_total_limit: int = field(default=40, metadata={"help": 'How many checkpoints to save before the oldest is overwritten'})
    # When set, the model is loaded 4-bit quantized and training goes through SFTTrainer.
    use_4_bit: bool = field(default=False, metadata={"help": 'Load the model in 4bit format to save RAM'})
@dataclass
class GenerationArguments:
    """Generation hyperparameters, later unpacked into `transformers.GenerationConfig`.

    For more hyperparameters check:
    https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
    """

    # Length arguments
    max_new_tokens: Optional[int] = field(
        default=256,
        # The trailing space matters: adjacent literals are concatenated
        # verbatim, and without it the help text read "loopsif".
        metadata={"help": "Maximum number of new tokens to be generated in evaluation or prediction loops "
                          "if predict_with_generate is set."}
    )
    min_new_tokens : Optional[int] = field(
        default=None,
        metadata={"help": "Minimum number of new tokens to generate."}
    )

    # Generation strategy
    do_sample: Optional[bool] = field(default=False)
    num_beams: Optional[int] = field(default=1)
    num_beam_groups: Optional[int] = field(default=1)
    penalty_alpha: Optional[float] = field(default=None)
    use_cache: Optional[bool] = field(default=True)

    # Hyperparameters for logit manipulation
    temperature: Optional[float] = field(default=1.0)
    top_k: Optional[int] = field(default=50)
    top_p: Optional[float] = field(default=1.0)
    typical_p: Optional[float] = field(default=1.0)
    diversity_penalty: Optional[float] = field(default=0.0)
    repetition_penalty: Optional[float] = field(default=1.0)
    length_penalty: Optional[float] = field(default=1.0)
    no_repeat_ngram_size: Optional[int] = field(default=0)
def get_accelerate_model(args, checkpoint_dir):
    """Load the base causal LM and its tokenizer.

    Args:
        args: namespace providing `model_name_or_path`, `trust_remote_code`,
            `use_4_bit` and `dataset`.
        checkpoint_dir: not used by this function; kept so existing call
            sites keep working.

    Returns:
        A `(model, tokenizer)` tuple. When the tokenizer has no pad token,
        `DEFAULT_PAD_TOKEN` is added (plus the ChatML markers for the
        datasets listed below) and the model embeddings are resized.
    """
    # if we are in a distributed setting, pin the whole model to this
    # process' local rank; otherwise let the loader place it ("auto").
    local_rank = os.environ.get('LOCAL_RANK')
    device_map = "auto" if local_rank is None else {'': int(local_rank)}

    load_kwargs = dict(
        device_map=device_map,
        trust_remote_code=args.trust_remote_code,
    )
    # Quantize?
    if args.use_4_bit:
        load_kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=False,
        )

    print(f'loading base model {args.model_name_or_path}...')
    model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, **load_kwargs)

    # Tokenizer
    # NOTE(review): an earlier comment here said the fast tokenizer was
    # "giving issues", yet the fast tokenizer is what gets requested —
    # confirm which one is intended.
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        padding_side="right",
        use_fast=True,
        trust_remote_code=args.trust_remote_code,
    )

    # Use the public accessor instead of the private `_pad_token` attribute.
    if tokenizer.pad_token is None:
        non_special_tokens = []
        if args.dataset == "OpenAssistant/oasst_top1_2023-08-25" or args.dataset == "squad_es":
            # ChatML turn markers used by these datasets' prompt formats.
            non_special_tokens = ["<|im_start|>", "<|im_end|>",]
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=tokenizer,
            model=model,
            non_special_tokens=non_special_tokens,
        )
    return model, tokenizer
def print_trainable_parameters(args, model):
    """
    Print how many of the model's parameters require gradients, next to
    the total parameter count. `args` is accepted but not read.
    """
    counts = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    all_param = sum(size for size, _ in counts)
    trainable_params = sum(size for size, needs_grad in counts if needs_grad)
    summary = f"trainable params: {trainable_params} || " + f"all params: {all_param} || "
    print(summary)
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
    non_special_tokens = None,
):
    """Add tokens to the tokenizer and grow the model embeddings to match.

    The rows created for the new tokens are set, in both the input and
    the output embedding matrices, to the mean of the pre-existing rows.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    added_special = tokenizer.add_special_tokens(special_tokens_dict)
    added_regular = tokenizer.add_tokens(non_special_tokens)
    num_new_tokens = added_special + added_regular
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        for embedding in (model.get_input_embeddings(), model.get_output_embeddings()):
            weights = embedding.weight.data
            # The mean is taken over the old rows only, so the order in
            # which the two matrices are filled does not matter.
            weights[-num_new_tokens:] = weights[:-num_new_tokens].mean(dim=0, keepdim=True)

    print(f"Resized tokenizer and embedding to {len(tokenizer)} tokens.")
@dataclass
class DataCollatorForCausalLM(object):
    """Turn `{'input', 'output'}` records into a padded causal-LM batch.

    Sources are prefixed with the BOS token and targets suffixed with the
    EOS token before tokenization. In training mode each row is
    source + target; with `predict_with_generate` each row is the source
    only and no labels are produced.
    """

    tokenizer: transformers.PreTrainedTokenizer
    source_max_len: int
    target_max_len: int
    train_on_source: bool
    predict_with_generate: bool

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        tok = self.tokenizer

        # Build the raw strings
        sources = [f"{tok.bos_token}{example['input']}" for example in instances]
        targets = [f"{example['output']}{tok.eos_token}" for example in instances]

        # Tokenize each side with its own length cap; special tokens were
        # already spliced in by hand above.
        shared = dict(truncation=True, add_special_tokens=False)
        source_ids = tok(sources, max_length=self.source_max_len, **shared)['input_ids']
        target_ids = tok(targets, max_length=self.target_max_len, **shared)['input_ids']

        # Assemble per-example rows
        rows, label_rows = [], []
        for src, tgt in zip(source_ids, target_ids):
            if self.predict_with_generate:
                rows.append(torch.tensor(src))
                continue
            rows.append(torch.tensor(src + tgt))
            # Unless training on the source, its positions are masked out.
            prefix = src if self.train_on_source else [IGNORE_INDEX] * len(src)
            label_rows.append(torch.tensor(prefix + tgt))

        # Right-pad to the longest row in the batch
        padded = pad_sequence(rows, batch_first=True, padding_value=tok.pad_token_id)
        batch = {
            'input_ids': padded,
            'attention_mask': padded.ne(tok.pad_token_id),
        }
        if not self.predict_with_generate:
            batch['labels'] = pad_sequence(label_rows, batch_first=True, padding_value=IGNORE_INDEX)
        return batch
def extract_unnatural_instructions_data(examples, extract_reformulations=False):
    """Flatten a batch of nested instruction records into input/output columns.

    Every instance under `examples['instances']` contributes one row. With
    `extract_reformulations` set, the instances under
    `examples['reformulations']` are appended afterwards, skipping `None`
    entries.
    """
    groups = list(examples['instances'])
    if extract_reformulations:
        groups.extend(
            group for group in examples['reformulations'] if group is not None
        )
    flat = [instance for group in groups for instance in group]
    return {
        'input': [instance['instruction_with_input'] for instance in flat],
        'output': [instance['output'] for instance in flat],
    }
# Prompt templates for Alpaca-style records: one for records that carry an
# extra `input` field, one for instruction-only records.
ALPACA_PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, "
        "paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Input:\n{input}\n\n"
        "### Response: "
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Response: "
    ),
}
def extract_alpaca_dataset(example):
    """Render one Alpaca record into its prompt text.

    The with-input template is used when the record has a non-empty
    `input` field, the instruction-only template otherwise. Returns a
    dict whose single `input` key holds the rendered prompt.
    """
    has_input = example.get("input", "") != ""
    template_key = "prompt_input" if has_input else "prompt_no_input"
    return {'input': ALPACA_PROMPT_DICT[template_key].format(**example)}
def local_dataset(dataset_name):
    """Load a local .json/.jsonl/.csv/.tsv file and split it 90/10.

    Returns the result of `train_test_split(test_size=0.1)` on the loaded
    dataset. Raises ValueError for any other file extension.
    """
    if dataset_name.endswith(('.json', '.jsonl')):
        loaded = Dataset.from_json(path_or_paths=dataset_name)
    elif dataset_name.endswith('.csv'):
        frame = pd.read_csv(dataset_name)
        loaded = Dataset.from_pandas(frame)
    elif dataset_name.endswith('.tsv'):
        frame = pd.read_csv(dataset_name, delimiter='\t')
        loaded = Dataset.from_pandas(frame)
    else:
        raise ValueError(f"Unsupported dataset format: {dataset_name}")
    return loaded.train_test_split(test_size=0.1)
def make_data_module(tokenizer: transformers.PreTrainedTokenizer, args) -> Dict:
    """
    Make dataset and collator for supervised fine-tuning.
    Datasets are expected to have the following columns: { `input`, `output` }

    Available datasets to be selected with `dataset` argument:
        - alpaca, 52002 examples
        - alpaca cleaned, 51942 examples
        - chip2 (OIG), 210289 examples
        - self-instruct, 82612 examples
        - hh-rlhf (Anthropic), 160800 examples
        - longform, 23.7k examples
        - oasst1 (OpenAssistant) primary message tree only, 9,846 examples
        - squad_es
        - OpenAssistant/oasst_top1_2023-08-25
        - a path to a local .json/.jsonl/.csv/.tsv file

    NOTE(review): `self-instruct` has a format branch below but no loader
    entry, so it is only reachable through a local file — confirm intent.

    Coming soon:
        - unnatural instructions core, 66010 examples
        - unnatural instructions full, 240670 examples
        - alpaca-gpt4, 52002 examples
        - unnatural-instructions-gpt4, 9000 examples
        - supernatural-instructions, 69624 examples (same as paper with 100 ex/task more can be used)
        - flan (FLAN v2), up to 20M examples available
        - vicuna

    Args:
        tokenizer: tokenizer handed to the collator.
        args: namespace providing `dataset`, `dataset_format`, `do_train`,
            `do_eval`, `do_predict`, `eval_dataset_size`,
            `max_eval_samples`, `max_train_samples`, `group_by_length`,
            `source_max_len`, `target_max_len`, `train_on_source`,
            `predict_with_generate` and `use_4_bit`. `dataset_format` is
            set to "input-output" in place when a local file is loaded
            without an explicit format.

    Returns:
        dict with `train_dataset`, `eval_dataset`, `predict_dataset` and
        `data_collator`. A dataset entry is None when its `do_*` flag is
        off; `data_collator` is None when `use_4_bit` is set.

    Raises:
        NotImplementedError: the dataset name is neither known nor an
            existing path (or is 'vicuna').
        ValueError: a local dataset file could not be loaded.
    """
    # name -> (positional args, keyword args) for `load_dataset`.
    hub_sources = {
        'alpaca': (("tatsu-lab/alpaca",), {}),
        'alpaca-clean': (("yahma/alpaca-cleaned",), {}),
        'squad_es': (("squad_es", "v1.1.0"), {}),
        'chip2': (("laion/OIG",), {'data_files': 'unified_chip2.jsonl'}),
        'hh-rlhf': (("Anthropic/hh-rlhf",), {}),
        'longform': (("akoksal/LongForm",), {}),
        'oasst1': (("timdettmers/openassistant-guanaco",), {}),
        "OpenAssistant/oasst_top1_2023-08-25": (("OpenAssistant/oasst_top1_2023-08-25",), {}),
    }

    def load_data(dataset_name):
        """Load a known hub dataset by name, or fall back to a local file."""
        if dataset_name in hub_sources:
            positional, keyword = hub_sources[dataset_name]
            return load_dataset(*positional, **keyword)
        if dataset_name == 'vicuna':
            raise NotImplementedError("Vicuna data was not released.")
        if not os.path.exists(dataset_name):
            raise NotImplementedError(f"Dataset {dataset_name} not implemented yet.")

        # Local files are assumed to already carry `input`/`output` columns
        # unless the caller says otherwise.
        args.dataset_format = args.dataset_format if args.dataset_format else "input-output"
        try:
            return local_dataset(dataset_name)
        except Exception as err:
            # Narrower than a bare `except:` (which would also trap
            # KeyboardInterrupt/SystemExit) and keeps the root cause chained.
            raise ValueError(f"Error loading dataset from {dataset_name}") from err

    def squad_chatml_mapper(system_prompt, context_label, question_label):
        """Build the row mapper rendering a flattened SQuAD record as ChatML."""
        def to_chatml(x):
            return {
                'input': (
                    '<|im_start|>system ' + system_prompt + '\n'
                    + '<|im_start|>user ' + context_label + ': ```' + x['context']
                    + '``` ' + question_label + ': "' + x['question'] + '"\n'
                ),
                'output': (
                    '<|im_start|>assistant ' + x['answers.text'][0] + '<|im_end|>'
                ),
            }
        return to_chatml

    def chip2_to_pair(x):
        """Split a chip2 transcript into the human turn and the bot reply."""
        parts = x['text'].split('\n<bot>: ')
        return {
            'input': parts[0].replace('<human>: ', ''),
            'output': parts[1],
        }

    def format_dataset(dataset, dataset_format):
        """Map the raw dataset onto `input`/`output` columns and drop the rest."""
        def selected(*names):
            # An explicit format wins; otherwise fall back on the dataset name.
            return dataset_format in names or (dataset_format is None and args.dataset in names)

        if selected('alpaca', 'alpaca-clean'):
            dataset = dataset.map(extract_alpaca_dataset, remove_columns=['instruction'])
        elif selected('chip2'):
            dataset = dataset.map(chip2_to_pair)
        elif selected('self-instruct'):
            for old, new in [["prompt", "input"], ["completion", "output"]]:
                dataset = dataset.rename_column(old, new)
        elif selected('hh-rlhf'):
            dataset = dataset.map(lambda x: {
                'input': '',
                'output': x['chosen']
            })
        elif selected('oasst1'):
            dataset = dataset.map(lambda x: {
                'input': '',
                'output': x['text'],
            })
        elif dataset_format == 'squad_es_chatml':
            dataset = dataset.flatten()
            dataset = dataset.map(squad_chatml_mapper(
                'Eres una IA especialista en extraer información relevante de un contexto. Se te proporcionará un contexto y una pregunta y extraerás la información relevante del contexto para responder a la pregunta',
                'Contexto',
                'Pregunta',
            ))
        elif dataset_format == 'squad_en_chatml':
            dataset = dataset.flatten()
            dataset = dataset.map(squad_chatml_mapper(
                'You are an AI specialized on extract relevant information from a context. You will be provided with a context and a question and you will extract the relevant information from the context to reply to the question',
                'Context',
                'Question',
            ))
        elif dataset_format == 'input-output':
            # leave as is
            pass
        # Remove unused columns.
        dataset = dataset.remove_columns(
            [col for col in dataset.column_names['train'] if col not in ['input', 'output']]
        )
        return dataset

    def with_length(split):
        """Attach the character-length column used for length grouping."""
        return split.map(lambda x: {'length': len(x['input']) + len(x['output'])})

    # Load dataset. `load_data` may update `args.dataset_format`, so it must
    # run before the format is read.
    dataset = load_data(args.dataset)
    dataset = format_dataset(dataset, args.dataset_format)

    train_dataset = None
    eval_dataset = None

    # Split train/eval, reduce size
    if args.do_eval or args.do_predict:
        if 'eval' in dataset:
            eval_dataset = dataset['eval']
        elif 'validation' in dataset:
            eval_dataset = dataset['validation']
        else:
            print('Splitting train dataset in train and validation according to `eval_dataset_size`')
            dataset = dataset["train"].train_test_split(
                test_size=args.eval_dataset_size, shuffle=True, seed=42
            )
            eval_dataset = dataset['test']
        if args.max_eval_samples is not None and len(eval_dataset) > args.max_eval_samples:
            eval_dataset = eval_dataset.select(range(args.max_eval_samples))
        if args.group_by_length:
            eval_dataset = with_length(eval_dataset)
    if args.do_train:
        train_dataset = dataset['train']
        if args.max_train_samples is not None and len(train_dataset) > args.max_train_samples:
            train_dataset = train_dataset.select(range(args.max_train_samples))
        if args.group_by_length:
            train_dataset = with_length(train_dataset)

    data_collator = DataCollatorForCausalLM(
        tokenizer=tokenizer,
        source_max_len=args.source_max_len,
        target_max_len=args.target_max_len,
        train_on_source=args.train_on_source,
        predict_with_generate=args.predict_with_generate,
    )
    return dict(
        train_dataset=train_dataset if args.do_train else None,
        eval_dataset=eval_dataset if args.do_eval else None,
        predict_dataset=eval_dataset if args.do_predict else None,
        data_collator=data_collator if not args.use_4_bit else None
    )
def get_last_checkpoint(checkpoint_dir):
    """Find the most recent `checkpoint-<step>` directory of a previous run.

    Args:
        checkpoint_dir: the run's output directory.

    Returns:
        `(path, completed)`, where `path` is the directory of the
        highest-numbered checkpoint or None, and `completed` is True when
        a `completed` marker exists in `checkpoint_dir`.

    Sub-directories whose name starts with `checkpoint` but has no integer
    step suffix (e.g. `checkpoint-tmp`, `checkpoints`) are ignored instead
    of aborting the scan with a ValueError.
    """
    if not isdir(checkpoint_dir):
        return None, False  # first training

    is_completed = exists(join(checkpoint_dir, 'completed'))
    if is_completed:
        return None, True  # already finished

    max_step = 0
    for filename in os.listdir(checkpoint_dir):
        if not (isdir(join(checkpoint_dir, filename)) and filename.startswith('checkpoint')):
            continue
        try:
            step = int(filename.replace('checkpoint-', ''))
        except ValueError:
            continue  # not a numbered checkpoint directory
        max_step = max(max_step, step)

    if max_step == 0:
        return None, is_completed  # training started, but no checkpoint

    checkpoint_dir = join(checkpoint_dir, f'checkpoint-{max_step}')
    print(f"Found a previous checkpoint at: {checkpoint_dir}")
    return checkpoint_dir, is_completed  # checkpoint found!
def train():
    """Parse the command line, then run the requested train/eval/predict stages.

    Builds the model, tokenizer and data module, picks `SFTTrainer` when
    `use_4_bit` is set and `Seq2SeqTrainer` otherwise, and writes
    `predictions.jsonl` (predict stage) and `metrics.json` into
    `output_dir`.
    """
    hfparser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments, GenerationArguments)
    )
    parsed = hfparser.parse_args_into_dataclasses(return_remaining_strings=True)
    model_args, data_args, training_args, generation_args, extra_args = parsed
    training_args.generation_config = transformers.GenerationConfig(**vars(generation_args))
    # One flat namespace so helpers can read any option from `args`.
    args = argparse.Namespace(
        **vars(model_args), **vars(data_args), **vars(training_args)
    )
    print(args)

    checkpoint_dir, completed_training = get_last_checkpoint(args.output_dir)
    if completed_training:
        print('Detected that training was already completed!')

    model, tokenizer = get_accelerate_model(args, checkpoint_dir)
    model.config.use_cache = False
    print('loaded model')
    set_seed(args.seed)

    data_module = make_data_module(tokenizer=tokenizer, args=args)
    # The trainers take everything from the data module but the predict split.
    trainer_inputs = {k: v for k, v in data_module.items() if k != 'predict_dataset'}

    if args.use_4_bit:
        def sft_func(data):
            # Join each prompt with its completion into one training text.
            return [
                f"{prompt}{data['output'][i]}"
                for i, prompt in enumerate(data['input'])
            ]

        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            formatting_func=sft_func,
            **trainer_inputs,
        )
    else:
        trainer = Seq2SeqTrainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            **trainer_inputs,
        )

    # Verifying the datatypes and parameter counts before training.
    print_trainable_parameters(args, model)
    dtypes = {}
    for _, p in model.named_parameters():
        dtypes[p.dtype] = dtypes.get(p.dtype, 0) + p.numel()
    total = sum(dtypes.values())
    for dtype, count in dtypes.items():
        print(dtype, count, count/total)

    all_metrics = {"run_name": args.run_name}

    # Training
    if args.do_train:
        logger.info("*** Train ***")
        # Note: `resume_from_checkpoint` not supported for adapter checkpoints by HF.
        # Currently adapter checkpoint is reloaded as expected but optimizer/scheduler states are not.
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        all_metrics.update(metrics)

    # Evaluation
    if args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(metric_key_prefix="eval")
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
        all_metrics.update(metrics)

    # Prediction
    if args.do_predict:
        logger.info("*** Predict ***")
        predict_dataset = data_module['predict_dataset']
        prediction_output = trainer.predict(test_dataset=predict_dataset, metric_key_prefix="predict")
        prediction_metrics = prediction_output.metrics
        # Swap the -100 filler for the pad id so the ids can be decoded.
        token_ids = prediction_output.predictions
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        predictions = tokenizer.batch_decode(
            token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        predictions_path = os.path.join(args.output_dir, 'predictions.jsonl')
        with open(predictions_path, 'w') as fout:
            for i, example in enumerate(data_module['predict_dataset']):
                decoded = predictions[i]
                example['prediction_with_input'] = decoded.strip()
                example['prediction'] = decoded.replace(example['input'], '').strip()
                fout.write(json.dumps(example) + '\n')
        print(prediction_metrics)
        trainer.log_metrics("predict", prediction_metrics)
        trainer.save_metrics("predict", prediction_metrics)
        all_metrics.update(prediction_metrics)

    if any((args.do_train, args.do_eval, args.do_predict)):
        metrics_path = os.path.join(args.output_dir, "metrics.json")
        with open(metrics_path, "w") as fout:
            fout.write(json.dumps(all_metrics))
# Script entry point: run the fine-tuning pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    train()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment