Skip to content

Instantly share code, notes, and snippets.

@SunMarc
Last active July 26, 2024 15:37
Show Gist options
  • Save SunMarc/dcdb499ac16d355a8f265aa497645996 to your computer and use it in GitHub Desktop.
Save SunMarc/dcdb499ac16d355a8f265aa497645996 to your computer and use it in GitHub Desktop.
Finetune GPTQ model with peft and trl
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from dataclasses import dataclass, field
from typing import Optional
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
HfArgumentParser,
AutoTokenizer,
TrainingArguments,
)
from peft import prepare_model_for_kbit_training, get_peft_model
from transformers import GPTQConfig
from trl import SFTTrainer
# This example fine-tunes a Llama 2 model on the Guanaco dataset
# using GPTQ and peft.
# Use it by passing the correct --model_name argument when running the
# script. The default model is ybelkada/llama-7b-GPTQ-test
# Versions used:
# accelerate == 0.21.0
# auto-gptq == 0.4.2
# trl == 0.4.7
# peft from source
# transformers from source
# optimum from source
# For models that have `config.pretraining_tp > 1` install:
# pip install git+https://github.com/huggingface/transformers.git
@dataclass
class ScriptArguments:
    """
    Command-line options for the GPTQ + LoRA fine-tuning script.

    These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train.
    """

    local_rank: Optional[int] = field(default=-1, metadata={"help": "Used for multi-gpu"})

    # Batch / optimisation hyper-parameters.
    per_device_train_batch_size: Optional[int] = field(default=4)
    per_device_eval_batch_size: Optional[int] = field(default=1)
    gradient_accumulation_steps: Optional[int] = field(default=4)
    learning_rate: Optional[float] = field(default=2e-4)
    max_grad_norm: Optional[float] = field(default=0.3)
    # Annotated as float (not int): the default is fractional, and the
    # annotation is what the argument parser uses to convert the CLI value.
    weight_decay: Optional[float] = field(default=0.001)

    # LoRA adapter hyper-parameters.
    lora_alpha: Optional[int] = field(default=16)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_r: Optional[int] = field(default=64)

    max_seq_length: Optional[int] = field(default=512)
    model_name: Optional[str] = field(
        default="ybelkada/llama-7b-GPTQ-test",
        metadata={
            "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
        },
    )
    dataset_name: Optional[str] = field(
        default="timdettmers/openassistant-guanaco",
        metadata={"help": "The dataset to fine-tune on."},
    )
    num_train_epochs: Optional[int] = field(
        default=1,
        metadata={"help": "The number of training epochs."},
    )
    fp16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables fp16 training."},
    )
    bf16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables bf16 training."},
    )
    packing: Optional[bool] = field(
        default=False,
        metadata={"help": "Use packing dataset creating."},
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables gradient checkpointing."},
    )
    optim: Optional[str] = field(
        default="adamw_hf",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: str = field(
        default="constant",
        metadata={"help": "Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis"},
    )
    max_steps: int = field(default=10000, metadata={"help": "How many optimizer update steps to take"})
    warmup_ratio: float = field(default=0.03, metadata={"help": "Fraction of steps to do a warmup for"})
    group_by_length: bool = field(
        default=True,
        metadata={
            "help": "Group sequences into batches with same length. Saves memory and speeds up training considerably."
        },
    )
    save_steps: int = field(default=10, metadata={"help": "Save checkpoint every X updates steps."})
    logging_steps: int = field(default=10, metadata={"help": "Log every X updates steps."})
    merge_and_push: Optional[bool] = field(
        default=False,
        metadata={"help": "Merge and push weights after training"},
    )
    output_dir: str = field(
        default="./results",
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
    )
# Build the command-line parser from the ScriptArguments fields and keep the
# first parsed dataclass instance as the script configuration.
parser = HfArgumentParser(ScriptArguments)
script_args, *_ = parser.parse_args_into_dataclasses()
def create_and_prepare_model(args):
    """Load the GPTQ-quantized base model, its tokenizer and a LoRA config.

    Args:
        args: Parsed script arguments. Reads ``model_name``, ``lora_alpha``,
            ``lora_dropout`` and ``lora_r``.

    Returns:
        A ``(model, peft_config, tokenizer)`` tuple.
    """
    # Only probe the device capability when CUDA is present, so the bf16 hint
    # itself cannot be the thing that fails on a machine without a GPU.
    if torch.cuda.is_available():
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("=" * 80)
            print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
            print("=" * 80)

    # Load the entire model on the GPU 0
    device_map = {"": 0}
    # switch to `device_map = "auto"` for multi-GPU
    # device_map = "auto"

    # The exllama kernels need to be disabled: they are not very stable for
    # training.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        device_map=device_map,
        quantization_config=GPTQConfig(bits=4, disable_exllama=True),
    )

    # check: https://github.com/huggingface/transformers/pull/24906
    model.config.pretraining_tp = 1

    # Read every setting from `args` (not the module-level `script_args`) so
    # the function honours whatever configuration the caller passes in.
    peft_config = LoraConfig(
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        r=args.lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    return model, peft_config, tokenizer
# Trainer configuration, taken from the parsed command-line arguments.
# weight_decay and num_train_epochs are forwarded so that the corresponding
# CLI options are not silently ignored.
training_arguments = TrainingArguments(
    output_dir=script_args.output_dir,
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    optim=script_args.optim,
    save_steps=script_args.save_steps,
    logging_steps=script_args.logging_steps,
    learning_rate=script_args.learning_rate,
    weight_decay=script_args.weight_decay,
    fp16=script_args.fp16,
    bf16=script_args.bf16,
    max_grad_norm=script_args.max_grad_norm,
    num_train_epochs=script_args.num_train_epochs,
    max_steps=script_args.max_steps,
    warmup_ratio=script_args.warmup_ratio,
    group_by_length=script_args.group_by_length,
    lr_scheduler_type=script_args.lr_scheduler_type,
)

model, peft_config, tokenizer = create_and_prepare_model(script_args)

# Prepare the quantized model for training and wrap it with the LoRA adapter.
# The --gradient_checkpointing option is forwarded here so it has an effect.
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=script_args.gradient_checkpointing,
)
model = get_peft_model(model, peft_config)
model.config.use_cache = False

dataset = load_dataset(script_args.dataset_name, split="train")

# Fix weird overflow issue with fp16 training
tokenizer.padding_side = "right"

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=script_args.max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=script_args.packing,
)

trainer.train()

if script_args.merge_and_push:
    # NOTE(review): despite the option name, the visible code only saves the
    # trained model under <output_dir>/final_checkpoints; no merge or push
    # step follows in this file.
    output_dir = os.path.join(script_args.output_dir, "final_checkpoints")
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights
    # NOTE(review): `trainer` is still alive at this point and was given the
    # same model object, so this presumably does not release it - confirm.
    del model
    torch.cuda.empty_cache()
@SunMarc
Copy link
Author

SunMarc commented Dec 27, 2023

Hi @glf1030, this happens because the qwen model is not supported by default. See the supported list here. Hence you need to pass target_modules in peft_config. For your model, you can pass target_modules = ["c_attn"]

@glf1030
Copy link

glf1030 commented Dec 28, 2023

Hi @glf1030, this happens because the qwen model is not supported by default. See the supported list here. Hence you need to pass target_modules in peft_config. For your model, you can pass target_modules = ["c_attn"]

hi, thanks for your reply. I passed target_modules=['c_attn'], and it works for training;
but for inference, I used following code:

model_id = "/data/lifan/Qwen-72B-Chat-Int4"
adapter_model_id = "/data/lifan/cicc_lora_after_gptq_training_checkpoint"

tokenizer = AutoTokenizer.from_pretrained(model_id,use_fast=True,
    trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

model = AutoModelForCausalLM.from_pretrained(model_id,device_map="auto",
    trust_remote_code=True)
peft_config = PeftConfig.from_pretrained(adapter_model_id)
# to initiate with random weights
peft_config.init_lora_weights = False

model.add_adapter(peft_config)
model.enable_adapters()
output = model.generate(**inputs)

return tokenizer.decode(output[0])

============================= I got following error ======================================

File "/data/lifan/miniconda3/envs/llama_factory/lib/python3.9/site-packages/peft/tuners/tuners_utils.py", line 90, in init
self.inject_adapter(self.model, adapter_name)
File "/data/lifan/miniconda3/envs/llama_factory/lib/python3.9/site-packages/peft/tuners/tuners_utils.py", line 247, in inject_adapter
self._create_and_replace(peft_config, adapter_name, target, target_name, parent, **optional_kwargs)
File "/data/lifan/miniconda3/envs/llama_factory/lib/python3.9/site-packages/peft/tuners/lora/model.py", line 202, in _create_and_replace
new_module = self._create_new_module(lora_config, adapter_name, target, **kwargs)
File "/data/lifan/miniconda3/envs/llama_factory/lib/python3.9/site-packages/peft/tuners/lora/model.py", line 355, in _create_new_module
raise ValueError(
ValueError: Target module QuantLinear() is not supported. Currently, only the following modules are supported: torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, transformers.pytorch_utils.Conv1D.

Would you please take a look at it ? Best.

@SunMarc
Copy link
Author

SunMarc commented Dec 28, 2023

Hi @glf1030, make sure you have the latest version of peft.

@glf1030
Copy link

glf1030 commented Jan 2, 2024

Hi @glf1030, make sure you have the latest version of peft.

hi, my version is 0.7.1.
(llama_factory) [lifan@iZ0jld5hy53xg1wwoistghZ ~]$ pip show peft
Name: peft
Version: 0.7.1
Summary: Parameter-Efficient Fine-Tuning (PEFT)
Home-page: https://github.com/huggingface/peft
Author: The HuggingFace team
Author-email: [email protected]
License: Apache
Location: /data/lifan/miniconda3/envs/llama_factory/lib/python3.9/site-packages
Requires: accelerate, huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch, tqdm, transformers
Required-by: auto-gptq

@DimensionZer0
Copy link

Working for me with a quantized model. Is it possible to merge the resulting adapter with the GPTQ base model? Is there any example for this?

@abpani
Copy link

abpani commented Jul 26, 2024

@SunMarc
I am trying this with llama3.1 8B gptq My model gets loaded unevenly on the GPU so not able to use more than 1 batch size on a 4 A10 GPU machine. huggingface/transformers#32199

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment