-
-
Save SunMarc/dcdb499ac16d355a8f265aa497645996 to your computer and use it in GitHub Desktop.
# coding=utf-8 | |
# Copyright 2023 The HuggingFace Inc. team. All rights reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
import os | |
from dataclasses import dataclass, field | |
from typing import Optional | |
import torch | |
from datasets import load_dataset | |
from peft import LoraConfig | |
from transformers import ( | |
AutoModelForCausalLM, | |
AutoTokenizer, | |
HfArgumentParser, | |
AutoTokenizer, | |
TrainingArguments, | |
) | |
from peft import prepare_model_for_kbit_training, get_peft_model | |
from transformers import GPTQConfig | |
from trl import SFTTrainer | |
# This example fine-tunes Llama 2 model on Guanaco dataset | |
# using GPTQ and peft. | |
# Use it by correctly passing --model_name argument when running the | |
# script. The default model is ybelkada/llama-7b-GPTQ-test | |
# Versions used: | |
# accelerate == 0.21.0 | |
# auto-gptq == 0.4.2 | |
# trl == 0.4.7 | |
# peft from source | |
# transformers from source | |
# optimum from source | |
# For models that have `config.pretraining_tp > 1` install: | |
# pip install git+https://github.com/huggingface/transformers.git | |
@dataclass
class ScriptArguments:
    """
    Command-line options for the GPTQ + LoRA fine-tuning script.

    These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train.
    Every field becomes a `--flag` once the class is handed to `HfArgumentParser`.
    """

    local_rank: Optional[int] = field(default=-1, metadata={"help": "Used for multi-gpu"})
    per_device_train_batch_size: Optional[int] = field(default=4)
    per_device_eval_batch_size: Optional[int] = field(default=1)
    gradient_accumulation_steps: Optional[int] = field(default=4)
    learning_rate: Optional[float] = field(default=2e-4)
    max_grad_norm: Optional[float] = field(default=0.3)
    # Annotated as float (it was int): the default 0.001 is fractional, and the
    # argument parser derives the flag's type from the annotation, so an int
    # annotation would presumably reject values such as `--weight_decay 0.01`.
    weight_decay: Optional[float] = field(default=0.001)
    lora_alpha: Optional[int] = field(default=16)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_r: Optional[int] = field(default=64)
    max_seq_length: Optional[int] = field(default=512)
    model_name: Optional[str] = field(
        default="ybelkada/llama-7b-GPTQ-test",
        metadata={
            "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
        },
    )
    dataset_name: Optional[str] = field(
        default="timdettmers/openassistant-guanaco",
        metadata={"help": "The preference dataset to use."},
    )
    num_train_epochs: Optional[int] = field(
        default=1,
        metadata={"help": "The number of training epochs for the reward model."},
    )
    fp16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables fp16 training."},
    )
    bf16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables bf16 training."},
    )
    packing: Optional[bool] = field(
        default=False,
        metadata={"help": "Use packing dataset creating."},
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables gradient checkpointing."},
    )
    optim: Optional[str] = field(
        default="adamw_hf",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: str = field(
        default="constant",
        metadata={"help": "Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis"},
    )
    max_steps: int = field(default=10000, metadata={"help": "How many optimizer update steps to take"})
    warmup_ratio: float = field(default=0.03, metadata={"help": "Fraction of steps to do a warmup for"})
    group_by_length: bool = field(
        default=True,
        metadata={
            "help": "Group sequences into batches with same length. Saves memory and speeds up training considerably."
        },
    )
    save_steps: int = field(default=10, metadata={"help": "Save checkpoint every X updates steps."})
    logging_steps: int = field(default=10, metadata={"help": "Log every X updates steps."})
    merge_and_push: Optional[bool] = field(
        default=False,
        metadata={"help": "Merge and push weights after training"},
    )
    output_dir: str = field(
        default="./results",
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
    )
# Build the CLI parser from the ScriptArguments fields and parse the command
# line; only one dataclass type is registered, so the first parsed object is
# the ScriptArguments instance.
parser = HfArgumentParser(ScriptArguments)
_parsed_dataclasses = parser.parse_args_into_dataclasses()
script_args = _parsed_dataclasses[0]
def create_and_prepare_model(args):
    """Load the GPTQ-quantized base model, the LoRA configuration and the tokenizer.

    Args:
        args: Parsed script arguments. Reads `model_name`, `lora_alpha`,
            `lora_dropout` and `lora_r`.

    Returns:
        A `(model, peft_config, tokenizer)` tuple.
    """
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
        print("=" * 80)

    # Load the entire model on the GPU 0
    device_map = {"": 0}
    # switch to `device_map = "auto"` for multi-GPU
    # device_map = "auto"

    # need to disable exllama kernel
    # exllama kernel are not very stable for training
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        device_map=device_map,
        quantization_config=GPTQConfig(bits=4, disable_exllama=True),
    )

    # check: https://github.com/huggingface/transformers/pull/24906
    model.config.pretraining_tp = 1

    # Read the LoRA settings and tokenizer name from the `args` parameter
    # rather than the module-level `script_args`, so the function honours
    # whatever arguments object the caller passes in.
    peft_config = LoraConfig(
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        r=args.lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    return model, peft_config, tokenizer
# Trainer configuration built from the parsed command-line arguments.
# weight_decay, num_train_epochs, per_device_eval_batch_size and
# gradient_checkpointing are forwarded as well, so that passing those flags
# actually affects the run instead of being silently ignored.
# NOTE(review): per the TrainingArguments documentation a positive max_steps
# takes precedence over num_train_epochs -- confirm for the pinned version.
training_arguments = TrainingArguments(
    output_dir=script_args.output_dir,
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    gradient_checkpointing=script_args.gradient_checkpointing,
    optim=script_args.optim,
    save_steps=script_args.save_steps,
    logging_steps=script_args.logging_steps,
    learning_rate=script_args.learning_rate,
    weight_decay=script_args.weight_decay,
    fp16=script_args.fp16,
    bf16=script_args.bf16,
    max_grad_norm=script_args.max_grad_norm,
    num_train_epochs=script_args.num_train_epochs,
    max_steps=script_args.max_steps,
    warmup_ratio=script_args.warmup_ratio,
    group_by_length=script_args.group_by_length,
    lr_scheduler_type=script_args.lr_scheduler_type,
)

# Load the quantized base model, then wrap it with the LoRA adapter.
model, peft_config, tokenizer = create_and_prepare_model(script_args)
# Honour --gradient_checkpointing here too; the default (True) matches the
# helper's own default, so the default run is prepared exactly as before.
model = prepare_model_for_kbit_training(
    model, use_gradient_checkpointing=script_args.gradient_checkpointing
)
model = get_peft_model(model, peft_config)
model.config.use_cache = False

dataset = load_dataset(script_args.dataset_name, split="train")

# Fix weird overflow issue with fp16 training
tokenizer.padding_side = "right"

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=script_args.max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=script_args.packing,
)

trainer.train()

if script_args.merge_and_push:
    # Save the trained model under <output_dir>/final_checkpoints.
    output_dir = os.path.join(script_args.output_dir, "final_checkpoints")
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights
    # NOTE(review): no merge or push step is visible after this point, and
    # `trainer` was built with this same model object -- confirm that memory
    # is actually released here.
    del model
    torch.cuda.empty_cache()
Hi @glf1030, this happens because the Qwen model is not supported by default. See the supported list here. Hence you need to pass `target_modules` in `peft_config`. For your model, you can pass `target_modules = ["c_attn"]`.
Hi, thanks for your reply. I passed target_modules=['c_attn'], and it works for training,
but for inference I used the following code:
model_id = "/data/lifan/Qwen-72B-Chat-Int4"
adapter_model_id = "/data/lifan/cicc_lora_after_gptq_training_checkpoint"
tokenizer = AutoTokenizer.from_pretrained(model_id,use_fast=True,
trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
model = AutoModelForCausalLM.from_pretrained(model_id,device_map="auto",
trust_remote_code=True)
peft_config = PeftConfig.from_pretrained(adapter_model_id)
# to initiate with random weights
peft_config.init_lora_weights = False
model.add_adapter(peft_config)
model.enable_adapters()
output = model.generate(**inputs)
return tokenizer.decode(output[0])
============================= I got following error ======================================
File "/data/lifan/miniconda3/envs/llama_factory/lib/python3.9/site-packages/peft/tuners/tuners_utils.py", line 90, in init
self.inject_adapter(self.model, adapter_name)
File "/data/lifan/miniconda3/envs/llama_factory/lib/python3.9/site-packages/peft/tuners/tuners_utils.py", line 247, in inject_adapter
self._create_and_replace(peft_config, adapter_name, target, target_name, parent, **optional_kwargs)
File "/data/lifan/miniconda3/envs/llama_factory/lib/python3.9/site-packages/peft/tuners/lora/model.py", line 202, in _create_and_replace
new_module = self._create_new_module(lora_config, adapter_name, target, **kwargs)
File "/data/lifan/miniconda3/envs/llama_factory/lib/python3.9/site-packages/peft/tuners/lora/model.py", line 355, in _create_new_module
raise ValueError(
ValueError: Target module QuantLinear() is not supported. Currently, only the following modules are supported: `torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `transformers.pytorch_utils.Conv1D`.
Would you please take a look at it ? Best.
Hi @glf1030, make sure you have the latest version of peft.
Hi @glf1030, make sure you have the latest version of peft.
hi, my version is 0.7.1.
(llama_factory) [lifan@iZ0jld5hy53xg1wwoistghZ ~]$ pip show peft
Name: peft
Version: 0.7.1
Summary: Parameter-Efficient Fine-Tuning (PEFT)
Home-page: https://github.com/huggingface/peft
Author: The HuggingFace team
Author-email: [email protected]
License: Apache
Location: /data/lifan/miniconda3/envs/llama_factory/lib/python3.9/site-packages
Requires: accelerate, huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch, tqdm, transformers
Required-by: auto-gptq
This is working for me with a quantized model. Is it possible to merge the resulting adapter with the GPTQ base model? Is there any example for this?
@SunMarc
I am trying this with Llama 3.1 8B GPTQ. My model gets loaded unevenly across the GPUs, so I am not able to use a batch size larger than 1 on a machine with 4 A10 GPUs. huggingface/transformers#32199
ValueError: Please specify `target_modules` in `peft_config`.
Hi, I am wondering how to fix this. My quantized model is:
QWenLMHeadModel(
(transformer): QWenModel(
(wte): Embedding(152064, 8192)
(drop): Dropout(p=0.0, inplace=False)
(rotary_emb): RotaryEmbedding()
(h): ModuleList(
(0-79): 80 x QWenBlock(
(ln_1): RMSNorm()
(attn): QWenAttention(
(attn_dropout): Dropout(p=0.0, inplace=False)
(c_attn): QuantLinear()
(c_proj): QuantLinear()
)
(ln_2): RMSNorm()
(mlp): QWenMLP(
(c_proj): QuantLinear()
(w1): QuantLinear()
(w2): QuantLinear()
)
)
)
(ln_f): RMSNorm()
)
(lm_head): Linear(in_features=8192, out_features=152064, bias=False)
)