Train a GPTQ-quantized Llama 2 model using PEFT/LoRA
# %%
# this is run from /notebooks on paperspace
from huggingface_hub import login
from dotenv import load_dotenv
load_dotenv("/notebooks/.env")
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
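# the .env file loaded above is assumed to define HUGGINGFACE_TOKEN=<your token>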
login(token=os.getenv("HUGGINGFACE_TOKEN"))
# %%
from datasets import load_dataset
dataset_name = "knkarthick/dialogsum"
ds = load_dataset(dataset_name)
# %%
train_ds, test_ds = load_dataset(dataset_name, split=["train", "test[0:200]"])
# %%
import pandas as pd
# convert to pandas
train_df = pd.DataFrame(train_ds)
test_df = pd.DataFrame(test_ds)
train_df.head()
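# %%
# optional sanity check: the prep function below assumes "dialogue" and "summary" columns
print(train_df.columns.tolist(), train_df.shape, test_df.shape)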
# %%
# instruction finetuning data preparation function
def prepare_dataset(df, split="train"):
    text_col = []
    instruction = """Write a concise summary of the below input text. Ensure that the response covers the key points of the text. Only provide full sentence responses."""  # change the instruction according to the task
    if split == "train":
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            output = row["summary"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
                + output
                + "\n### End"
            )  # keeping the output column in the training dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    else:
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
            )  # not keeping the output column in the test dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    return df
train_df = prepare_dataset(train_df, "train")
test_df = prepare_dataset(test_df, "test")
print(train_df.iloc[0].text)
# %%
# converting the dataframe to a huggingface dataset for easy finetuning
from datasets import Dataset
dataset = Dataset.from_pandas(train_df)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# sharded model path in hugging face
# model_name = "TinyPixel/Llama-2-7B-bf16-sharded"
# model_name = 'NousResearch/Llama-2-7b-hf'
# Quantization config
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype="float16",
# )
# take a pre-quantized model from hugging face
model_id = "TheBloke/Llama-2-7B-GPTQ"
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# loading the model with the bitsandbytes quantization config (alternative path, unused here)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     trust_remote_code=True,
#     device_map='auto'
# )
# can change to False if you need the newest model update.
# %%
from peft import prepare_model_for_kbit_training
from transformers import GPTQConfig
# model_id = "TheBloke/Llama-2-7B-GPTQ"
# model_id = "TheBloke/Llama-2-7b-Chat-GPTQ"
quantization_config_loading = GPTQConfig(bits=4, disable_exllama=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=quantization_config_loading, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.config.use_cache = False
model.config.pretraining_tp = 1
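# pretraining_tp=1 uses the standard Linear forward instead of replaying
# Llama-2's tensor-parallel slicing, which is slower and only needed to
# reproduce pretraining logits exactly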
# %%
model.gradient_checkpointing_enable()
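# prepare_model_for_kbit_training freezes the base weights, upcasts norm/output
# layers to fp32 for stability, and enables input grads so gradient
# checkpointing works with the quantized model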
model = prepare_model_for_kbit_training(model)
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["k_proj", "o_proj", "q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
# %%
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
# the llama 2 tokenizer ships without a pad token, so reuse eos for padding
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    max_steps=100,
    learning_rate=2e-4,
    fp16=True,  # use mixed precision training
    logging_steps=1,
    output_dir="outputs_gptq_training",
    optim="adamw_hf",
    save_strategy="epoch",
    report_to="none")
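# effective batch size = per_device_train_batch_size * gradient_accumulation_steps = 16 per device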
from trl import SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=512)
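# with packing=False each formatted "text" example is tokenized on its own and
# truncated to max_seq_length tokens rather than being concatenated into blocks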
# %%
train_result = trainer.train()
# %%
checkpoint_name = "final_checkpoints_gptqsummarizer_7b_peft"
# save the trained LoRA adapter weights (the GPTQ base model itself is not modified)
output_dir = os.path.join(args.output_dir, checkpoint_name)
trainer.model.save_pretrained(output_dir)
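# optional: also persist the tokenizer next to the adapter so inference can
# reload both from the same directory
tokenizer.save_pretrained(output_dir)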
# To perform inference on the test dataset
# this is one way, but loading from the saved dir is better!
# from peft import PeftModel
# from rich import print
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map='auto'
# )
# model = PeftModel.from_pretrained(base_model,
#     model_id=output_dir,
#     device_map='auto',
# )
# from peft import LoraConfig, get_peft_model
# lora_config = LoraConfig.from_pretrained(output_dir)
# model = get_peft_model(model, lora_config)
from peft import AutoPeftModelForCausalLM
# To perform inference on a test dataset example, load the model from the checkpoint
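# AutoPeftModelForCausalLM reads adapter_config.json in output_dir, re-loads the
# GPTQ base model it points to, and attaches the trained LoRA adapter on top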
persisted_model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda",
)
# %%
# inference on a test data example
from time import perf_counter
from rich import print
from transformers import GenerationConfig
text = test_df['text'][4]
inputs = tokenizer(text, return_tensors="pt").to('cuda')
generation_config = GenerationConfig(
    penalty_alpha=0.6,
    do_sample=True,
    top_k=5,
    temperature=0.5,
    repetition_penalty=1.2,
    max_new_tokens=100
)
start_time = perf_counter()
outputs = persisted_model.generate(**inputs, generation_config=generation_config)
end_time = perf_counter()  # stop the timer before decoding so only generation is measured
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
output_time = end_time - start_time
print(f"Time taken for inference: {round(output_time, 2)} seconds")