@datasciencemonkey · Created October 22, 2023
Start with QLoRA training and then quantize to disk with GPTQ.
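# Pipeline overview: load the dialogsum dataset, build instruction-style
# prompts, load Llama-2-7b in 4-bit (bitsandbytes NF4), fine-tune LoRA
# adapters with TRL's SFTTrainer, merge the adapters into the base model,
# then GPTQ-quantize the merged model to 4 bit, save it to disk, and push
# it to the Hugging Face Hub.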
# %%
from huggingface_hub import login
from dotenv import load_dotenv
load_dotenv("/notebooks/.env")
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
login(token=os.getenv("HUGGINGFACE_TOKEN"))
# %%
from datasets import load_dataset
dataset_name = "knkarthick/dialogsum"
ds = load_dataset(dataset_name)
# %%
train_ds, test_ds = load_dataset(dataset_name, split=["train", "test[0:200]"])
# %%
import pandas as pd
# convert to pandas
train_df = pd.DataFrame(train_ds)
test_df = pd.DataFrame(test_ds)
train_df.head()
# %%
# instruction finetuning data preparation function
def prepare_dataset(df, split="train"):
    text_col = []
    instruction = """Write a concise summary of the below input text. Ensure that the response covers the key points of the text. Only provide full sentence responses."""  # change the instruction according to the task
    if split == "train":
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            output = row["summary"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
                + output
                + "\n### End"
            )  # keep the output column in the training dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    else:
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
            )  # do not keep the output column in the test dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    return df
train_df = prepare_dataset(train_df, "train")
test_df = prepare_dataset(test_df, "test")
print(test_df.iloc[0].text)
# %%
# converting the dataframe to a Hugging Face dataset for easy fine-tuning
from datasets import Dataset
dataset = Dataset.from_pandas(train_df)
# %%
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# sharded model path in hugging face
# model_name = "TinyPixel/Llama-2-7B-bf16-sharded"
model_name = "meta-llama/Llama-2-7b-hf"
# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)
# loading the model with quantization config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)
# the KV cache speeds up generation; it is turned off automatically while
# gradient checkpointing is active during training
model.config.use_cache = True
# apparently needed because of
# https://github.com/huggingface/transformers/pull/24906
# disable tensor parallelism
model.config.pretraining_tp = 1
# %%
from rich import print
tokenizer = AutoTokenizer.from_pretrained(
model_name, trust_remote_code=True, return_token_type_ids=False
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
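# Llama-2 has no pad token by default, so reuse EOS for padding and pad on
# the right for causal-LM fine-tuning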
print(model)
# %%
from peft import LoraConfig, get_peft_model
lora_alpha = 16
lora_dropout = 0.05
lora_r = 8 # rank
# Parameter efficient finetuning for LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=[
        "q_proj",
        "v_proj",
    ],  # only create adapters for the q, v matrices of the attention modules
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)
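# note: the effective LoRA scaling factor is lora_alpha / r = 16 / 8 = 2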
import transformers
output_dir = "llama2_qlora_finetuned_7b_hf"
training_arguments = transformers.TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    save_strategy="epoch",
    logging_steps=10,
    num_train_epochs=1,
    max_steps=100,
    fp16=True,
    push_to_hub=False,
)
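# note: max_steps=100 takes precedence over num_train_epochs, so training
# stops after 100 optimizer steps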
# %%
# creating the trainer with the training arguments
from peft import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
from trl import SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,  # pass the LoRA config
    dataset_text_field="text",  # column that holds the formatted prompts
    args=training_arguments,  # training arguments
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=512,
)
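# because peft_config is passed in, SFTTrainer wraps the model with the LoRA
# adapters itself, so only the adapter weights are updated during training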
# %%
from time import perf_counter
start_time = perf_counter()
trainer.train()
end_time = perf_counter()
training_time = end_time - start_time
print(f"Time taken for training: {training_time} seconds")
# %%
checkpoint_name = "final_check_qlora_7b_hf_base"
# to merge and save the model
output_dir = os.path.join(output_dir, checkpoint_name)
trainer.model.save_pretrained(output_dir)
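# this writes only the LoRA adapter weights and adapter config, not the full
# base model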
# %%
# from peft import LoraConfig, get_peft_model
# lora_config = LoraConfig.from_pretrained(output_dir)
# lmodel = get_peft_model(model, lora_config)
# # %%
# from time import perf_counter
# from rich import print
# start_time = perf_counter()
# text = test_df['text'][0]
# inputs = tokenizer(text, return_tensors="pt").to('cuda')
# outputs = lmodel.generate(**inputs, max_new_tokens=100)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# end_time = perf_counter()
# output_time = end_time - start_time
# print(f"Time taken for inference: {output_time} seconds")
# %%
from peft import AutoPeftModelForCausalLM
persisted_model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda",
)
# %%
from rich import print
from time import perf_counter
from transformers import GenerationConfig
start_time = perf_counter()
text = test_df["text"][3]
inputs = tokenizer(text, return_tensors="pt").to("cuda")
generation_config = GenerationConfig(
    penalty_alpha=0.6,
    do_sample=True,
    top_k=5,
    temperature=0.5,
    repetition_penalty=1.2,
)
outputs = persisted_model.generate(
    **inputs, max_new_tokens=100, generation_config=generation_config
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
end_time = perf_counter()
output_time = end_time - start_time
print(f"Time taken for inference: {output_time} seconds")
# %%
# Merge LoRA and base model
merged_model = persisted_model.merge_and_unload()
# %%
# Save the merged model
merged_model.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")
# %%
# now let's apply GPTQ on the merged model
from transformers import GPTQConfig
quantization_config = GPTQConfig(
    bits=4,
    dataset="c4",  # name of a built-in calibration set (a list of strings would be treated as custom calibration texts)
    desc_act=False,
)
tokenizer = AutoTokenizer.from_pretrained("merged_model")
quant_model = AutoModelForCausalLM.from_pretrained(
"merged_model", quantization_config=quantization_config,
device_map="auto"
)
# Save the quantized model
quant_model.save_pretrained("quant_model", safe_serialization=True)
tokenizer.save_pretrained("quant_model")
# %%
from rich import print
from time import perf_counter
from transformers import GenerationConfig
start_time = perf_counter()
text = test_df["text"][3]
inputs = tokenizer(text, return_tensors="pt").to("cuda")
generation_config = GenerationConfig(
    penalty_alpha=0.6,
    do_sample=True,
    top_k=5,
    temperature=0.5,
    repetition_penalty=1.2,
)
outputs = quant_model.generate(
    **inputs, max_new_tokens=100, generation_config=generation_config
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
end_time = perf_counter()
output_time = end_time - start_time
print(f"Time taken for inference: {output_time} seconds")
# %%
hf_model_repo = "dsmonk/llama2-7b-ftqlora-gptq"
# this pushes the merged fp16 model and tokenizer; push quant_model instead
# if the GPTQ weights are what should live in this repo
merged_model.push_to_hub(hf_model_repo)
tokenizer.push_to_hub(hf_model_repo)
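# %%
# To pull the pushed checkpoint back down later, something like the following
# should work (a sketch; hub_model / hub_tokenizer are illustrative names and
# it assumes the Hub repo above exists and you are logged in with access):
hub_model = AutoModelForCausalLM.from_pretrained(hf_model_repo, device_map="auto")
hub_tokenizer = AutoTokenizer.from_pretrained(hf_model_repo)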
# %%
# reload the GPTQ checkpoint from disk; disable_exllama=True skips the
# exllama CUDA kernels
quantization_config_loading = GPTQConfig(bits=4, disable_exllama=True)
from_disk_model = AutoModelForCausalLM.from_pretrained(
    "quant_model", quantization_config=quantization_config_loading, device_map="auto"
)
# %%
from rich import print
from time import perf_counter
from transformers import GenerationConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from_disk_model = AutoModelForCausalLM.from_pretrained(
    "quant_model", device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("quant_model")
# run inference code
start_time = perf_counter()
text = test_df["text"][3]
inputs = tokenizer(text, return_tensors="pt").to("cuda")
generation_config = GenerationConfig(
    penalty_alpha=0.6,
    do_sample=True,
    top_k=5,
    temperature=0.5,
    repetition_penalty=1.2,
)
outputs = from_disk_model.generate(
    **inputs, max_new_tokens=100, generation_config=generation_config
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
end_time = perf_counter()
output_time = end_time - start_time
print(f"Time taken for inference: {output_time} seconds")
# %%