Start with QLoRA training and then quantize to disk with GPTQ
# %%
from huggingface_hub import login
from dotenv import load_dotenv
load_dotenv("/notebooks/.env")
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
login(token=os.getenv("HUGGINGFACE_TOKEN"))
# %%
from datasets import load_dataset
dataset_name = "knkarthick/dialogsum"
ds = load_dataset(dataset_name)
# %%
train_ds, test_ds = load_dataset(dataset_name, split=["train", "test[0:200]"])
# %%
import pandas as pd
# convert to pandas
train_df = pd.DataFrame(train_ds)
test_df = pd.DataFrame(test_ds)
train_df.head()
# %%
# instruction finetuning data preparation function
def prepare_dataset(df, split="train"):
    text_col = []
    instruction = """Write a concise summary of the input text below. Ensure that the response covers the key points of the text. Only provide full-sentence responses."""  # change the instruction according to the task
    if split == "train":
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            output = row["summary"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
                + output
                + "\n### End"
            )  # keeping the output column in the training dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    else:
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
            )  # not keeping the output column in the test dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    return df
train_df = prepare_dataset(train_df, "train")
test_df = prepare_dataset(test_df, "test")
print(test_df.iloc[0].text)
# %%
# converting the dataframe to a Hugging Face dataset for easy finetuning
from datasets import Dataset
dataset = Dataset.from_pandas(train_df)
# %%
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# sharded model path on the Hugging Face Hub
# model_name = "TinyPixel/Llama-2-7B-bf16-sharded"
model_name = "meta-llama/Llama-2-7b-hf"
# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)
# loading the model with the quantization config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)
# use_cache speeds up generation but is incompatible with gradient checkpointing,
# so the Trainer disables it during training anyway
model.config.use_cache = True
# disable the tensor-parallel slicing of linear layers, apparently needed because of
# https://github.com/huggingface/transformers/pull/24906
model.config.pretraining_tp = 1
# %%
from rich import print
tokenizer = AutoTokenizer.from_pretrained(
    model_name, trust_remote_code=True, return_token_type_ids=False
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
print(model)
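# %%
# Optional sanity check (not part of the original gist): look at the token-length
# distribution of the formatted training prompts to judge how much the
# max_seq_length of 512 used later will truncate. A 1,000-row sample keeps it fast.
sample_texts = train_df["text"].sample(1000, random_state=0).tolist()
token_lengths = [len(tokenizer(t)["input_ids"]) for t in sample_texts]
print(pd.Series(token_lengths).describe())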
# %%
from peft import LoraConfig, get_peft_model
lora_alpha = 16
lora_dropout = 0.05
lora_r = 8  # rank
# LoRA configuration for parameter-efficient finetuning
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=[
        "q_proj",
        "v_proj",
    ],  # create adapters only for the q and v projection matrices of the attention modules
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)
import transformers
output_dir = "llama2_qlora_finetuned_7b_hf"
training_arguments = transformers.TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    save_strategy="epoch",
    logging_steps=10,
    num_train_epochs=1,
    max_steps=100,
    fp16=True,
    push_to_hub=False,
)
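# %%
# Rough arithmetic check (assumes a single GPU): the effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 4 * 4 = 16, so
# max_steps=100 touches roughly 1,600 of the ~12.5k training examples.
effective_batch_size = (
    training_arguments.per_device_train_batch_size
    * training_arguments.gradient_accumulation_steps
)
print(f"Effective batch size: {effective_batch_size}")
print(
    f"Examples seen in {training_arguments.max_steps} steps: "
    f"{effective_batch_size * training_arguments.max_steps}"
)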
# %%
# creating the trainer with the training arguments
from peft import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
from trl import SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,  # passing the peft config
    dataset_text_field="text",  # column containing the formatted prompts
    args=training_arguments,  # training arguments
    tokenizer=tokenizer,  # tokenizer
    packing=False,
    max_seq_length=512,
)
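# %%
# Optional: confirm how few parameters LoRA actually trains. SFTTrainer wraps the
# base model in a PeftModel when peft_config is passed, which exposes
# print_trainable_parameters().
trainer.model.print_trainable_parameters()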
# %%
from time import perf_counter
start_time = perf_counter()
trainer.train()
end_time = perf_counter()
training_time = end_time - start_time
print(f"Time taken for training: {training_time} seconds")
# %%
checkpoint_name = "final_check_qlora_7b_hf_base"
# save the LoRA adapter weights (merging with the base model happens later)
output_dir = os.path.join(output_dir, checkpoint_name)
trainer.model.save_pretrained(output_dir)
# %%
# from peft import LoraConfig, get_peft_model
# lora_config = LoraConfig.from_pretrained(output_dir)
# lmodel = get_peft_model(model, lora_config)
# # %%
# from time import perf_counter
# from rich import print
# start_time = perf_counter()
# text = test_df['text'][0]
# inputs = tokenizer(text, return_tensors="pt").to('cuda')
# outputs = lmodel.generate(**inputs, max_new_tokens=100)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# end_time = perf_counter()
# output_time = end_time - start_time
# print(f"Time taken for inference: {output_time} seconds")
# %%
from peft import AutoPeftModelForCausalLM
persisted_model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda",
)
# %%
from rich import print
from time import perf_counter
from transformers import GenerationConfig
start_time = perf_counter()
text = test_df["text"][3]
inputs = tokenizer(text, return_tensors="pt").to("cuda")
generation_config = GenerationConfig(
    penalty_alpha=0.6,
    do_sample=True,
    top_k=5,
    temperature=0.5,
    repetition_penalty=1.2,
)
outputs = persisted_model.generate(
    **inputs, max_new_tokens=100, generation_config=generation_config
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
end_time = perf_counter()
output_time = end_time - start_time
print(f"Time taken for inference: {output_time} seconds")
# %%
# Merge LoRA and base model
merged_model = persisted_model.merge_and_unload()
# %%
# Save the merged model
merged_model.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")
# %%
# now let's apply GPTQ on the merged model
from transformers import GPTQConfig
quantization_config = GPTQConfig(
    bits=4,
    dataset="c4",  # named calibration set; a list here would be treated as raw calibration text
    desc_act=False,
)
tokenizer = AutoTokenizer.from_pretrained("merged_model")
quant_model = AutoModelForCausalLM.from_pretrained(
    "merged_model", quantization_config=quantization_config,
    device_map="auto"
)
# Save the quantized model
quant_model.save_pretrained("quant_model", safe_serialization=True)
tokenizer.save_pretrained("quant_model")
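# %%
# Rough size comparison (assumes both models are still in memory): GPTQ 4-bit
# should shrink the merged fp16 model by roughly 3-4x.
print(f"Merged fp16 model: {merged_model.get_memory_footprint() / 1e9:.2f} GB")
print(f"GPTQ 4-bit model:  {quant_model.get_memory_footprint() / 1e9:.2f} GB")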
# %%
from rich import print
from time import perf_counter
from transformers import GenerationConfig
start_time = perf_counter()
text = test_df["text"][3]
inputs = tokenizer(text, return_tensors="pt").to("cuda")
generation_config = GenerationConfig(
    penalty_alpha=0.6,
    do_sample=True,
    top_k=5,
    temperature=0.5,
    repetition_penalty=1.2,
)
outputs = quant_model.generate(
    **inputs, max_new_tokens=100, generation_config=generation_config
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
end_time = perf_counter()
output_time = end_time - start_time
print(f"Time taken for inference: {output_time} seconds")
# %%
# push the GPTQ-quantized model and tokenizer to the Hub
hf_model_repo = "dsmonk/llama2-7b-ftqlora-gptq"
quant_model.push_to_hub(hf_model_repo)
tokenizer.push_to_hub(hf_model_repo)
# %%
# reload the GPTQ model from disk with the exllama kernels disabled
quantization_config_loading = GPTQConfig(bits=4, disable_exllama=True)
from_disk_model = AutoModelForCausalLM.from_pretrained(
    "quant_model", quantization_config=quantization_config_loading, device_map="auto"
)
# %%
from rich import print
from time import perf_counter
from transformers import GenerationConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from_disk_model = AutoModelForCausalLM.from_pretrained(
    "quant_model",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("quant_model")
# run inference code
start_time = perf_counter()
text = test_df["text"][3]
inputs = tokenizer(text, return_tensors="pt").to("cuda")
generation_config = GenerationConfig(
    penalty_alpha=0.6,
    do_sample=True,
    top_k=5,
    temperature=0.5,
    repetition_penalty=1.2,
)
outputs = from_disk_model.generate(
    **inputs, max_new_tokens=100, generation_config=generation_config
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
end_time = perf_counter()
output_time = end_time - start_time
print(f"Time taken for inference: {output_time} seconds")
# %%