Train a GPTQ-quantized Llama 2 model using PEFT/LoRA
# %%
# this is run from /notebooks on paperspace
from huggingface_hub import login
from dotenv import load_dotenv
load_dotenv("/notebooks/.env")
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
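# the .env file loaded above is assumed to define HUGGINGFACE_TOKEN=<your token>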
login(token=os.getenv("HUGGINGFACE_TOKEN"))
# %%
from datasets import load_dataset
dataset_name = "knkarthick/dialogsum"
ds = load_dataset(dataset_name)
# %%
train_ds, test_ds = load_dataset(dataset_name, split=["train", "test[0:200]"])
# %%
import pandas as pd
# convert to pandas
train_df = pd.DataFrame(train_ds)
test_df = pd.DataFrame(test_ds)
train_df.head()
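# %%
# optional sanity check: the prep function below assumes "dialogue" and "summary" columns
print(train_df.columns.tolist(), train_df.shape, test_df.shape)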
# %%
# instruction finetuning data preparation function
def prepare_dataset(df, split="train"):
    text_col = []
    instruction = """Write a concise summary of the below input text. Ensure that the response covers the key points of the text. Only provide full sentence responses."""  # change the instruction according to the task
    if split == "train":
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            output = row["summary"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
                + output
                + "\n### End"
            )  # keeping the output column in the training dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    else:
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
            )  # not keeping the output column in the test dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    return df
train_df = prepare_dataset(train_df, "train")
test_df = prepare_dataset(test_df, "test")
print(train_df.iloc[0].text)
# %%
# converting the dataframe to a huggingface dataset for easy finetuning
from datasets import Dataset
dataset = Dataset.from_pandas(train_df)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# sharded model path in hugging face
# model_name = "TinyPixel/Llama-2-7B-bf16-sharded"
# model_name = 'NousResearch/Llama-2-7b-hf'
# Quantization config
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype="float16",
# )
# take a pre-quantized model from hugging face
model_id = "TheBloke/Llama-2-7B-GPTQ"
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# loading the model with the bitsandbytes quantization config (alternative path, unused here)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     trust_remote_code=True,
#     device_map='auto'
# )
# can change to False if you need the newest model update.
# %%
from peft import prepare_model_for_kbit_training
from transformers import GPTQConfig
# model_id = "TheBloke/Llama-2-7B-GPTQ"
# model_id = "TheBloke/Llama-2-7b-Chat-GPTQ"
quantization_config_loading = GPTQConfig(bits=4, disable_exllama=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=quantization_config_loading, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.config.use_cache = False
model.config.pretraining_tp = 1
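# pretraining_tp=1 uses the standard Linear forward instead of replaying
# Llama-2's tensor-parallel slicing, which is slower and only needed to
# reproduce pretraining logits exactly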
# %%
model.gradient_checkpointing_enable()
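# prepare_model_for_kbit_training freezes the base weights, upcasts norm/output
# layers to fp32 for stability, and enables input grads so gradient
# checkpointing works with the quantized model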
model = prepare_model_for_kbit_training(model)
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["k_proj", "o_proj", "q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
# %%
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
# the llama 2 tokenizer ships without a pad token, so reuse eos for padding
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    max_steps=100,
    learning_rate=2e-4,
    fp16=True,  # use mixed precision training
    logging_steps=1,
    output_dir="outputs_gptq_training",
    optim="adamw_hf",
    save_strategy="epoch",
    report_to="none")
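# effective batch size = per_device_train_batch_size * gradient_accumulation_steps = 16 per device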
from trl import SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=512)
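# with packing=False each formatted "text" example is tokenized on its own and
# truncated to max_seq_length tokens rather than being concatenated into blocks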
# %%
train_result = trainer.train()
# %%
checkpoint_name = "final_checkpoints_gptqsummarizer_7b_peft"
# save the trained LoRA adapter weights (the GPTQ base model itself is not modified)
output_dir = os.path.join(args.output_dir, checkpoint_name)
trainer.model.save_pretrained(output_dir)
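# optional: also persist the tokenizer next to the adapter so inference can
# reload both from the same directory
tokenizer.save_pretrained(output_dir)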
# To perform inference on the test dataset
# this is one way, but loading from the saved dir is better!
# from peft import PeftModel
# from rich import print
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map='auto'
# )
# model = PeftModel.from_pretrained(base_model,
#     model_id=output_dir,
#     device_map='auto',
# )
# from peft import LoraConfig, get_peft_model
# lora_config = LoraConfig.from_pretrained(output_dir)
# model = get_peft_model(model, lora_config)
from peft import AutoPeftModelForCausalLM
# To perform inference on a test dataset example, load the model from the checkpoint
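# AutoPeftModelForCausalLM reads adapter_config.json in output_dir, re-loads the
# GPTQ base model it points to, and attaches the trained LoRA adapter on top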
persisted_model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda",
)
# %%
# inference on a test data example
from time import perf_counter
from rich import print
from transformers import GenerationConfig
text = test_df['text'][4]
inputs = tokenizer(text, return_tensors="pt").to('cuda')
generation_config = GenerationConfig(
    penalty_alpha=0.6,
    do_sample=True,
    top_k=5,
    temperature=0.5,
    repetition_penalty=1.2,
    max_new_tokens=100
)
start_time = perf_counter()
outputs = persisted_model.generate(**inputs, generation_config=generation_config)
end_time = perf_counter()  # stop the timer before decoding so only generation is measured
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
output_time = end_time - start_time
print(f"Time taken for inference: {round(output_time, 2)} seconds")