Benchmark bnb 4bit vs GPTQ
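A small script that benchmarks the generation latency of Llama-2-7B quantized to 4-bit with bitsandbytes against the same model quantized with GPTQ, across batch sizes 1 to 32, and plots the average per-generate time with seaborn.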
# You need the following libraries:
# transformers == 4.32.0
# bitsandbytes == 0.41.0
# auto-gptq == 0.4.2
# optimum == 1.12.0
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import seaborn as sns
N_BATCHES = 10
MAX_NEW_TOKENS = 30
BATCH_SIZE = [1, 2, 4, 8, 16, 32]

bnb_model_id = "meta-llama/Llama-2-7b-hf"
gptq_model_id = "TheBloke/Llama-2-7B-GPTQ"
def warmup_and_benchmark(model, inputs):
    # Warmup: one untimed generate() so CUDA kernel setup and caching do not
    # pollute the measurement. eos_token_id=-1 disables early stopping, so
    # every call generates the full number of requested tokens.
    _ = model.generate(**inputs, max_new_tokens=20, eos_token_id=-1)

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    # Time N_BATCHES generate() calls with CUDA events.
    start_event.record()
    for _ in range(N_BATCHES):
        _ = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, eos_token_id=-1)
    end_event.record()
    torch.cuda.synchronize()

    # elapsed_time() returns milliseconds; convert to seconds and average.
    return (start_event.elapsed_time(end_event) * 1.0e-3) / N_BATCHES
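# Not part of the original gist: a minimal sketch of a variant that also
# reports peak GPU memory for the benchmarked loop, using torch.cuda's
# allocator statistics. Handy if you want to compare the memory footprint of
# the two quantization schemes alongside latency.
def warmup_and_benchmark_with_memory(model, inputs):
    torch.cuda.reset_peak_memory_stats()
    avg_time = warmup_and_benchmark(model, inputs)
    peak_mem_gb = torch.cuda.max_memory_allocated() / 1024**3
    return avg_time, peak_mem_gb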
# Load the base model in 4-bit with bitsandbytes, computing in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(bnb_model_id)

bnb_model = AutoModelForCausalLM.from_pretrained(
    bnb_model_id,
    quantization_config=quantization_config,
    device_map={"": 0},
    use_auth_token=True,
)

# The GPTQ checkpoint ships its own quantization config, so no extra
# arguments are needed (auto-gptq and optimum must be installed).
gptq_model = AutoModelForCausalLM.from_pretrained(
    gptq_model_id,
    device_map={"": 0},
)
bnb_total_time_dict = {}
gptq_total_time_dict = {}

for batch_size in tqdm(BATCH_SIZE):
    text = ["hello"] * batch_size
    inputs = tokenizer(text, return_tensors="pt").to("cuda:0")

    # Each call warms up first, then returns the average per-generate latency.
    bnb_timing = warmup_and_benchmark(bnb_model, inputs)
    bnb_total_time_dict[f"{batch_size}"] = bnb_timing

    gptq_timing = warmup_and_benchmark(gptq_model, inputs)
    gptq_total_time_dict[f"{batch_size}"] = gptq_timing
sns.set(style="darkgrid")

# Plot both latency curves against batch size.
sns.lineplot(data=bnb_total_time_dict, color="blue", label="bitsandbytes-QLoRA")
sns.lineplot(data=gptq_total_time_dict, color="orange", label="GPTQ-4bit")

plt.ylabel("Average inference time (s)")
plt.xlabel("Batch size")
plt.title("Average inference time: bnb-4bit vs GPTQ", fontsize=8)
plt.legend()

# Save the plot to disk.
plt.savefig("seaborn_comparison_plot.jpg", dpi=300)
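# Optional post-processing (not in the original gist): each timed generate()
# call decodes MAX_NEW_TOKENS tokens for every sequence in the batch, so an
# approximate decode throughput (the one-token prefill is negligible here) is
# batch_size * MAX_NEW_TOKENS / avg_time.
for name, timings in [("bnb-4bit", bnb_total_time_dict), ("GPTQ-4bit", gptq_total_time_dict)]:
    for batch_size, avg_time in timings.items():
        tokens_per_s = int(batch_size) * MAX_NEW_TOKENS / avg_time
        print(f"{name} | batch {batch_size}: {avg_time:.3f} s, {tokens_per_s:.1f} tokens/s")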