Create a Hugging Face text-generation pipeline with a LoRA-trained Alpaca model
from typing import Optional

import torch
from transformers.utils import is_accelerate_available, is_bitsandbytes_available
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
    pipeline,
)
from peft import PeftModel

# Standard Alpaca prompt template for instruction + input pairs.
ALPACA_TEMPLATE = (
    "Below is an instruction that describes a task, paired with an input that provides "
    "further context. Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
)


def load_adapted_hf_generation_pipeline(
    base_model_name: str,
    lora_model_name: str,
    temperature: float = 0,
    top_p: float = 1.0,
    max_tokens: int = 50,
    batch_size: int = 16,
    device: str = "cpu",
    load_in_8bit: bool = True,
    generation_kwargs: Optional[dict] = None,
):
    """
    Load a Hugging Face causal LM, apply a LoRA adapter with PEFT, and wrap the
    result in a text-generation pipeline.

    Borrowed from https://github.com/tloen/alpaca-lora/blob/main/generate.py
    """
    if device == "cuda":
        if not is_accelerate_available():
            raise ValueError("Install `accelerate`")
        if load_in_8bit and not is_bitsandbytes_available():
            raise ValueError("Install `bitsandbytes`")

    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    task = "text-generation"

    if device == "cuda":
        # GPU: load the base model in 8-bit (or fp16) across available devices,
        # then attach the LoRA weights on top.
        model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            load_in_8bit=load_in_8bit,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        model = PeftModel.from_pretrained(
            model,
            lora_model_name,
            torch_dtype=torch.float16,
        )
    elif device == "mps":
        # Apple Silicon: fp16 on the MPS backend.
        model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            device_map={"": device},
            torch_dtype=torch.float16,
        )
        model = PeftModel.from_pretrained(
            model,
            lora_model_name,
            device_map={"": device},
            torch_dtype=torch.float16,
        )
    else:
        # CPU fallback: full precision, low memory usage while loading.
        model = AutoModelForCausalLM.from_pretrained(
            base_model_name, device_map={"": device}, low_cpu_mem_usage=True
        )
        model = PeftModel.from_pretrained(
            model,
            lora_model_name,
            device_map={"": device},
        )
    # unwind broken decapoda-research config
    model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
    model.config.bos_token_id = 1
    model.config.eos_token_id = 2

    if not load_in_8bit and device != "cpu":
        model.half()  # seems to fix bugs for some users; fp16 is not reliable on CPU
    model.eval()
    generation_kwargs = generation_kwargs if generation_kwargs is not None else {}
    config = GenerationConfig(
        do_sample=temperature > 0,  # sampling requires temperature > 0; otherwise decode greedily
        temperature=temperature,
        max_new_tokens=max_tokens,
        top_p=top_p,
        **generation_kwargs,
    )
    pipe = pipeline(
        task,
        model=model,
        tokenizer=tokenizer,
        batch_size=batch_size,
        generation_config=config,
        framework="pt",
    )
    return pipe


if __name__ == "__main__":
    pipe = load_adapted_hf_generation_pipeline(
        base_model_name="decapoda-research/llama-7b-hf",
        lora_model_name="tloen/alpaca-lora-7b",
    )
    prompt = ALPACA_TEMPLATE.format(
        instruction="Paraphrase the sentence.",
        input="The quick brown fox jumped over the lazy dog.",
    )
    print(pipe(prompt))
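
The pipeline also accepts a list of prompts, which is where the batch_size argument comes into play. Below is a minimal sketch of batched use, assuming the pipe built above; the example instructions and the "### Response:" split are illustrative, not part of the gist.

prompts = [
    ALPACA_TEMPLATE.format(
        instruction="Summarize the sentence.",
        input="The quick brown fox jumped over the lazy dog.",
    ),
    ALPACA_TEMPLATE.format(
        instruction="Translate the sentence to French.",
        input="The quick brown fox jumped over the lazy dog.",
    ),
]
outputs = pipe(prompts)  # one list of generations per input prompt
for generations in outputs:
    full_text = generations[0]["generated_text"]  # prompt followed by the completion, by default
    # keep only the text generated after the "### Response:" marker
    print(full_text.split("### Response:")[-1].strip())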