
@funkytaco
Forked from ahoho/prompt_alpaca_lora.py
Created July 26, 2023 05:09

Revisions

  1. @ahoho revised this gist Apr 25, 2023. 1 changed file with 1 addition and 2 deletions.
    3 changes: 1 addition & 2 deletions prompt_alpaca_lora.py
    @@ -38,7 +38,7 @@ def load_adapted_hf_generation_pipeline(
         if device == "cuda":
             if not is_accelerate_available():
                 raise ValueError("Install `accelerate`")
    -        if load_in_8bit and not is_accelerate_available():
    +        if load_in_8bit and not is_bitsandbytes_available():
                 raise ValueError("Install `bitsandbytes`")

         tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    @@ -96,7 +96,6 @@ def load_adapted_hf_generation_pipeline(
             top_p=top_p,
             **generation_kwargs,
         )
    -    # TODO: add generation config to pipeline
         pipe = pipeline(
             task,
             model=model,
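
    As an aside (not part of the gist), the same transformers.utils helpers that this revision relies on can also be used as a standalone pre-flight check before loading anything. A minimal sketch; check_gpu_deps is a hypothetical helper name, not something the gist defines:

        # Illustrative pre-flight check mirroring the revised guard
        # (check_gpu_deps is a hypothetical helper, not part of the gist).
        from transformers.utils import is_accelerate_available, is_bitsandbytes_available

        def check_gpu_deps(load_in_8bit: bool = True) -> None:
            """Fail fast if the optional GPU / 8-bit dependencies are missing."""
            missing = []
            if not is_accelerate_available():
                missing.append("accelerate")
            if load_in_8bit and not is_bitsandbytes_available():
                missing.append("bitsandbytes")
            if missing:
                raise ValueError(f"Install {', '.join(missing)}")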
  2. @ahoho created this gist Apr 25, 2023.
    120 changes: 120 additions & 0 deletions prompt_alpaca_lora.py
    @@ -0,0 +1,120 @@
    from typing import Optional, Any

    import torch

    from transformers.utils import is_accelerate_available, is_bitsandbytes_available
    from transformers import (
        AutoTokenizer,
        AutoModelForCausalLM,
        GenerationConfig,
        pipeline,
    )

    from peft import PeftModel

    ALPACA_TEMPLATE = (
        "Below is an instruction that describes a task, paired with an input that provides "
        "further context. Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
    )


    def load_adapted_hf_generation_pipeline(
        base_model_name,
        lora_model_name,
        temperature: float = 0,
        top_p: float = 1.,
        max_tokens: int = 50,
        batch_size: int = 16,
        device: str = "cpu",
        load_in_8bit: bool = True,
        generation_kwargs: Optional[dict] = None,
    ):
        """
        Load a huggingface model & adapt with PEFT.
        Borrowed from https://github.com/tloen/alpaca-lora/blob/main/generate.py
        """

        if device == "cuda":
            if not is_accelerate_available():
                raise ValueError("Install `accelerate`")
            if load_in_8bit and not is_accelerate_available():
                raise ValueError("Install `bitsandbytes`")

        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        task = "text-generation"

        if device == "cuda":
            model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                load_in_8bit=load_in_8bit,
                torch_dtype=torch.float16,
                device_map="auto",
            )
            model = PeftModel.from_pretrained(
                model,
                lora_model_name,
                torch_dtype=torch.float16,
            )
        elif device == "mps":
            model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                device_map={"": device},
                torch_dtype=torch.float16,
            )
            model = PeftModel.from_pretrained(
                model,
                lora_model_name,
                device_map={"": device},
                torch_dtype=torch.float16,
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(
                base_model_name, device_map={"": device}, low_cpu_mem_usage=True
            )
            model = PeftModel.from_pretrained(
                model,
                lora_model_name,
                device_map={"": device},
            )

        # unwind broken decapoda-research config
        model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
        model.config.bos_token_id = 1
        model.config.eos_token_id = 2

        if not load_in_8bit:
            model.half()  # seems to fix bugs for some users.

        model.eval()

        generation_kwargs = generation_kwargs if generation_kwargs is not None else {}
        config = GenerationConfig(
            do_sample=True,
            temperature=temperature,
            max_new_tokens=max_tokens,
            top_p=top_p,
            **generation_kwargs,
        )
        # TODO: add generation config to pipeline
        pipe = pipeline(
            task,
            model=model,
            tokenizer=tokenizer,
            batch_size=16,  # TODO: make a parameter
            generation_config=config,
            framework="pt",
        )

        return pipe

    if __name__ == "__main__":
        pipe = load_adapted_hf_generation_pipeline(
            base_model_name="decapoda-research/llama-7b-hf",
            lora_model_name="tloen/alpaca-lora-7b",
        )
        prompt = ALPACA_TEMPLATE.format(
            instruction="Paraphrase the sentence.",
            input="The quick brown fox jumped over the lazy dog.",
        )
        print(pipe(prompt))
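
As a closing illustration (not part of the gist), here is a minimal usage sketch, assuming the file above is saved as prompt_alpaca_lora.py on the import path and the referenced Hub checkpoints are reachable. Sampling parameters passed at call time override the GenerationConfig set when the pipeline was built.

    # Minimal usage sketch (assumptions: prompt_alpaca_lora.py is importable and
    # the decapoda-research / tloen checkpoints can be downloaded).
    from prompt_alpaca_lora import ALPACA_TEMPLATE, load_adapted_hf_generation_pipeline

    pipe = load_adapted_hf_generation_pipeline(
        base_model_name="decapoda-research/llama-7b-hf",
        lora_model_name="tloen/alpaca-lora-7b",
        device="cuda",  # or "mps" / "cpu", matching the branches in the loader
        load_in_8bit=True,
    )

    prompts = [
        ALPACA_TEMPLATE.format(
            instruction="Paraphrase the sentence.",
            input="The quick brown fox jumped over the lazy dog.",
        ),
        ALPACA_TEMPLATE.format(
            instruction="Summarize the sentence.",
            input="The quick brown fox jumped over the lazy dog.",
        ),
    ]

    # A text-generation pipeline returns, per prompt, a list of dicts with "generated_text";
    # per-call sampling arguments override the construction-time GenerationConfig.
    for outputs in pipe(prompts, max_new_tokens=64, do_sample=True, temperature=0.7):
        print(outputs[0]["generated_text"])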