For dynamic adapter loading and inference, Unsloth inference works fine, but the equivalent Hugging Face path does not: its outputs are garbled.
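For reference, on the Hugging Face/PEFT side "dynamic" adapter use is normally done by attaching adapters under names and switching between them, rather than merging. This is a minimal sketch only, not the code used here; SECOND_ADAPTER_PATH is a hypothetical placeholder to illustrate swapping.

# Hedged sketch: dynamic adapter switching with PEFT (no merge).
# SECOND_ADAPTER_PATH is hypothetical and only illustrates per-request swapping.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL_NAME = "unsloth/Phi-3.5-mini-instruct"
ADAPTER_PATH = "/home/grahama/dev/vllm_lora/training_output/Phi-3.5-mini-instruct_touch-rugby-rules_adapter/final_model"
SECOND_ADAPTER_PATH = "/path/to/another_adapter"  # hypothetical

base = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

# Attach the first adapter under a name, then register a second one.
model = PeftModel.from_pretrained(base, ADAPTER_PATH, adapter_name="touch_rugby")
model.load_adapter(SECOND_ADAPTER_PATH, adapter_name="other_task")

# Activate whichever adapter a given request needs.
model.set_adapter("touch_rugby")
# model.set_adapter("other_task")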
# Doesn't work: outputs are garbled.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from loguru import logger

# Configuration
BASE_MODEL_NAME = "unsloth/Phi-3.5-mini-instruct"
ADAPTER_PATH = "/home/grahama/dev/vllm_lora/training_output/Phi-3.5-mini-instruct_touch-rugby-rules_adapter/final_model"


def setup_model_and_tokenizer(base_model_name: str, adapter_path: str):
    try:
        logger.info("Loading base model...")
        base_model = AutoModelForCausalLM.from_pretrained(base_model_name, load_in_4bit=True)

        logger.info("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)

        logger.info("Loading adapter and merging...")
        peft_model = PeftModel.from_pretrained(base_model, adapter_path)
        merged_model = peft_model.merge_and_unload()

        logger.info("Applying Phi-3 chat template...")
        # Note: this is a custom "Human:/Assistant:" template, not the model's
        # native Phi-3 <|user|>/<|assistant|> format.
        tokenizer.chat_template = "{% for message in messages %}{{'Human: ' if message['role'] == 'user' else 'Assistant: '}}{{ message['content'] }}{% if not loop.last %}\n\n{% endif %}{% endfor %}\n\nAssistant:"

        return merged_model, tokenizer
    except Exception as e:
        logger.error(f"Error in setup: {str(e)}")
        raise


def generate_response(model, tokenizer, question: str):
    try:
        input_conversation = [{"role": "user", "content": question}]
        formatted_prompt = tokenizer.apply_chat_template(input_conversation, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=512, padding=True)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.split("Assistant:")[-1].strip()
    except Exception as e:
        logger.error(f"Error in generation: {str(e)}")
        return "Sorry, I encountered an error while generating the response."


def main():
    try:
        logger.info("Setting up model and tokenizer...")
        model, tokenizer = setup_model_and_tokenizer(BASE_MODEL_NAME, ADAPTER_PATH)

        questions = [
            "What is a touchdown in Touch Rugby?",
            "How many players are on a Touch Rugby team?",
            "What happens after a touchdown in Touch Rugby?"
        ]

        for question in questions:
            logger.info(f"Q: {question}")
            response = generate_response(model, tokenizer, question)
            logger.info(f"A: {response}")
            logger.info("-" * 80)
    except Exception as e:
        logger.error(f"An error occurred in main: {str(e)}")


if __name__ == "__main__":
    main()
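One untested guess at the garbling: the custom "Human:/Assistant:" template above does not match the Phi-3 chat format used on the Unsloth side, and calling merge_and_unload() on a base loaded in 4-bit merges the LoRA into quantized weights. A minimal sketch of an HF-only variant for comparison, assuming the adapter is in standard PEFT format; it keeps the tokenizer's stock chat template and skips the merge (not verified here to fix the issue):

# Hedged variant (untested): stock Phi-3 chat template, un-merged PeftModel.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL_NAME = "unsloth/Phi-3.5-mini-instruct"
ADAPTER_PATH = "/home/grahama/dev/vllm_lora/training_output/Phi-3.5-mini-instruct_touch-rugby-rules_adapter/final_model"

base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME, torch_dtype=torch.float16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)  # keep its built-in chat_template

model = PeftModel.from_pretrained(base, ADAPTER_PATH)  # adapter stays un-merged

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is a touchdown in Touch Rugby?"}],
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=64, do_sample=False)
# Decode only the newly generated tokens.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))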
# Unsloth inference works fine.
import torch
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from loguru import logger

# Configuration
MODEL_NAME = "unsloth/Phi-3.5-mini-instruct"  # Base model
ADAPTER_PATH = "/home/grahama/dev/vllm_lora/training_output/Phi-3.5-mini-instruct_touch-rugby-rules_adapter/final_model"  # Adapter path


def setup_model_and_tokenizer(model_name: str, adapter_path: str):
    """
    Load the base model and tokenizer and apply the LoRA adapter.

    Args:
        model_name (str): The name of the base model.
        adapter_path (str): The path to the trained LoRA adapter.

    Returns:
        Tuple: Configured model and tokenizer.
    """
    logger.info("Loading base model and tokenizer...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=2048,
        load_in_4bit=True  # Ensure compatibility with LoRA adapter training
    )

    logger.info("Loading adapter...")
    model.load_adapter(adapter_path)

    logger.info("Configuring tokenizer with Phi-3 chat template...")
    tokenizer = get_chat_template(
        tokenizer,
        chat_template="phi-3",
        mapping={
            "role": "from",
            "content": "value",
            "user": "human",
            "assistant": "gpt"
        }
    )

    logger.info("Preparing model for inference...")
    FastLanguageModel.for_inference(model)  # Prepare the model for Unsloth inference
    return model, tokenizer


def generate_response(model, tokenizer, question: str):
    """
    Generate a response for a given question.

    Args:
        model: The configured model.
        tokenizer: The configured tokenizer.
        question (str): The input question.

    Returns:
        str: Generated response.
    """
    logger.info("Formatting input with chat template...")
    input_conversation = [{"role": "user", "content": question}]
    formatted_prompt = tokenizer.apply_chat_template(
        input_conversation,
        tokenize=False,
        add_generation_prompt=True
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    logger.info("Tokenizing input...")
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True
    ).to(device)

    logger.info("Generating response...")
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


def main():
    logger.info("Setting up model and tokenizer...")
    model, tokenizer = setup_model_and_tokenizer(MODEL_NAME, ADAPTER_PATH)

    questions = [
        "What is a touchdown in Touch Rugby?",
        "How many players are on a Touch Rugby team?",
        "What happens after a touchdown in Touch Rugby?"
    ]

    for question in questions:
        logger.info(f"Q: {question}")
        response = generate_response(model, tokenizer, question)
        logger.info(f"A: {response}")
        logger.info("-" * 80)


if __name__ == "__main__":
    main()
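As an optional follow-up to the working Unsloth path, transformers' TextStreamer can be passed to generate() to print tokens as they are produced. A small sketch, assuming `model` and `tokenizer` come from setup_model_and_tokenizer() above:

# Optional: stream tokens from the working Unsloth setup as they are generated.
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True)

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is a touchdown in Touch Rugby?"}],
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
model.generate(**inputs, streamer=streamer, max_new_tokens=64)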