@grahama1970
Last active December 27, 2024 21:07
For dynamic adapter loading and inference, the Unsloth approach works fine; the equivalent Hugging Face + PEFT approach does not and produces garbled outputs. Both scripts below load the same Phi-3.5-mini-instruct base model and Touch Rugby LoRA adapter.
# Hugging Face + PEFT version: does NOT work -- outputs are garbled
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from loguru import logger
# Configuration
BASE_MODEL_NAME = "unsloth/Phi-3.5-mini-instruct"
ADAPTER_PATH = "/home/grahama/dev/vllm_lora/training_output/Phi-3.5-mini-instruct_touch-rugby-rules_adapter/final_model"
def setup_model_and_tokenizer(base_model_name: str, adapter_path: str):
    try:
        logger.info("Loading base model...")
        base_model = AutoModelForCausalLM.from_pretrained(base_model_name, load_in_4bit=True)

        logger.info("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)

        logger.info("Loading adapter and merging...")
        peft_model = PeftModel.from_pretrained(base_model, adapter_path)
        merged_model = peft_model.merge_and_unload()

        logger.info("Applying Phi-3 chat template...")
        tokenizer.chat_template = "{% for message in messages %}{{'Human: ' if message['role'] == 'user' else 'Assistant: '}}{{ message['content'] }}{% if not loop.last %}\n\n{% endif %}{% endfor %}\n\nAssistant:"

        return merged_model, tokenizer
    except Exception as e:
        logger.error(f"Error in setup: {str(e)}")
        raise


def generate_response(model, tokenizer, question: str):
    try:
        input_conversation = [{"role": "user", "content": question}]
        formatted_prompt = tokenizer.apply_chat_template(input_conversation, tokenize=False, add_generation_prompt=True)

        inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=512, padding=True)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.split("Assistant:")[-1].strip()
    except Exception as e:
        logger.error(f"Error in generation: {str(e)}")
        return "Sorry, I encountered an error while generating the response."


def main():
    try:
        logger.info("Setting up model and tokenizer...")
        model, tokenizer = setup_model_and_tokenizer(BASE_MODEL_NAME, ADAPTER_PATH)

        questions = [
            "What is a touchdown in Touch Rugby?",
            "How many players are on a Touch Rugby team?",
            "What happens after a touchdown in Touch Rugby?"
        ]

        for question in questions:
            logger.info(f"Q: {question}")
            response = generate_response(model, tokenizer, question)
            logger.info(f"A: {response}")
            logger.info("-" * 80)
    except Exception as e:
        logger.error(f"An error occurred in main: {str(e)}")


if __name__ == "__main__":
    main()
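
One plausible (unverified) culprit in the Hugging Face path is the combination of merging the LoRA into a 4-bit quantized base (merge_and_unload can introduce rounding error on quantized weights) and prompting with a hand-written Human:/Assistant: template instead of the Phi-3 template the adapter was presumably trained with. The sketch below is a hypothetical variant for comparison only, not a confirmed fix: it keeps the adapter un-merged, uses an explicit BitsAndBytesConfig, and relies on the base tokenizer's built-in chat template (assumed to be the Phi-3 one); the quantization settings (nf4, bfloat16) are assumptions.

# Hypothetical HF + PEFT variant: adapter left un-merged, built-in chat template
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

BASE_MODEL_NAME = "unsloth/Phi-3.5-mini-instruct"
ADAPTER_PATH = "/home/grahama/dev/vllm_lora/training_output/Phi-3.5-mini-instruct_touch-rugby-rules_adapter/final_model"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",           # assumed quant settings
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

# Leave the adapter attached instead of calling merge_and_unload();
# merging into quantized weights can degrade the merged model.
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
model.eval()

messages = [{"role": "user", "content": "What is a touchdown in Touch Rugby?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
    )

# Decode only the newly generated tokens
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
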
# Unsloth inference: works fine
import torch
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from loguru import logger
# Configuration
MODEL_NAME = "unsloth/Phi-3.5-mini-instruct" # Base model
ADAPTER_PATH = "/home/grahama/dev/vllm_lora/training_output/Phi-3.5-mini-instruct_touch-rugby-rules_adapter/final_model" # Adapter path
def setup_model_and_tokenizer(model_name: str, adapter_path: str):
    """
    Load the model and tokenizer and apply the adapter.

    Args:
        model_name (str): The name of the base model.
        adapter_path (str): The path to the trained LoRA adapter.

    Returns:
        Tuple: Configured model and tokenizer.
    """
    logger.info("Loading base model and tokenizer...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=2048,
        load_in_4bit=True  # Ensure compatibility with LoRA adapter training
    )

    logger.info("Loading adapter...")
    model.load_adapter(adapter_path)

    logger.info("Configuring tokenizer with Phi-3 chat template...")
    tokenizer = get_chat_template(
        tokenizer,
        chat_template="phi-3",
        mapping={
            "role": "from",
            "content": "value",
            "user": "human",
            "assistant": "gpt"
        }
    )

    logger.info("Preparing model for inference...")
    FastLanguageModel.for_inference(model)  # Prepare the model for Unsloth inference

    return model, tokenizer


def generate_response(model, tokenizer, question: str):
    """
    Generate a response for a given question.

    Args:
        model: The configured model.
        tokenizer: The configured tokenizer.
        question (str): The input question.

    Returns:
        str: Generated response.
    """
    logger.info("Formatting input with chat template...")
    input_conversation = [{"role": "user", "content": question}]
    formatted_prompt = tokenizer.apply_chat_template(
        input_conversation,
        tokenize=False,
        add_generation_prompt=True
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    logger.info("Tokenizing input...")
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True
    ).to(device)

    logger.info("Generating response...")
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


def main():
    logger.info("Setting up model and tokenizer...")
    model, tokenizer = setup_model_and_tokenizer(MODEL_NAME, ADAPTER_PATH)

    questions = [
        "What is a touchdown in Touch Rugby?",
        "How many players are on a Touch Rugby team?",
        "What happens after a touchdown in Touch Rugby?"
    ]

    for question in questions:
        logger.info(f"Q: {question}")
        response = generate_response(model, tokenizer, question)
        logger.info(f"A: {response}")
        logger.info("-" * 80)


if __name__ == "__main__":
    main()
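
Since the description mentions dynamic adapter loading, here is a minimal sketch of how the working Unsloth path could be extended to swap between several adapters per request without reloading the base model. The adapter names and the second adapter path are hypothetical, and it assumes the Unsloth-patched model keeps the standard transformers/PEFT load_adapter and set_adapter methods (the script above already uses load_adapter); this is a sketch under those assumptions, not a verified setup.

# Hypothetical sketch: serving multiple LoRA adapters on one base model by name
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Phi-3.5-mini-instruct",
    max_seq_length=2048,
    load_in_4bit=True,
)

# Register each adapter once under its own name
model.load_adapter(ADAPTER_PATH, adapter_name="touch_rugby")
model.load_adapter("/path/to/another_adapter", adapter_name="other_domain")  # hypothetical path

FastLanguageModel.for_inference(model)

# Switch the active adapter per request, then generate as in generate_response() above
model.set_adapter("touch_rugby")
# ... generate_response(model, tokenizer, question) ...
model.set_adapter("other_domain")
# ... generate_response(model, tokenizer, question) ...
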