swayson · November 11, 2025 03:33
diff --git a/dspy_gepa_openrouter.py b/dspy_gepa_openrouter.py
 import dspy
 from typing import List
 import os
 from dotenv import load_dotenv


 load_dotenv()


 class QuestionAnswerer(dspy.Signature):
    """Answer questions with detailed, accurate responses."""
    question = dspy.InputField()
    answer = dspy.OutputField(desc="detailed answer to the question")


 class JudgeQuality(dspy.Signature):
    """Evaluate the quality of an answer on a scale of 1-10."""
    question = dspy.InputField()
    answer = dspy.InputField()
    score = dspy.OutputField(desc="score from 1-10")
    reasoning = dspy.OutputField(desc="brief explanation of the score")


 class QAModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.ChainOfThought(QuestionAnswerer)

    def forward(self, question):
        result = self.generate_answer(question=question)
        return dspy.Prediction(answer=result.answer)


 class LLMJudge(dspy.Module):
    """LLM-based judge for evaluating answer quality."""
    def __init__(self):
        super().__init__()
        self.judge = dspy.ChainOfThought(JudgeQuality)

    def forward(self, question, answer):
        result = self.judge(question=question, answer=answer)
        try:
            score = float(result.score)
            if score < 1:
                score = 1
            elif score > 10:
                score = 10
        except (ValueError, TypeError):
            score = 5.0

        return score, result.reasoning


 def create_training_data():
    """Create a small dataset for optimization."""
    return [
        dspy.Example(
            question="What is the capital of France?",
            answer="Paris"
        ).with_inputs("question"),
        dspy.Example(
            question="What causes rain?",
            answer="Rain is caused by water vapor in the atmosphere condensing into droplets that become heavy enough to fall."
        ).with_inputs("question"),
        dspy.Example(
            question="Who wrote Romeo and Juliet?",
            answer="William Shakespeare"
        ).with_inputs("question"),
        dspy.Example(
            question="What is photosynthesis?",
            answer="Photosynthesis is the process by which plants use sunlight, water, and carbon dioxide to produce oxygen and energy in the form of sugar."
        ).with_inputs("question"),
        dspy.Example(
            question="How many continents are there?",
            answer="There are seven continents: Africa, Antarctica, Asia, Europe, North America, Oceania, and South America."
        ).with_inputs("question"),
    ]


 def judge_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    """Metric that uses LLM judge to evaluate predictions with feedback."""
    judge = LLMJudge()
    score, reasoning = judge(question=gold.question, answer=pred.answer)

    print(f"\nQuestion: {gold.question}")
    print(f"Answer: {pred.answer}")
    print(f"Judge Score: {score}/10")
    print(f"Reasoning: {reasoning}")

    normalized_score = score / 10.0

    if hasattr(gold, 'answer'):
        feedback = f"Score: {score}/10. {reasoning}\nExpected answer: {gold.answer}"
    else:
        feedback = f"Score: {score}/10. {reasoning}"

    return dspy.Prediction(score=normalized_score, feedback=feedback)


 def main():
    """Demonstrate GEPA optimizer with LLM-as-judge."""

    dspy.configure(
        lm=dspy.LM(
            model="openrouter/qwen/qwen3-235b-a22b-2507",
            api_key=os.getenv("OPENROUTER_API_KEY"),
            api_base=os.getenv("OPENAI_API_BASE"),
        )
    )
    print("Creating training dataset...")
    trainset = create_training_data()

    print("\n" + "="*60)
    print("Testing baseline model (before optimization)")
    print("="*60)

    baseline_model = QAModule()

    test_question = "What is machine learning?"
    print(f"\nTest question: {test_question}")
    baseline_result = baseline_model(question=test_question)
    print(f"Baseline answer: {baseline_result.answer}")

    judge = LLMJudge()
    baseline_score, baseline_reasoning = judge(
        question=test_question,
        answer=baseline_result.answer
    )
    print(f"Judge score: {baseline_score}/10")
    print(f"Reasoning: {baseline_reasoning}")

    print("\n" + "="*60)
    print("Running GEPA optimization with LLM judge")
    print("="*60)

    reflection_lm = dspy.LM(
        model="openrouter/moonshotai/kimi-linear-48b-a3b-instruct",
        api_key=os.getenv("OPENROUTER_API_KEY"),
        api_base=os.getenv("OPENAI_API_BASE"),
        temperature=1.0,
        max_tokens=10000
    )

    optimizer = dspy.GEPA(
        metric=judge_metric,
        reflection_lm=reflection_lm,
        auto="medium",
        num_threads=1,
        track_stats=True,
        log_dir="./gepa_logs",
        seed=42
    )

    print("\nOptimizing model...")
    optimized_model = optimizer.compile(
        student=QAModule(),
        trainset=trainset
    )

    print("\n" + "="*60)
    print("Testing optimized model")
    print("="*60)

    print(f"\nTest question: {test_question}")
    optimized_result = optimized_model(question=test_question)
    print(f"Optimized answer: {optimized_result.answer}")

    optimized_score, optimized_reasoning = judge(
        question=test_question,
        answer=optimized_result.answer
    )
    print(f"Judge score: {optimized_score}/10")
    print(f"Reasoning: {optimized_reasoning}")

    print("\n" + "="*60)
    print("Comparison Summary")
    print("="*60)
    print(f"Baseline score: {baseline_score}/10")
    print(f"Optimized score: {optimized_score}/10")
    print(f"Improvement: {optimized_score - baseline_score:+.1f} points")


 if __name__ == "__main__":
    main()
	import dspy
	from typing import List
	import os
	from dotenv import load_dotenv


	load_dotenv()


	class QuestionAnswerer(dspy.Signature):
	"""Answer questions with detailed, accurate responses."""
	question = dspy.InputField()
	answer = dspy.OutputField(desc="detailed answer to the question")


	class JudgeQuality(dspy.Signature):
	"""Evaluate the quality of an answer on a scale of 1-10."""
	question = dspy.InputField()
	answer = dspy.InputField()
	score = dspy.OutputField(desc="score from 1-10")
	reasoning = dspy.OutputField(desc="brief explanation of the score")


	class QAModule(dspy.Module):
	def __init__(self):
	super().__init__()
	self.generate_answer = dspy.ChainOfThought(QuestionAnswerer)

	def forward(self, question):
	result = self.generate_answer(question=question)
	return dspy.Prediction(answer=result.answer)


	class LLMJudge(dspy.Module):
	"""LLM-based judge for evaluating answer quality."""
	def __init__(self):
	super().__init__()
	self.judge = dspy.ChainOfThought(JudgeQuality)

	def forward(self, question, answer):
	result = self.judge(question=question, answer=answer)
	try:
	score = float(result.score)
	if score < 1:
	score = 1
	elif score > 10:
	score = 10
	except (ValueError, TypeError):
	score = 5.0

	return score, result.reasoning


	def create_training_data():
	"""Create a small dataset for optimization."""
	return [
	dspy.Example(
	question="What is the capital of France?",
	answer="Paris"
	).with_inputs("question"),
	dspy.Example(
	question="What causes rain?",
	answer="Rain is caused by water vapor in the atmosphere condensing into droplets that become heavy enough to fall."
	).with_inputs("question"),
	dspy.Example(
	question="Who wrote Romeo and Juliet?",
	answer="William Shakespeare"
	).with_inputs("question"),
	dspy.Example(
	question="What is photosynthesis?",
	answer="Photosynthesis is the process by which plants use sunlight, water, and carbon dioxide to produce oxygen and energy in the form of sugar."
	).with_inputs("question"),
	dspy.Example(
	question="How many continents are there?",
	answer="There are seven continents: Africa, Antarctica, Asia, Europe, North America, Oceania, and South America."
	).with_inputs("question"),
	]


	def judge_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
	"""Metric that uses LLM judge to evaluate predictions with feedback."""
	judge = LLMJudge()
	score, reasoning = judge(question=gold.question, answer=pred.answer)

	print(f"\nQuestion: {gold.question}")
	print(f"Answer: {pred.answer}")
	print(f"Judge Score: {score}/10")
	print(f"Reasoning: {reasoning}")

	normalized_score = score / 10.0

	if hasattr(gold, 'answer'):
	feedback = f"Score: {score}/10. {reasoning}\nExpected answer: {gold.answer}"
	else:
	feedback = f"Score: {score}/10. {reasoning}"

	return dspy.Prediction(score=normalized_score, feedback=feedback)


	def main():
	"""Demonstrate GEPA optimizer with LLM-as-judge."""

	dspy.configure(
	lm=dspy.LM(
	model="openrouter/qwen/qwen3-235b-a22b-2507",
	api_key=os.getenv("OPENROUTER_API_KEY"),
	api_base=os.getenv("OPENAI_API_BASE"),
	)
	)
	print("Creating training dataset...")
	trainset = create_training_data()

	print("\n" + "="*60)
	print("Testing baseline model (before optimization)")
	print("="*60)

	baseline_model = QAModule()

	test_question = "What is machine learning?"
	print(f"\nTest question: {test_question}")
	baseline_result = baseline_model(question=test_question)
	print(f"Baseline answer: {baseline_result.answer}")

	judge = LLMJudge()
	baseline_score, baseline_reasoning = judge(
	question=test_question,
	answer=baseline_result.answer
	)
	print(f"Judge score: {baseline_score}/10")
	print(f"Reasoning: {baseline_reasoning}")

	print("\n" + "="*60)
	print("Running GEPA optimization with LLM judge")
	print("="*60)

	reflection_lm = dspy.LM(
	model="openrouter/moonshotai/kimi-linear-48b-a3b-instruct",
	api_key=os.getenv("OPENROUTER_API_KEY"),
	api_base=os.getenv("OPENAI_API_BASE"),
	temperature=1.0,
	max_tokens=10000
	)

	optimizer = dspy.GEPA(
	metric=judge_metric,
	reflection_lm=reflection_lm,
	auto="medium",
	num_threads=1,
	track_stats=True,
	log_dir="./gepa_logs",
	seed=42
	)

	print("\nOptimizing model...")
	optimized_model = optimizer.compile(
	student=QAModule(),
	trainset=trainset
	)

	print("\n" + "="*60)
	print("Testing optimized model")
	print("="*60)

	print(f"\nTest question: {test_question}")
	optimized_result = optimized_model(question=test_question)
	print(f"Optimized answer: {optimized_result.answer}")

	optimized_score, optimized_reasoning = judge(
	question=test_question,
	answer=optimized_result.answer
	)
	print(f"Judge score: {optimized_score}/10")
	print(f"Reasoning: {optimized_reasoning}")

	print("\n" + "="*60)
	print("Comparison Summary")
	print("="*60)
	print(f"Baseline score: {baseline_score}/10")
	print(f"Optimized score: {optimized_score}/10")
	print(f"Improvement: {optimized_score - baseline_score:+.1f} points")


	if __name__ == "__main__":
	main()
No results found