import dspy
from typing import List
import os
from dotenv import load_dotenv

load_dotenv()
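

# DSPy signatures: declarative input/output field specs that DSPy compiles into prompts.
# QuestionAnswerer drives the answering module; JudgeQuality drives the LLM judge.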
class QuestionAnswerer(dspy.Signature):
    """Answer questions with detailed, accurate responses."""
    question = dspy.InputField()
    answer = dspy.OutputField(desc="detailed answer to the question")


class JudgeQuality(dspy.Signature):
    """Evaluate the quality of an answer on a scale of 1-10."""
    question = dspy.InputField()
    answer = dspy.InputField()
    score = dspy.OutputField(desc="score from 1-10")
    reasoning = dspy.OutputField(desc="brief explanation of the score")
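

# The module under optimization: a single ChainOfThought predictor over QuestionAnswerer.
# GEPA rewrites this predictor's instructions during compile().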
class QAModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.ChainOfThought(QuestionAnswerer)

    def forward(self, question):
        result = self.generate_answer(question=question)
        return dspy.Prediction(answer=result.answer)


class LLMJudge(dspy.Module):
    """LLM-based judge for evaluating answer quality."""

    def __init__(self):
        super().__init__()
        self.judge = dspy.ChainOfThought(JudgeQuality)

    def forward(self, question, answer):
        result = self.judge(question=question, answer=answer)
        try:
            score = float(result.score)
            if score < 1:
                score = 1
            elif score > 10:
                score = 10
        except (ValueError, TypeError):
            score = 5.0
        return score, result.reasoning
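

# Each example marks "question" as the input field via .with_inputs(); the reference
# answer stays available to the metric as gold.answer.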
def create_training_data():
    """Create a small dataset for optimization."""
    return [
        dspy.Example(
            question="What is the capital of France?",
            answer="Paris"
        ).with_inputs("question"),
        dspy.Example(
            question="What causes rain?",
            answer="Rain is caused by water vapor in the atmosphere condensing into droplets that become heavy enough to fall."
        ).with_inputs("question"),
        dspy.Example(
            question="Who wrote Romeo and Juliet?",
            answer="William Shakespeare"
        ).with_inputs("question"),
        dspy.Example(
            question="What is photosynthesis?",
            answer="Photosynthesis is the process by which plants use sunlight, water, and carbon dioxide to produce oxygen and energy in the form of sugar."
        ).with_inputs("question"),
        dspy.Example(
            question="How many continents are there?",
            answer="There are seven continents: Africa, Antarctica, Asia, Europe, North America, Oceania, and South America."
        ).with_inputs("question"),
    ]
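

# GEPA-style feedback metric: it accepts the extra pred_name/pred_trace arguments and
# returns a dspy.Prediction carrying a score scaled into the 0-1 range plus textual
# feedback that the reflection step can use when proposing better instructions.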
def judge_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    """Metric that uses LLM judge to evaluate predictions with feedback."""
    judge = LLMJudge()
    score, reasoning = judge(question=gold.question, answer=pred.answer)
    print(f"\nQuestion: {gold.question}")
    print(f"Answer: {pred.answer}")
    print(f"Judge Score: {score}/10")
    print(f"Reasoning: {reasoning}")
    normalized_score = score / 10.0
    if hasattr(gold, 'answer'):
        feedback = f"Score: {score}/10. {reasoning}\nExpected answer: {gold.answer}"
    else:
        feedback = f"Score: {score}/10. {reasoning}"
    return dspy.Prediction(score=normalized_score, feedback=feedback)


def main():
    """Demonstrate GEPA optimizer with LLM-as-judge."""
    dspy.configure(
        lm=dspy.LM(
            model="openrouter/qwen/qwen3-235b-a22b-2507",
            api_key=os.getenv("OPENROUTER_API_KEY"),
            api_base=os.getenv("OPENAI_API_BASE"),
        )
    )
| print("Creating training dataset...") | |
| trainset = create_training_data() | |
| print("\n" + "="*60) | |
| print("Testing baseline model (before optimization)") | |
| print("="*60) | |
| baseline_model = QAModule() | |
| test_question = "What is machine learning?" | |
| print(f"\nTest question: {test_question}") | |
| baseline_result = baseline_model(question=test_question) | |
| print(f"Baseline answer: {baseline_result.answer}") | |
| judge = LLMJudge() | |
| baseline_score, baseline_reasoning = judge( | |
| question=test_question, | |
| answer=baseline_result.answer | |
| ) | |
| print(f"Judge score: {baseline_score}/10") | |
| print(f"Reasoning: {baseline_reasoning}") | |
| print("\n" + "="*60) | |
| print("Running GEPA optimization with LLM judge") | |
| print("="*60) | |
    reflection_lm = dspy.LM(
        model="openrouter/moonshotai/kimi-linear-48b-a3b-instruct",
        api_key=os.getenv("OPENROUTER_API_KEY"),
        api_base=os.getenv("OPENAI_API_BASE"),
        temperature=1.0,
        max_tokens=10000
    )
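
    # auto="medium" picks one of GEPA's preset optimization budgets; run stats
    # and candidate programs are written under ./gepa_logs.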
    optimizer = dspy.GEPA(
        metric=judge_metric,
        reflection_lm=reflection_lm,
        auto="medium",
        num_threads=1,
        track_stats=True,
        log_dir="./gepa_logs",
        seed=42
    )

    print("\nOptimizing model...")
    optimized_model = optimizer.compile(
        student=QAModule(),
        trainset=trainset
    )
| print("\n" + "="*60) | |
| print("Testing optimized model") | |
| print("="*60) | |
| print(f"\nTest question: {test_question}") | |
| optimized_result = optimized_model(question=test_question) | |
| print(f"Optimized answer: {optimized_result.answer}") | |
| optimized_score, optimized_reasoning = judge( | |
| question=test_question, | |
| answer=optimized_result.answer | |
| ) | |
| print(f"Judge score: {optimized_score}/10") | |
| print(f"Reasoning: {optimized_reasoning}") | |
| print("\n" + "="*60) | |
| print("Comparison Summary") | |
| print("="*60) | |
| print(f"Baseline score: {baseline_score}/10") | |
| print(f"Optimized score: {optimized_score}/10") | |
| print(f"Improvement: {optimized_score - baseline_score:+.1f} points") | |
| if __name__ == "__main__": | |
| main() |