# Source: GitHub Gist swayson/0b2cfa9d36a4d2c9ac3d2ccc33c71229
# Author: @swayson
# Created: November 11, 2025 03:33
import math
import os
import re
from typing import List

import dspy
from dotenv import load_dotenv
load_dotenv()
class QuestionAnswerer(dspy.Signature):
    """Answer questions with detailed, accurate responses."""

    # NOTE: the docstring above is kept verbatim. DSPy is documented to use a
    # signature's docstring as the task instructions, so it is prompt text
    # rather than ordinary documentation.

    # Input: the question to answer.
    question = dspy.InputField()

    # Output: the generated answer, described to the model via `desc`.
    answer = dspy.OutputField(desc="detailed answer to the question")
class JudgeQuality(dspy.Signature):
    """Evaluate the quality of an answer on a scale of 1-10."""

    # NOTE: the docstring above is kept verbatim. DSPy is documented to use a
    # signature's docstring as the task instructions, so it is prompt text
    # rather than ordinary documentation.

    # Inputs: the question that was asked and the answer being graded.
    question = dspy.InputField()
    answer = dspy.InputField()

    # Outputs: the grade and a short justification. Neither field is typed
    # here, so callers convert `score` to a number themselves.
    score = dspy.OutputField(desc="score from 1-10")
    reasoning = dspy.OutputField(desc="brief explanation of the score")
class QAModule(dspy.Module):
    """Question-answering program: one chain-of-thought step over QuestionAnswerer."""

    def __init__(self):
        super().__init__()
        # The only sub-module; wraps the QuestionAnswerer signature.
        self.generate_answer = dspy.ChainOfThought(QuestionAnswerer)

    def forward(self, question):
        """Answer `question` and return a Prediction carrying only `answer`."""
        # Any other fields on the intermediate result are not forwarded.
        return dspy.Prediction(
            answer=self.generate_answer(question=question).answer
        )
# An explicit "<n>/10" or "<n> out of 10" rating takes priority when present.
_SCORE_OUT_OF_TEN_RE = re.compile(
    r"(\d+(?:\.\d+)?)\s*(?:/|out of)\s*10\b", re.IGNORECASE
)
# Otherwise the first plain (unsigned) number in the text is used.
_FIRST_NUMBER_RE = re.compile(r"(\d+(?:\.\d+)?)")


def _parse_judge_score(raw, default=5.0, low=1.0, high=10.0):
    """Turn the judge's raw `score` output into a float within [low, high].

    Args:
        raw: Value produced for the `score` field. It may be a clean number
            ("8", 8, 7.5), free-form text containing one ("8/10",
            "Score: 7"), or something unusable (None, "excellent").
        default: Value returned when no number can be recovered or the
            value is NaN.
        low: Lower clamp bound.
        high: Upper clamp bound.

    Returns:
        A float. Out-of-range values are clamped rather than rejected.
    """
    try:
        score = float(raw)
    except (TypeError, ValueError, OverflowError):
        score = None
        if isinstance(raw, str):
            # Recover the number from decorated replies instead of
            # discarding an otherwise valid grade.
            found = _SCORE_OUT_OF_TEN_RE.search(raw) or _FIRST_NUMBER_RE.search(raw)
            if found:
                score = float(found.group(1))

    # NaN compares false against both bounds and would slip through a
    # clamp, so treat it like an unparseable value.
    if score is None or math.isnan(score):
        return default
    return min(max(score, low), high)


class LLMJudge(dspy.Module):
    """LLM-based judge for evaluating answer quality."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought grader over the JudgeQuality signature.
        self.judge = dspy.ChainOfThought(JudgeQuality)

    def forward(self, question, answer):
        """Grade `answer` to `question`.

        Returns:
            A `(score, reasoning)` tuple. `score` is always a float in
            [1.0, 10.0]; it is 5.0 when the judge's score output contains
            no usable number. `reasoning` is the judge's explanation,
            passed through unchanged.
        """
        result = self.judge(question=question, answer=answer)
        return _parse_judge_score(result.score), result.reasoning
def create_training_data():
    """Create a small dataset for optimization.

    Returns a list of five `dspy.Example` objects, each holding a `question`
    and a reference `answer`, with `question` marked as the only input field.
    """
    qa_pairs = (
        (
            "What is the capital of France?",
            "Paris",
        ),
        (
            "What causes rain?",
            "Rain is caused by water vapor in the atmosphere condensing into droplets that become heavy enough to fall.",
        ),
        (
            "Who wrote Romeo and Juliet?",
            "William Shakespeare",
        ),
        (
            "What is photosynthesis?",
            "Photosynthesis is the process by which plants use sunlight, water, and carbon dioxide to produce oxygen and energy in the form of sugar.",
        ),
        (
            "How many continents are there?",
            "There are seven continents: Africa, Antarctica, Asia, Europe, North America, Oceania, and South America.",
        ),
    )
    return [
        dspy.Example(question=prompt, answer=reference).with_inputs("question")
        for prompt, reference in qa_pairs
    ]
def judge_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    """Metric that uses LLM judge to evaluate predictions with feedback.

    Args:
        gold: Reference example; `gold.question` is read, and `gold.answer`
            is used in the feedback when present.
        pred: Prediction under evaluation; `pred.answer` is graded.
        trace, pred_name, pred_trace: Accepted but not used here.

    Returns:
        A `dspy.Prediction` with `score` (the judge's score divided by 10)
        and `feedback` (score, judge reasoning and, when available, the
        expected answer).
    """
    # A fresh judge is built on every call, as before.
    score, reasoning = LLMJudge()(question=gold.question, answer=pred.answer)

    # Echo the graded pair so each judged example is visible on the console.
    report = (
        f"\nQuestion: {gold.question}",
        f"Answer: {pred.answer}",
        f"Judge Score: {score}/10",
        f"Reasoning: {reasoning}",
    )
    for line in report:
        print(line)

    feedback = f"Score: {score}/10. {reasoning}"
    if hasattr(gold, 'answer'):
        feedback += f"\nExpected answer: {gold.answer}"

    return dspy.Prediction(score=score / 10.0, feedback=feedback)
def main():
    """Demonstrate GEPA optimizer with LLM-as-judge.

    Configures the default LM, answers and grades one test question with an
    unoptimized `QAModule`, compiles a fresh `QAModule` with `dspy.GEPA`
    using `judge_metric` on the training set, then answers and grades the
    same question again and prints both scores and their difference.
    """
    rule = "=" * 60

    def banner(title):
        # Section header: blank line, rule, title, rule.
        print("\n" + rule)
        print(title)
        print(rule)

    # Both LMs use the same key and base URL, read from the environment.
    endpoint = {
        "api_key": os.getenv("OPENROUTER_API_KEY"),
        "api_base": os.getenv("OPENAI_API_BASE"),
    }
    task_lm = dspy.LM(model="openrouter/qwen/qwen3-235b-a22b-2507", **endpoint)
    dspy.configure(lm=task_lm)

    print("Creating training dataset...")
    trainset = create_training_data()

    # --- Baseline -----------------------------------------------------
    banner("Testing baseline model (before optimization)")
    test_question = "What is machine learning?"
    judge = LLMJudge()

    print(f"\nTest question: {test_question}")
    baseline_result = QAModule()(question=test_question)
    print(f"Baseline answer: {baseline_result.answer}")
    baseline_score, baseline_reasoning = judge(
        question=test_question, answer=baseline_result.answer
    )
    print(f"Judge score: {baseline_score}/10")
    print(f"Reasoning: {baseline_reasoning}")

    # --- Optimization -------------------------------------------------
    banner("Running GEPA optimization with LLM judge")
    # A separate LM is handed to GEPA as its reflection model.
    reflection_lm = dspy.LM(
        model="openrouter/moonshotai/kimi-linear-48b-a3b-instruct",
        temperature=1.0,
        max_tokens=10000,
        **endpoint,
    )
    gepa_settings = dict(
        metric=judge_metric,
        reflection_lm=reflection_lm,
        auto="medium",
        num_threads=1,
        track_stats=True,
        log_dir="./gepa_logs",
        seed=42,
    )
    optimizer = dspy.GEPA(**gepa_settings)

    print("\nOptimizing model...")
    optimized_model = optimizer.compile(student=QAModule(), trainset=trainset)

    # --- Optimized ----------------------------------------------------
    banner("Testing optimized model")
    print(f"\nTest question: {test_question}")
    optimized_result = optimized_model(question=test_question)
    print(f"Optimized answer: {optimized_result.answer}")
    optimized_score, optimized_reasoning = judge(
        question=test_question, answer=optimized_result.answer
    )
    print(f"Judge score: {optimized_score}/10")
    print(f"Reasoning: {optimized_reasoning}")

    # --- Summary ------------------------------------------------------
    banner("Comparison Summary")
    print(f"Baseline score: {baseline_score}/10")
    print(f"Optimized score: {optimized_score}/10")
    delta = optimized_score - baseline_score
    print(f"Improvement: {delta:+.1f} points")
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# End of gist (GitHub page footer, not part of the script).