teamdandelion · February 14, 2025 19:38
diff --git a/example.py b/example.py
 # Structured answer requested from the model; passed as `response_model` to llm.call below.
 # NOTE(review): documented with `#` comments only -- presumably a pydantic model whose
 # docstring and field order feed the schema shown to the model; confirm before changing.
 class PuzzleSolution(BaseModel):
  weekly: int  # widgets produced in one week
  monthly_min: int  # fewest widgets produced in a month
  monthly_max: int  # most widgets produced in a month

 # Prompt text returned by the prompt functions in test_behavior. It asks for three
 # numbers: the weekly total and the monthly minimum and maximum.
 # inspect.cleandoc removes the indentation shared by the continuation lines and
 # drops the blank trailing line, so the literal can be indented with the code.
 puzzle = inspect.cleandoc("""A factory produces 5 widgets every 
  weekday, and 3  widgets per day on weekends, and one extra widget on Mondays. 
  How many widgets are produced in a week?

  Each month has four weeks, and one holiday (which may be any day of the week).
  The factory is closed on holidays.
  What is the minimum and maximum number of widgets produced in a month?
  """)

 def test_behavior(provider="openai", model="gpt-4o-mini"):
  """Run the puzzle two ways for one provider/model pair and print both results.

  The first way requests a PuzzleSolution directly from the puzzle text. The
  second first obtains a free-form answer and then requests a PuzzleSolution
  extracted from that answer. Each result is printed together with the number
  of fields that differ from the known answers.

  provider, model: forwarded unchanged to every llm.call decorator.
  Returns None.
  """

  # No docstrings on the decorated functions below: only their return value
  # is meant to be the prompt.
  @llm.call(provider=provider, model=model, response_model=PuzzleSolution)
  def puzzle_one_step():
    return puzzle

  @llm.call(provider=provider, model=model, response_model=PuzzleSolution)
  def puzzle_two_step():
    # Inner call has no response_model, so the model answers in free text first.
    @llm.call(provider=provider, model=model)
    def solve_puzzle():
      return puzzle

    free_text_answer = solve_puzzle()
    return f"Extract the solution: {free_text_answer}"

  def count_errors(solution):
    # Count the fields whose value differs from the worked-out answer.
    expected = (
      ("weekly", 32),  # 5 * 5 + 3 * 2 + 1
      ("monthly_min", 122),  # 32 * 4 - 6 (Monday Holiday)
      ("monthly_max", 125),  # 32 * 4 - 3 (Weekend Holiday)
    )
    return sum(1 for field, want in expected if getattr(solution, field) != want)

  print(f"Testing provider {provider}, model {model}:")
  # Both calls complete (one-step first) before either result is printed.
  direct = puzzle_one_step()
  extracted = puzzle_two_step()
  print(f"    Immediate tool use: {direct} ({count_errors(direct)} mistakes)")
  print(f"    Extract from text : {extracted} ({count_errors(extracted)} mistakes)")

 # Run the comparison once per provider/model pair, in this order.
 for provider_name, model_name in (
  ("openai", "gpt-4o-mini"),
  ("openai", "gpt-4o"),
  ("anthropic", "claude-3-5-haiku-latest"),
  ("anthropic", "claude-3-5-sonnet-latest"),
 ):
  test_behavior(provider=provider_name, model=model_name)
diff --git a/output.txt b/output.txt
 Testing provider openai, model gpt-4o-mini:
    Immediate tool use: weekly=33 monthly_min=132 monthly_max=144 (3 mistakes)
    Extract from text : weekly=32 monthly_min=123 monthly_max=125 (1 mistakes)
 Testing provider openai, model gpt-4o:
    Immediate tool use: weekly=38 monthly_min=147 monthly_max=152 (3 mistakes)
    Extract from text : weekly=32 monthly_min=122 monthly_max=125 (0 mistakes)
 Testing provider anthropic, model claude-3-5-haiku-latest:
    Immediate tool use: weekly=26 monthly_min=104 monthly_max=104 (3 mistakes)
    Extract from text : weekly=32 monthly_min=122 monthly_max=128 (1 mistakes)
 Testing provider anthropic, model claude-3-5-sonnet-latest:
    Immediate tool use: weekly=31 monthly_min=119 monthly_max=121 (3 mistakes)
    Extract from text : weekly=32 monthly_min=122 monthly_max=125 (0 mistakes)
	# Structured answer requested from the model; passed as `response_model` to llm.call below.
	# NOTE(review): documented with `#` comments only -- presumably a pydantic model whose
	# docstring and field order feed the schema shown to the model; confirm before changing.
	class PuzzleSolution(BaseModel):
	  weekly: int  # widgets produced in one week
	  monthly_min: int  # fewest widgets produced in a month
	  monthly_max: int  # most widgets produced in a month

	# Prompt text returned by the prompt functions in test_behavior. It asks for three
	# numbers: the weekly total and the monthly minimum and maximum.
	# inspect.cleandoc removes the leading whitespace shared by the continuation lines
	# and drops the blank trailing line.
	puzzle = inspect.cleandoc("""A factory produces 5 widgets every
	weekday, and 3 widgets per day on weekends, and one extra widget on Mondays.
	How many widgets are produced in a week?

	Each month has four weeks, and one holiday (which may be any day of the week).
	The factory is closed on holidays.
	What is the minimum and maximum number of widgets produced in a month?
	""")

	def test_behavior(provider="openai", model="gpt-4o-mini"):
	  # Run the puzzle two ways for one provider/model pair and print both results:
	  #   1. request a PuzzleSolution directly from the puzzle text;
	  #   2. obtain a free-form answer first, then request a PuzzleSolution
	  #      extracted from that answer.
	  # provider, model: forwarded unchanged to every llm.call decorator.
	  # Returns None; each result is printed with its mistake count.
	  @llm.call(provider=provider, model=model, response_model=PuzzleSolution)
	  def puzzle_one_step(): return puzzle

	  @llm.call(provider=provider, model=model, response_model=PuzzleSolution)
	  def puzzle_two_step():
	    # Inner call has no response_model, so the model answers in free text first.
	    @llm.call(provider=provider, model=model)
	    def solve_puzzle(): return puzzle
	    output = solve_puzzle()
	    return f"Extract the solution: {output}"

	  def count_errors(solution):
	    # Count the fields whose value differs from the worked-out answer.
	    m = 0
	    if solution.weekly != 32: m+=1 # 5 * 5 + 3 * 2 + 1
	    if solution.monthly_min != 122: m+=1 # 32 * 4 - 6 (Monday Holiday)
	    if solution.monthly_max != 125: m+=1 # 32 * 4 - 3 (Weekend Holiday)
	    return m

	  print(f"Testing provider {provider}, model {model}:")
	  # Both calls complete (one-step first) before either result is printed.
	  a,b = puzzle_one_step(), puzzle_two_step()
	  print(f" Immediate tool use: {a} ({count_errors(a)} mistakes)")
	  print(f" Extract from text : {b} ({count_errors(b)} mistakes)")

	# Run the comparison once per provider/model pair, in this order.
	for provider_name, model_name in (
	  ("openai", "gpt-4o-mini"),
	  ("openai", "gpt-4o"),
	  ("anthropic", "claude-3-5-haiku-latest"),
	  ("anthropic", "claude-3-5-sonnet-latest"),
	):
	  test_behavior(provider=provider_name, model=model_name)
	Testing provider openai, model gpt-4o-mini:
	Immediate tool use: weekly=33 monthly_min=132 monthly_max=144 (3 mistakes)
	Extract from text : weekly=32 monthly_min=123 monthly_max=125 (1 mistakes)
	Testing provider openai, model gpt-4o:
	Immediate tool use: weekly=38 monthly_min=147 monthly_max=152 (3 mistakes)
	Extract from text : weekly=32 monthly_min=122 monthly_max=125 (0 mistakes)
	Testing provider anthropic, model claude-3-5-haiku-latest:
	Immediate tool use: weekly=26 monthly_min=104 monthly_max=104 (3 mistakes)
	Extract from text : weekly=32 monthly_min=122 monthly_max=128 (1 mistakes)
	Testing provider anthropic, model claude-3-5-sonnet-latest:
	Immediate tool use: weekly=31 monthly_min=119 monthly_max=121 (3 mistakes)
	Extract from text : weekly=32 monthly_min=122 monthly_max=125 (0 mistakes)