@camwest
Last active August 9, 2025 23:26
LLM Evals Are Just Tests - Code Examples
```ts
it("detects when themes haven't been added", async () => {
  const response = await generateAIResponse(context)
  // `eval` is a reserved word in strict-mode JS/TS, so name the result explicitly
  const evaluation = await evaluatePendingCommandAwareness(response)
  expect(evaluation.score).toBeGreaterThan(0.85)
})
```
```yaml
- name: Download Main Branch Scorecards
  uses: dawidd6/action-download-artifact@v3
  with:
    workflow: evals2-tests.yml
    branch: main
    name: eval-scorecards
    path: scorecards-main/
```
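Once the main-branch scorecards are downloaded, the current run can be diffed against them to produce the Ξ” and trend columns shown in the PR comment. A minimal sketch of that comparison, with an assumed `Metric` shape (hypothetical, not the gist's actual types):

```typescript
// Hypothetical comparison step: given a metric's value on the current branch
// and on main, compute the delta string and trend marker for the scorecard.
// The Metric shape is an assumption, not the gist author's exact type.
interface Metric { name: string; value: number; threshold: number }

function compareMetric(current: Metric, mainValue: number) {
  const delta = current.value - mainValue
  const sign = delta > 0 ? "+" : ""
  return {
    delta: `${sign}${delta}%`,
    passed: current.value >= current.threshold,
    trend: delta > 0 ? "🟒" : delta < 0 ? "πŸ”΄" : "βšͺ",
  }
}

const row = compareMetric({ name: "Success Rate", value: 100, threshold: 85 }, 90)
console.log(row) // { delta: "+10%", passed: true, trend: "🟒" }
```

A metric can pass its absolute threshold while still regressing against main, which is why the scorecard tracks both a βœ…/❌ status and a separate trend marker.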
```ts
const results = await Promise.all(
  Array.from({ length: 10 }, async () => {
    const response = await generateAIResponse(context)
    const evaluation = await evaluatePendingCommandAwareness(response)
    return evaluation.hasGuidanceText &&
      !evaluation.hasStartResearchCommand &&
      evaluation.mentionsPendingThemes &&
      evaluation.offersNextSteps
  })
)

const successRate = results.filter(r => r).length / results.length
expect(successRate).toBeGreaterThan(0.85)
```
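The sampling pattern above generalizes to a small helper: run any async boolean check N times and return the fraction that passed. This is a sketch with hypothetical names (`successRate`, `demo`); the deterministic stub stands in for a real LLM call, which is the part that makes the check non-deterministic in practice.

```typescript
// Hypothetical helper generalizing the sampling pattern above: run an async
// boolean check `n` times concurrently and return the pass fraction.
async function successRate(
  check: () => Promise<boolean>,
  n = 10
): Promise<number> {
  const results = await Promise.all(
    Array.from({ length: n }, () => check())
  )
  return results.filter(Boolean).length / results.length
}

// Usage with a deterministic stub in place of a real LLM call:
async function demo() {
  let i = 0
  const rate = await successRate(async () => (i++ % 10) !== 0, 10)
  console.log(rate) // 9 of the 10 stubbed checks pass, so 0.9
  return rate
}
```

Thresholding the rate (rather than asserting on a single run) is what turns a flaky LLM behavior into a stable pass/fail signal.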
```ts
const { object } = await generateObject({
  model: googleAI("gemini-2.5-pro"),
  schema: z.object({
    hasGuidanceText: z.boolean(),
    mentionsPendingThemes: z.boolean(),
    // ... other criteria
  }),
  prompt: `Evaluate this response: "${response}"...`
})
```
```ts
const start = performance.now()
const result = await fn()
const duration = performance.now() - start
```
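The timing pattern above can be wrapped in a reusable helper that returns both the result and the elapsed milliseconds. `timed` and `demoTimed` are hypothetical names for this sketch, not functions from the gist:

```typescript
// Hypothetical wrapper around the timing pattern above: measure any async
// function and return both its result and the elapsed wall-clock milliseconds.
async function timed<T>(
  fn: () => Promise<T>
): Promise<{ result: T; duration: number }> {
  const start = performance.now()
  const result = await fn()
  const duration = performance.now() - start
  return { result, duration }
}

// Usage with a trivial stub in place of a real eval call:
async function demoTimed() {
  const { result, duration } = await timed(async () => 42)
  console.log(result, duration)
  return { result, duration }
}
```

Capturing duration alongside the eval result makes it easy to record latency as one more scorecard metric.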
<!-- EVALS2_SCORECARD -->
<details>
<summary><b>🟒 Eval Scorecards: 10/10 passed</b></summary>

| Test Suite / Metric | Current | Main | Ξ” | Status |
| --- | ---: | ---: | ---: | :---: |
| **Available Actions** | | | | |
| Start Research After Execution | 100% | 90% | +10% | βœ… 🟒 |
| Start Research Includes Leaf Themes | 100% | 100% | 0% | βœ… βšͺ |
| Start Research Excludes Parent Themes | 100% | 100% | 0% | βœ… βšͺ |
| Start Research Uses Current Theme Context | 96% | 96% | +1% | βœ… 🟒 |
| Start Research Avoids Find Stocks Phrase | 100% | 100% | 0% | βœ… βšͺ |
| Start Research Generates Valid Command | 100% | 100% | 0% | βœ… βšͺ |
| **Command Generation** | | | | |
| First response accuracy (command -> command) | 100% | 100% | 0% | βœ… βšͺ |
| Second response accuracy (TL;DR -> analysis) | 100% | 100% | 0% | βœ… βšͺ |
| Overall accuracy (both correct) | 100% | 100% | 0% | βœ… βšͺ |
| **Context Awareness** | | | | |
| Pending Command Awareness | 100% | 100% | 0% | βœ… βšͺ |
| User Override | 100% | 100% | 0% | βœ… βšͺ |
| **Continuation Responses** | | | | |
| AddTheme Continuation | 100% | 100% | 0% | βœ… βšͺ |
| StartResearch Continuation | 100% | 100% | 0% | βœ… βšͺ |
| Research After Theme | 100% | 100% | 0% | βœ… βšͺ |
| **First Interaction** | | | | |
| Success rate | 100% | 100% | 0% | βœ… βšͺ |
| **StartResearch Guard Rails** | | | | |
| Avoids False Analysis Claims | 100% | 100% | 0% | βœ… βšͺ |
| Explains Thematic Matching Only | 100% | 100% | 0% | βœ… βšͺ |
| No Targeted Analysis Promises | 100% | 100% | 0% | βœ… βšͺ |
| Provides Honest Limitations | 100% | 100% | 0% | βœ… βšͺ |
| **Stock Defense Quality** | | | | |
| Information density | 88% | 74% | +14% | βœ… 🟒 |
| Institutional credibility | 95% | 95% | +0% | βœ… 🟒 |
| Content quality | 97% | 96% | +1% | βœ… 🟒 |
| Evidence prioritization | 89% | 84% | +5% | βœ… 🟒 |
| Protocol URL Correctness | 100% | 100% | 0% | βœ… βšͺ |
| Protocol URL Usage | 100% | 100% | 0% | βœ… βšͺ |
| **Theme Analysis Quality** | | | | |
| Overall quality score | 100% | 73% | +27% | βœ… 🟒 |
| Human-friendly quality | 63% | 58% | +5% | βœ… 🟒 |
| Number translation judgment | 96% | 73% | +23% | βœ… 🟒 |
| References analysis | 100% | 78% | +22% | βœ… 🟒 |
| **Theme Generation Quality** | | | | |
| Overall quality score | 91% | 96% | -5% | βœ… πŸ”΄ |
| Reasoning quality score | 91% | 95% | -4% | βœ… πŸ”΄ |
| Variance score (avoiding rigid counts) | 13% | 10% | +3% | βœ… 🟒 |

Updated: 2025-08-09T04:40:28.297Z
</details>
```ts
await writeScorecard("pending-command-awareness", {
  title: "Pending Command Awareness",
  metrics: [{
    name: "Success Rate",
    value: successRate * 100,
    threshold: 85,
    unit: "%",
    passed: successRate > 0.85
  }],
  overallPassed: successRate > 0.85,
  timestamp: new Date().toISOString()
})
```
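A scorecard written this way presumably feeds the PR-comment table. A sketch of rendering one metric into a plain-text row, with the metric shape taken from the `writeScorecard` call above (the `renderMetricRow` helper itself is hypothetical):

```typescript
// Hypothetical rendering step: turn one scorecard metric plus its main-branch
// value into a plain-text row like the ones in the comment table.
interface ScorecardMetric {
  name: string
  value: number
  threshold: number
  unit: string
  passed: boolean
}

function renderMetricRow(m: ScorecardMetric, mainValue: number): string {
  const delta = m.value - mainValue
  const deltaStr = `${delta > 0 ? "+" : ""}${delta}${m.unit}`
  const status = m.passed ? "βœ…" : "❌"
  // Pad the name so the numeric columns line up in a monospace comment body
  return `${m.name.padEnd(45)} ${m.value}${m.unit}  ${mainValue}${m.unit}  ${deltaStr}  ${status}`
}

const line = renderMetricRow(
  { name: "Success Rate", value: 100, threshold: 85, unit: "%", passed: true },
  90
)
console.log(line)
```

Keeping the scorecard as structured JSON and rendering the table only at comment time means the same artifact can drive both the PR comment and the main-branch baseline download shown earlier.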