esz135888 · May 23, 2026 22:44
diff --git a/acceptance-tests.md b/acceptance-tests.md
diff --git a/artifact-url-or-pr.md b/artifact-url-or-pr.md
diff --git a/data-model.md b/data-model.md
diff --git a/decision-record.md b/decision-record.md
diff --git a/learning-memory.json b/learning-memory.json
 {
  "job_id": "f3ffcd19-559d-4803-895a-31d3765e5808",
  "project": "AI 自建專案：公司AI化",
  "learned_at": "2026-05-24T06:50:00+08:00",
  "solution_selection": "eval + spreadsheet + system + watchdog",
  "market_context": [
    {
      "source": "OpenAI evaluation best practices",
      "lesson": "Continuous evaluation should grow eval sets from production, historical, and human-curated data."
    },
    {
      "source": "LangSmith evaluation platform",
      "lesson": "Mature eval workflows run in production monitoring and PR/nightly builds."
    },
    {
      "source": "LangChain production monitoring to regression tests",
      "lesson": "Production failures should become offline regression cases."
    }
  ],
  "pls_next_checks": [
    "Check whether golden set has at least 20 cases before agent promotion.",
    "Track evidence_coverage, hit_rate, false_positive_rate, and sync_error_rate.",
    "Require human review for high-risk predictions before project state changes.",
    "Turn every high-impact miss into a regression case with owner and due date."
  ],
  "assumptions_overturned": [
    "A console alone is not enough; reliability needs a maintained golden set.",
    "Tool choice should follow source sync and eval metrics, not preference.",
    "AI prediction confidence is not a production metric unless later evidence validates it."
  ],
  "next_iteration_condition": "Run the 20-case golden set against real PLS evidence and produce the first prediction reliability report."
 }
diff --git a/market-maturity.md b/market-maturity.md
diff --git a/prediction-golden-set-runner.html b/prediction-golden-set-runner.html
 <!doctype html>
 <html lang="zh-Hant">
 <head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Prediction Golden Set Runner</title>
  <style>
    /* Page-wide color tokens; every rule below reads them through var(). */
    :root { --ink:#172033; --muted:#617086; --line:#d8dee9; --bg:#f5f7fb; --panel:#fff; --green:#087443; --red:#b42318; --amber:#a15c07; --blue:#175cd3; }
    /* Border-box sizing for every element. */
    * { box-sizing:border-box; }
    body { margin:0; background:var(--bg); color:var(--ink); font:14px/1.5 -apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif; }
    header { background:var(--panel); border-bottom:1px solid var(--line); padding:28px 32px; }
    h1 { margin:0 0 6px; font-size:26px; letter-spacing:0; }
    h2 { margin:0 0 12px; font-size:17px; }
    /* Content column: capped at 1240px, centered, children laid out as a grid with a 16px gap. */
    main { max-width:1240px; margin:0 auto; padding:22px 18px 42px; display:grid; gap:16px; }
    /* Each section is drawn as a bordered, rounded panel. */
    section { background:var(--panel); border:1px solid var(--line); border-radius:8px; padding:18px; }
    /* Four equal-width columns for the cards; the media query at the end reduces this to one column. */
    .grid { display:grid; grid-template-columns:repeat(4,minmax(0,1fr)); gap:12px; }
    .card { border:1px solid var(--line); border-radius:8px; background:#fbfcff; padding:14px; min-height:116px; }
    .label { color:var(--muted); font-size:12px; text-transform:uppercase; }
    .value { font-size:24px; font-weight:750; margin-top:4px; }
    /* Color helpers: each class maps onto the token of the same name declared in :root. */
    .green { color:var(--green); } .red { color:var(--red); } .amber { color:var(--amber); } .blue { color:var(--blue); }
    table { width:100%; border-collapse:collapse; }
    th,td { text-align:left; vertical-align:top; border-bottom:1px solid var(--line); padding:10px 8px; }
    th { color:var(--muted); font-size:12px; }
    code { background:#eef2f7; border-radius:4px; padding:1px 5px; }
    .small { color:var(--muted); font-size:12px; }
    /* NOTE(review): no element in the markup of this document uses .pill; confirm whether it is still needed. */
    .pill { display:inline-block; border:1px solid var(--line); border-radius:999px; padding:2px 9px; background:#fff; }
    /* At viewport widths of 900px or less: tighter header padding and a single-column card grid. */
    @media (max-width:900px){ header{padding:22px 18px;} .grid{grid-template-columns:1fr;} }
  </style>
 </head>
 <body>
  <header>
    <h1>Prediction Golden Set Runner</h1>
    <div class="small">Job f3ffcd19-559d-4803-895a-31d3765e5808 · owner Louis · governance zihrou · implementation iron · due 2026-05-30</div>
  </header>
  <main>
    <section>
      <!--
        Promotion gate: one card per criterion, showing the target value and a short note.
        Comparison operators in the card values are written as character references
        (&gt; / &lt;) so a literal "<" in text can never be tokenized as the start of a tag.
        The rendered text is unchanged: ">=80%", ">=70%", "<=15%".
      -->
      <h2>Promotion Gate</h2>
      <div class="grid">
        <div class="card"><div class="label">Golden Set</div><div class="value green">20 cases</div><div class="small">Seed ready for first validation run.</div></div>
        <div class="card"><div class="label">Evidence Coverage</div><div class="value blue">&gt;=80%</div><div class="small">Signals/action items/commits/deliverables.</div></div>
        <div class="card"><div class="label">Hit Rate</div><div class="value amber">&gt;=70%</div><div class="small">Below 60% blocks promotion.</div></div>
        <div class="card"><div class="label">False Positive</div><div class="value red">&lt;=15%</div><div class="small">Over threshold repairs rubric.</div></div>
      </div>
    </section>
    <section>
      <!-- Runner workflow: one table row per step, listing the step's input and the output it produces. -->
      <h2>Runner Workflow</h2>
      <table>
        <tr><th>Step</th><th>Input</th><th>Output</th></tr>
        <tr><td>Import</td><td><code>prediction-golden-set-seed.csv</code></td><td>20 accepted cases or schema errors.</td></tr>
        <tr><td>Evidence match</td><td>signals, action_items, github_commit, deliverables</td><td>evidence_count, match_strength, top_source_type.</td></tr>
        <tr><td>Score</td><td>rubric + evidence links</td><td>hit_score, verdict, false_positive_flag.</td></tr>
        <tr><td>Govern</td><td>risk_tier=high</td><td>needs_review until Louis/zihrou approve.</td></tr>
        <tr><td>Regression</td><td>miss/partial/false positive</td><td>regression case with owner and due date.</td></tr>
      </table>
    </section>
    <section>
      <!-- Watchdog rules: one table row per monitored signal, with its alert threshold, owner, and resulting action. -->
      <h2>Watchdog Rules</h2>
      <table>
        <tr><th>Signal</th><th>Threshold</th><th>Owner</th><th>Action</th></tr>
        <tr><td>hit_rate</td><td>&lt;60% after 20 cases</td><td>Louis</td><td>Block agent promotion.</td></tr>
        <tr><td>evidence_coverage</td><td>&lt;80%</td><td>iron</td><td>Fix source sync.</td></tr>
        <tr><td>false_positive_rate</td><td>&gt;15%</td><td>zihrou</td><td>Repair rubric and review rules.</td></tr>
        <tr><td>sync_error_rate</td><td>&gt;5%</td><td>iron</td><td>Dispatch repo_change.</td></tr>
      </table>
    </section>
  </main>
 </body>
 </html>
diff --git a/prediction-golden-set-seed.csv b/prediction-golden-set-seed.csv
diff --git a/production-acceptance.md b/production-acceptance.md
diff --git a/production-brief.md b/production-brief.md
diff --git a/skill-usage.md b/skill-usage.md
diff --git a/solution-selection.md b/solution-selection.md
diff --git a/sources.md b/sources.md
column	type	required	note
`case_id`	text	yes	stable golden case id
`prediction_id`	text	yes	source prediction
`project_id`	uuid	yes	PLS project
`prediction_text`	text	yes	claim to validate
`expected_signal_type`	text	yes	action_item/github_commit/deliverable/message/message_or_decision/person_reflection/status_change
`expected_evidence_query`	text	yes	search/join description
`expected_by`	date	yes	validation date
`risk_tier`	enum	yes	low/medium/high
`ground_truth_verdict`	enum	no	hit/partial/miss when known
`human_reviewer`	text	no	required for high risk
column	type	required	note
`run_id`	uuid	yes	runner execution
`case_id`	text	yes	golden case
`evidence_count`	int	yes	matched rows
`top_source_type`	text	no	best evidence source
`match_strength`	numeric	yes	0-1
`hit_score`	numeric	yes	0-100
`verdict`	enum	yes	hit/partial/miss/needs_review
`false_positive_flag`	boolean	yes	reviewer or rule flag
`error_message`	text	no	schema/sync errors
column	type	required	note
`regression_id`	uuid	yes	regression id
`source_case_id`	text	yes	failed golden case
`failure_type`	text	yes	no_evidence/wrong_match/overconfident/high_risk
`expected_fix`	text	yes	rubric/source/schema change
`owner_profile_id`	uuid	yes	profile id of the owner (Louis/zihrou/iron)
`due_at`	date	yes	remediation date
	{
	"job_id": "f3ffcd19-559d-4803-895a-31d3765e5808",
	"project": "AI 自建專案：公司AI化",
	"learned_at": "2026-05-24T06:50:00+08:00",
	"solution_selection": "eval + spreadsheet + system + watchdog",
	"market_context": [
	{
	"source": "OpenAI evaluation best practices",
	"lesson": "Continuous evaluation should grow eval sets from production, historical, and human-curated data."
	},
	{
	"source": "LangSmith evaluation platform",
	"lesson": "Mature eval workflows run in production monitoring and PR/nightly builds."
	},
	{
	"source": "LangChain production monitoring to regression tests",
	"lesson": "Production failures should become offline regression cases."
	}
	],
	"pls_next_checks": [
	"Check whether golden set has at least 20 cases before agent promotion.",
	"Track evidence_coverage, hit_rate, false_positive_rate, and sync_error_rate.",
	"Require human review for high-risk predictions before project state changes.",
	"Turn every high-impact miss into a regression case with owner and due date."
	],
	"assumptions_overturned": [
	"A console alone is not enough; reliability needs a maintained golden set.",
	"Tool choice should follow source sync and eval metrics, not preference.",
	"AI prediction confidence is not a production metric unless later evidence validates it."
	],
	"next_iteration_condition": "Run the 20-case golden set against real PLS evidence and produce the first prediction reliability report."
	}
	<!doctype html>
	<html lang="zh-Hant">
	<head>
	<meta charset="utf-8">
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<title>Prediction Golden Set Runner</title>
	<style>
	:root { --ink:#172033; --muted:#617086; --line:#d8dee9; --bg:#f5f7fb; --panel:#fff; --green:#087443; --red:#b42318; --amber:#a15c07; --blue:#175cd3; }
	* { box-sizing:border-box; }
	body { margin:0; background:var(--bg); color:var(--ink); font:14px/1.5 -apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif; }
	header { background:var(--panel); border-bottom:1px solid var(--line); padding:28px 32px; }
	h1 { margin:0 0 6px; font-size:26px; letter-spacing:0; }
	h2 { margin:0 0 12px; font-size:17px; }
	main { max-width:1240px; margin:0 auto; padding:22px 18px 42px; display:grid; gap:16px; }
	section { background:var(--panel); border:1px solid var(--line); border-radius:8px; padding:18px; }
	.grid { display:grid; grid-template-columns:repeat(4,minmax(0,1fr)); gap:12px; }
	.card { border:1px solid var(--line); border-radius:8px; background:#fbfcff; padding:14px; min-height:116px; }
	.label { color:var(--muted); font-size:12px; text-transform:uppercase; }
	.value { font-size:24px; font-weight:750; margin-top:4px; }
	.green { color:var(--green); } .red { color:var(--red); } .amber { color:var(--amber); } .blue { color:var(--blue); }
	table { width:100%; border-collapse:collapse; }
	th,td { text-align:left; vertical-align:top; border-bottom:1px solid var(--line); padding:10px 8px; }
	th { color:var(--muted); font-size:12px; }
	code { background:#eef2f7; border-radius:4px; padding:1px 5px; }
	.small { color:var(--muted); font-size:12px; }
	.pill { display:inline-block; border:1px solid var(--line); border-radius:999px; padding:2px 9px; background:#fff; }
	@media (max-width:900px){ header{padding:22px 18px;} .grid{grid-template-columns:1fr;} }
	</style>
	</head>
	<body>
	<header>
	<h1>Prediction Golden Set Runner</h1>
	<div class="small">Job f3ffcd19-559d-4803-895a-31d3765e5808 · owner Louis · governance zihrou · implementation iron · due 2026-05-30</div>
	</header>
	<main>
	<section>
	<h2>Promotion Gate</h2>
	<div class="grid">
	<div class="card"><div class="label">Golden Set</div><div class="value green">20 cases</div><div class="small">Seed ready for first validation run.</div></div>
	<div class="card"><div class="label">Evidence Coverage</div><div class="value blue">>=80%</div><div class="small">Signals/action items/commits/deliverables.</div></div>
	<div class="card"><div class="label">Hit Rate</div><div class="value amber">>=70%</div><div class="small">Below 60% blocks promotion.</div></div>
	<div class="card"><div class="label">False Positive</div><div class="value red"><=15%</div><div class="small">Over threshold repairs rubric.</div></div>
	</div>
	</section>
	<section>
	<h2>Runner Workflow</h2>
	<table>
	<tr><th>Step</th><th>Input</th><th>Output</th></tr>
	<tr><td>Import</td><td><code>prediction-golden-set-seed.csv</code></td><td>20 accepted cases or schema errors.</td></tr>
	<tr><td>Evidence match</td><td>signals, action_items, github_commit, deliverables</td><td>evidence_count, match_strength, top_source_type.</td></tr>
	<tr><td>Score</td><td>rubric + evidence links</td><td>hit_score, verdict, false_positive_flag.</td></tr>
	<tr><td>Govern</td><td>risk_tier=high</td><td>needs_review until Louis/zihrou approve.</td></tr>
	<tr><td>Regression</td><td>miss/partial/false positive</td><td>regression case with owner and due date.</td></tr>
	</table>
	</section>
	<section>
	<h2>Watchdog Rules</h2>
	<table>
	<tr><th>Signal</th><th>Threshold</th><th>Owner</th><th>Action</th></tr>
	<tr><td>hit_rate</td><td><60% after 20 cases</td><td>Louis</td><td>Block agent promotion.</td></tr>
	<tr><td>evidence_coverage</td><td><80%</td><td>iron</td><td>Fix source sync.</td></tr>
	<tr><td>false_positive_rate</td><td>>15%</td><td>zihrou</td><td>Repair rubric and review rules.</td></tr>
	<tr><td>sync_error_rate</td><td>>5%</td><td>iron</td><td>Dispatch repo_change.</td></tr>
	</table>
	</section>
	</main>
	</body>
	</html>
case_id	prediction_id	project_id	prediction_text	expected_signal_type	expected_evidence_query	expected_by	risk_tier	ground_truth_verdict	human_reviewer
CASE-001	PRED-001	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	AI 預測驗證模組會產生 evidence sync 需求	github_commit	project_id + commit summary contains evidence sync	2026-05-30	medium
CASE-002	PRED-002	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	zihrou 需要定義高風險 prediction 人工審核邊界	action_item	assignee=zihrou + high risk approval	2026-05-30	high		zihrou
CASE-003	PRED-003	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	Louis 會以 7 天內可驗證成果決定是否加碼 AI 管理層	message_or_decision	Louis message or decision mentions 7 days /加碼	2026-05-30	high		Louis
CASE-004	PRED-004	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	iron 會需要 join key 對齊 signals/action_items/github_commit	action_item	assignee=iron + join key/source sync	2026-05-30	medium
CASE-005	PRED-005	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	prediction miss 會被轉為 regression case	deliverable	deliverable mentions regression case	2026-05-30	medium
CASE-006	PRED-006	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	高風險 verdict 不會自動改 project state	status_change	risk_tier high + needs_review status	2026-05-30	high		zihrou
CASE-007	PRED-007	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	evidence coverage 低於 80 會 alert	deliverable	watchdog alert evidence_coverage	2026-05-30	medium
CASE-008	PRED-008	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	hit rate 低於 60 會暫停 agent promotion	deliverable	hit_rate < 60 + block agent	2026-05-30	high		Louis
CASE-009	PRED-009	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	false positive 超過 15 會重修 rubric	deliverable	false_positive_rate > 15 + rubric	2026-05-30	medium
CASE-010	PRED-010	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	sync error 超過 5 會派 repo_change	action_item	sync_error_rate > 5 + repo_change	2026-05-30	medium
CASE-011	PRED-011	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	golden set 會累積為 20 筆	deliverable	golden set + 20 cases	2026-05-30	low
CASE-012	PRED-012	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	PLS 會把 production failures 轉 regression cases	deliverable	production failure + regression	2026-05-30	medium
CASE-013	PRED-013	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	prediction validation report 會成為下一輪交付	deliverable	prediction reliability report	2026-06-07	medium
CASE-014	PRED-014	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	工具選型會從 schema/API 先行	message_or_decision	tool choice + schema/API first	2026-05-30	medium
CASE-015	PRED-015	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	persona reflections 會作為 evidence source	person_reflection	project_id + persona reflection	2026-05-30	low
CASE-016	PRED-016	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	GitHub commit 類 prediction 會用 semantic overlap 比對	github_commit	semantic overlap + commit	2026-05-30	medium
CASE-017	PRED-017	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	action item 完成狀態會用來驗證預測	action_item	action item status completed/overdue	2026-05-30	medium
CASE-018	PRED-018	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	deliverable_files 上傳成功會作為 production evidence	deliverable	deliverable_files uploaded	2026-05-30	low
CASE-019	PRED-019	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	高風險 human review 會留下 audit reason	status_change	human_review + audit reason	2026-05-30	high		zihrou
CASE-020	PRED-020	d2afbba2-f20a-4ca5-ab6b-8e848e5532ef	連續兩週達標才升級 agent	message_or_decision	two weeks + agent promotion	2026-06-14	high		Louis