esz135888 · May 23, 2026 20:24
diff --git a/acceptance-tests.md b/acceptance-tests.md
diff --git a/artifact-url-or-pr.md b/artifact-url-or-pr.md
diff --git a/d7-calibration-run-control-tower.html b/d7-calibration-run-control-tower.html
 <!doctype html>
 <html lang="zh-Hant">
 <head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>AI Prediction Verification D7 Calibration Run Control Tower</title>
  <style>
    :root {
      --ink: #172026;
      --muted: #5c6873;
      --line: #d8dde3;
      --bg: #f6f8fb;
      --panel: #ffffff;
      --blue: #2457d6;
      --green: #16845b;
      --amber: #a96600;
      --red: #b42318;
    }
    * { box-sizing: border-box; }
    body {
      margin: 0;
      background: var(--bg);
      color: var(--ink);
      font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
      line-height: 1.5;
    }
    header {
      background: #ffffff;
      border-bottom: 1px solid var(--line);
      padding: 24px clamp(18px, 4vw, 48px);
    }
    h1 { margin: 0; font-size: clamp(24px, 3vw, 38px); letter-spacing: 0; }
    h2 { margin: 0 0 12px; font-size: 18px; }
    h3 { margin: 0 0 8px; font-size: 15px; }
    p { margin: 0 0 10px; }
    main { padding: 22px clamp(18px, 4vw, 48px) 48px; }
    .sub { color: var(--muted); max-width: 1100px; margin-top: 8px; }
    .grid { display: grid; gap: 16px; }
    .cols-4 { grid-template-columns: repeat(4, minmax(0, 1fr)); }
    .cols-3 { grid-template-columns: repeat(3, minmax(0, 1fr)); }
    .cols-2 { grid-template-columns: repeat(2, minmax(0, 1fr)); }
    .panel {
      background: var(--panel);
      border: 1px solid var(--line);
      border-radius: 8px;
      padding: 16px;
    }
    .metric {
      display: flex;
      flex-direction: column;
      min-height: 118px;
      justify-content: space-between;
    }
    .label { color: var(--muted); font-size: 13px; }
    .value { font-size: 30px; font-weight: 760; letter-spacing: 0; }
    .ok { color: var(--green); }
    .warn { color: var(--amber); }
    .stop { color: var(--red); }
    .tag {
      display: inline-flex;
      align-items: center;
      height: 24px;
      padding: 0 8px;
      border-radius: 999px;
      border: 1px solid var(--line);
      color: var(--muted);
      font-size: 12px;
      margin-right: 6px;
      background: #fbfcfe;
    }
    .stage {
      border-left: 4px solid var(--blue);
      padding-left: 12px;
    }
    table {
      width: 100%;
      border-collapse: collapse;
      font-size: 13px;
    }
    th, td {
      border-bottom: 1px solid var(--line);
      padding: 10px 8px;
      text-align: left;
      vertical-align: top;
    }
    th { color: var(--muted); font-weight: 650; background: #fbfcfe; }
    code {
      background: #eef2f7;
      padding: 2px 5px;
      border-radius: 4px;
      font-size: 12px;
    }
    .flow {
      display: grid;
      grid-template-columns: repeat(6, minmax(0, 1fr));
      gap: 10px;
      margin-top: 10px;
    }
    .flow div {
      border: 1px solid var(--line);
      background: #fbfcfe;
      border-radius: 8px;
      padding: 10px;
      min-height: 84px;
    }
    .line { height: 1px; background: var(--line); margin: 18px 0; }
    ul { padding-left: 18px; margin: 0; }
    li { margin: 6px 0; }
    @media (max-width: 980px) {
      .cols-4, .cols-3, .cols-2, .flow { grid-template-columns: 1fr; }
    }
  </style>
 </head>
 <body>
  <header>
    <h1>AI Prediction Verification D7 Calibration Run Control Tower</h1>
    <p class="sub">把上一輪 Reviewer Decision Inbox 推進成可執行的 50-case calibration run：有 owner、due、驗收門檻、資料模型、稽核邊界與下一輪 worker 記憶，避免停在文件堆疊。</p>
    <p><span class="tag">Owner: Louis</span><span class="tag">Reviewers: zihrou / iron</span><span class="tag">Due: 2026-05-31</span><span class="tag">Gate: unknown &lt; 25%</span></p>
  </header>
  <main class="grid">
    <section class="grid cols-4">
      <div class="panel metric">
        <span class="label">Batch Scope</span>
        <span class="value">50</span>
        <span class="label">accepted seed cases from the review queue</span>
      </div>
      <div class="panel metric">
        <span class="label">Reviewer Sample</span>
        <span class="value ok">>=10%</span>
        <span class="label">minimum 5 items manually checked</span>
      </div>
      <div class="panel metric">
        <span class="label">Unknown Ceiling</span>
        <span class="value warn">&lt;25%</span>
        <span class="label">otherwise source adapter work blocks dashboarding</span>
      </div>
      <div class="panel metric">
        <span class="label">Completion Rule</span>
        <span class="value stop">No Pass, No Product</span>
        <span class="label">scorecard and routing required before rollout</span>
      </div>
    </section>

    <section class="panel">
      <h2>30-Day Development Path</h2>
      <div class="grid cols-4">
        <div class="stage"><h3>D1 Readiness</h3><p>Confirm Reviewer Decision Inbox status, lock 50 accepted cases, validate signal/action item source freshness, and assign run owner.</p></div>
        <div class="stage"><h3>D7 Batch Run</h3><p>Execute calibration, label hit/miss/partial/unknown, sample reviewer checks, route every miss or source gap, and publish scorecard.</p></div>
        <div class="stage"><h3>D14 Correction Loop</h3><p>Cluster repeated miss reasons, dispatch source adapter or prediction rubric fixes, then re-run the affected cohort.</p></div>
        <div class="stage"><h3>D30 Operating Cadence</h3><p>Turn the run into a weekly management scorecard with threshold history, people sync, and productization gate.</p></div>
      </div>
    </section>

    <section class="panel">
      <h2>Purpose-to-Purpose E2E</h2>
      <div class="flow">
        <div><strong>Original Purpose</strong><br>Know whether AI review predictions actually became true.</div>
        <div><strong>Inputs</strong><br>Reviewer decisions, signals, action items, previous review predictions.</div>
        <div><strong>Run</strong><br>Calibration batch labels hit, miss, partial, or unknown with evidence links.</div>
        <div><strong>Adoption</strong><br>Louis approves go/no-go; zihrou/iron review disputed samples.</div>
        <div><strong>Improvement</strong><br>Route miss reasons to correction tasks and source gaps to adapters.</div>
        <div><strong>Measured Result</strong><br>Unknown rate, hit rate, reviewer agreement, cycle-time saved, risk reduced.</div>
      </div>
    </section>

    <section class="grid cols-2">
      <div class="panel">
        <h2>Value and Money Path</h2>
        <ul>
          <li>Revenue: only productize AI recommendations once evidence shows useful prediction accuracy.</li>
          <li>Cost: reduce repeated manual review meetings by turning decisions into batchable calibration runs.</li>
          <li>Risk: prevent false confidence by blocking dashboards when unknown evidence exceeds threshold.</li>
          <li>Conversion: give project owners a reliable go/no-go signal for AI workflow adoption.</li>
          <li>Capacity: release reviewer time by sampling disputes instead of checking every prediction by hand.</li>
        </ul>
      </div>
      <div class="panel">
        <h2>Human Capability Improvement</h2>
        <ul>
          <li>Louis can govern AI reviews with measurable gates, not opinion-only status updates.</li>
          <li>zihrou can see which prediction patterns need rubric correction.</li>
          <li>iron can identify missing signals and source adapter gaps before automation spreads.</li>
          <li>Future workers inherit a clear run state and do not restart discovery from zero.</li>
        </ul>
      </div>
    </section>

    <!-- Run Control Checklist: one row per gate, listing its required evidence, owner, and pass rule. -->
    <section class="panel">
      <h2 id="run-control-checklist-title">Run Control Checklist</h2>
      <!-- The visible heading doubles as the table's accessible name, so no separate caption is rendered. -->
      <table aria-labelledby="run-control-checklist-title">
        <thead>
          <tr>
            <!-- scope="col" ties each header cell to its column for assistive technology. -->
            <th scope="col">Gate</th>
            <th scope="col">Required Evidence</th>
            <th scope="col">Owner</th>
            <th scope="col">Pass Rule</th>
          </tr>
        </thead>
        <tbody>
          <tr><td>Seed lock</td><td><code>seed_queue.status=accepted</code> for 50 items</td><td>Louis</td><td>50 eligible cases, no duplicates</td></tr>
          <tr><td>Source sync</td><td>Signals and action items synced after latest review date</td><td>iron</td><td>No stale source over 7 days</td></tr>
          <tr><td>Batch labels</td><td><code>calibration_run_item.match_label</code> populated</td><td>PLS worker</td><td>hit/miss/partial/unknown for all cases</td></tr>
          <tr><td>Reviewer sample</td><td>At least 5 sampled decisions with reviewer agreement</td><td>zihrou</td><td>Agreement >=80% or disputed cases routed</td></tr>
          <tr><td>Unknown control</td><td>Unknown count and reason taxonomy</td><td>Louis</td><td>Unknown &lt;25%; otherwise source gap blocks release</td></tr>
          <tr><td>Correction routing</td><td>Every miss/source gap has owner, due, and next action</td><td>Louis</td><td>100% routed before complete</td></tr>
        </tbody>
      </table>
    </section>

    <section class="grid cols-3">
      <div class="panel">
        <h2>Data and API Contract</h2>
        <p>Primary entities: <code>calibration_run</code>, <code>calibration_run_item</code>, <code>match_label</code>, <code>reviewer_sample_result</code>, <code>correction_route</code>, <code>run_scorecard</code>.</p>
        <p>Worker API: <code>POST /ai-prediction/calibration-runs</code>, then under <code>/ai-prediction/calibration-runs/:id</code>: <code>POST /items/:item_id/label</code>, <code>POST /samples</code>, <code>POST /routes</code>, <code>GET /scorecard</code>.</p>
      </div>
      <div class="panel">
        <h2>Permissions and Audit</h2>
        <p>Only project owner can start or close a run. Reviewers can update sample outcomes. Worker writes must include evidence source ids, timestamp, model version, and decision-record reference.</p>
      </div>
      <div class="panel">
        <h2>Adoption Upgrade</h2>
        <p>Once D7 passes, expose weekly scorecard in PLS project backend. If it fails, dispatch D14 source adapter and rubric correction tasks before dashboard productization.</p>
      </div>
    </section>

    <section class="panel">
      <h2>People Sync Draft</h2>
      <p>AI 預測驗證已從 Reviewer Inbox 推進到 D7 calibration run。Louis 負責 2026-05-31 前啟動 50-case batch；zihrou/iron 抽樣至少 5 件。驗收是 unknown &lt;25%、reviewer agreement >=80%、所有 miss/source gap 都有 owner/due/next action。未通過不得做 dashboard 產品化。</p>
    </section>
  </main>
 </body>
 </html>
diff --git a/data-model.md b/data-model.md
diff --git a/decision-record.md b/decision-record.md
diff --git a/learning-memory.json b/learning-memory.json
 {
  "job_id": "7a70ab5d-bfd5-486d-9e96-17fe81064ead",
  "project_topic": "AI prediction verification module for signals and action-item evidence",
  "current_artifact": "D7 Calibration Run Control Tower",
  "owner": "Louis",
  "reviewers": ["zihrou", "iron"],
  "due": "2026-05-31",
  "next_worker_rule": {
    "if_no_calibration_run_exists": "Create D7 calibration_run from 50 accepted reviewer-inbox seed cases.",
    "if_run_status_ready_or_running": "Execute labels for all 50 cases and calculate unknown rate.",
    "if_unknown_rate_gte_25_percent": "Do not build dashboard. Dispatch source_adapter_gap correction tasks.",
    "if_reviewer_sample_lt_5": "Request reviewer sampling from zihrou or iron before completion.",
    "if_run_passed": "Move to D14 correction loop and weekly scorecard backend/dashboard design."
  },
  "acceptance_gate": {
    "total_items": 50,
    "unknown_rate_max": 0.25,
    "reviewer_sample_min": 5,
    "reviewer_agreement_target": 0.8,
    "unrouted_non_hit_count": 0
  },
  "do_not_repeat": [
    "Do not create another generic AI prediction verification concept pack.",
    "Do not complete with only text summary.",
    "Do not productize dashboard before D7 run has pass evidence."
  ],
  "artifact_files": [
    "d7-calibration-run-control-tower.html",
    "production-brief.md",
    "data-model.md",
    "acceptance-tests.md",
    "decision-record.md",
    "sources.md",
    "artifact-url-or-pr.md"
  ]
 }
diff --git a/production-brief.md b/production-brief.md
diff --git a/sources.md b/sources.md
	<!doctype html>
	<html lang="zh-Hant">
	<head>
	<meta charset="utf-8">
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<title>AI Prediction Verification D7 Calibration Run Control Tower</title>
	<style>
	:root {
	--ink: #172026;
	--muted: #5c6873;
	--line: #d8dde3;
	--bg: #f6f8fb;
	--panel: #ffffff;
	--blue: #2457d6;
	--green: #16845b;
	--amber: #a96600;
	--red: #b42318;
	}
	* { box-sizing: border-box; }
	body {
	margin: 0;
	background: var(--bg);
	color: var(--ink);
	font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
	line-height: 1.5;
	}
	header {
	background: #ffffff;
	border-bottom: 1px solid var(--line);
	padding: 24px clamp(18px, 4vw, 48px);
	}
	h1 { margin: 0; font-size: clamp(24px, 3vw, 38px); letter-spacing: 0; }
	h2 { margin: 0 0 12px; font-size: 18px; }
	h3 { margin: 0 0 8px; font-size: 15px; }
	p { margin: 0 0 10px; }
	main { padding: 22px clamp(18px, 4vw, 48px) 48px; }
	.sub { color: var(--muted); max-width: 1100px; margin-top: 8px; }
	.grid { display: grid; gap: 16px; }
	.cols-4 { grid-template-columns: repeat(4, minmax(0, 1fr)); }
	.cols-3 { grid-template-columns: repeat(3, minmax(0, 1fr)); }
	.cols-2 { grid-template-columns: repeat(2, minmax(0, 1fr)); }
	.panel {
	background: var(--panel);
	border: 1px solid var(--line);
	border-radius: 8px;
	padding: 16px;
	}
	.metric {
	display: flex;
	flex-direction: column;
	min-height: 118px;
	justify-content: space-between;
	}
	.label { color: var(--muted); font-size: 13px; }
	.value { font-size: 30px; font-weight: 760; letter-spacing: 0; }
	.ok { color: var(--green); }
	.warn { color: var(--amber); }
	.stop { color: var(--red); }
	.tag {
	display: inline-flex;
	align-items: center;
	height: 24px;
	padding: 0 8px;
	border-radius: 999px;
	border: 1px solid var(--line);
	color: var(--muted);
	font-size: 12px;
	margin-right: 6px;
	background: #fbfcfe;
	}
	.stage {
	border-left: 4px solid var(--blue);
	padding-left: 12px;
	}
	table {
	width: 100%;
	border-collapse: collapse;
	font-size: 13px;
	}
	th, td {
	border-bottom: 1px solid var(--line);
	padding: 10px 8px;
	text-align: left;
	vertical-align: top;
	}
	th { color: var(--muted); font-weight: 650; background: #fbfcfe; }
	code {
	background: #eef2f7;
	padding: 2px 5px;
	border-radius: 4px;
	font-size: 12px;
	}
	.flow {
	display: grid;
	grid-template-columns: repeat(6, minmax(0, 1fr));
	gap: 10px;
	margin-top: 10px;
	}
	.flow div {
	border: 1px solid var(--line);
	background: #fbfcfe;
	border-radius: 8px;
	padding: 10px;
	min-height: 84px;
	}
	.line { height: 1px; background: var(--line); margin: 18px 0; }
	ul { padding-left: 18px; margin: 0; }
	li { margin: 6px 0; }
	@media (max-width: 980px) {
	.cols-4, .cols-3, .cols-2, .flow { grid-template-columns: 1fr; }
	}
	</style>
	</head>
	<body>
	<header>
	<h1>AI Prediction Verification D7 Calibration Run Control Tower</h1>
	<p class="sub">把上一輪 Reviewer Decision Inbox 推進成可執行的 50-case calibration run：有 owner、due、驗收門檻、資料模型、稽核邊界與下一輪 worker 記憶，避免停在文件堆疊。</p>
	<p><span class="tag">Owner: Louis</span><span class="tag">Reviewers: zihrou / iron</span><span class="tag">Due: 2026-05-31</span><span class="tag">Gate: unknown &lt; 25%</span></p>
	</header>
	<main class="grid">
	<section class="grid cols-4">
	<div class="panel metric">
	<span class="label">Batch Scope</span>
	<span class="value">50</span>
	<span class="label">accepted seed cases from the review queue</span>
	</div>
	<div class="panel metric">
	<span class="label">Reviewer Sample</span>
	<span class="value ok">>=10%</span>
	<span class="label">minimum 5 items manually checked</span>
	</div>
	<div class="panel metric">
	<span class="label">Unknown Ceiling</span>
	<span class="value warn">&lt;25%</span>
	<span class="label">otherwise source adapter work blocks dashboarding</span>
	</div>
	<div class="panel metric">
	<span class="label">Completion Rule</span>
	<span class="value stop">No Pass, No Product</span>
	<span class="label">scorecard and routing required before rollout</span>
	</div>
	</section>

	<section class="panel">
	<h2>30-Day Development Path</h2>
	<div class="grid cols-4">
	<div class="stage"><h3>D1 Readiness</h3><p>Confirm Reviewer Decision Inbox status, lock 50 accepted cases, validate signal/action item source freshness, and assign run owner.</p></div>
	<div class="stage"><h3>D7 Batch Run</h3><p>Execute calibration, label hit/miss/partial/unknown, sample reviewer checks, route every miss or source gap, and publish scorecard.</p></div>
	<div class="stage"><h3>D14 Correction Loop</h3><p>Cluster repeated miss reasons, dispatch source adapter or prediction rubric fixes, then re-run the affected cohort.</p></div>
	<div class="stage"><h3>D30 Operating Cadence</h3><p>Turn the run into a weekly management scorecard with threshold history, people sync, and productization gate.</p></div>
	</div>
	</section>

	<section class="panel">
	<h2>Purpose-to-Purpose E2E</h2>
	<div class="flow">
	<div><strong>Original Purpose</strong><br>Know whether AI review predictions actually became true.</div>
	<div><strong>Inputs</strong><br>Reviewer decisions, signals, action items, previous review predictions.</div>
	<div><strong>Run</strong><br>Calibration batch labels hit, miss, partial, or unknown with evidence links.</div>
	<div><strong>Adoption</strong><br>Louis approves go/no-go; zihrou/iron review disputed samples.</div>
	<div><strong>Improvement</strong><br>Route miss reasons to correction tasks and source gaps to adapters.</div>
	<div><strong>Measured Result</strong><br>Unknown rate, hit rate, reviewer agreement, cycle-time saved, risk reduced.</div>
	</div>
	</section>

	<section class="grid cols-2">
	<div class="panel">
	<h2>Value and Money Path</h2>
	<ul>
	<li>Revenue: only productize AI recommendations once evidence shows useful prediction accuracy.</li>
	<li>Cost: reduce repeated manual review meetings by turning decisions into batchable calibration runs.</li>
	<li>Risk: prevent false confidence by blocking dashboards when unknown evidence exceeds threshold.</li>
	<li>Conversion: give project owners a reliable go/no-go signal for AI workflow adoption.</li>
	<li>Capacity: release reviewer time by sampling disputes instead of checking every prediction by hand.</li>
	</ul>
	</div>
	<div class="panel">
	<h2>Human Capability Improvement</h2>
	<ul>
	<li>Louis can govern AI reviews with measurable gates, not opinion-only status updates.</li>
	<li>zihrou can see which prediction patterns need rubric correction.</li>
	<li>iron can identify missing signals and source adapter gaps before automation spreads.</li>
	<li>Future workers inherit a clear run state and do not restart discovery from zero.</li>
	</ul>
	</div>
	</section>

	<!-- Run Control Checklist: one row per gate, listing its required evidence, owner, and pass rule. -->
	<section class="panel">
	<h2 id="run-control-checklist-title">Run Control Checklist</h2>
	<!-- The visible heading doubles as the table's accessible name, so no separate caption is rendered. -->
	<table aria-labelledby="run-control-checklist-title">
	<thead>
	<tr>
	<!-- scope="col" ties each header cell to its column for assistive technology. -->
	<th scope="col">Gate</th>
	<th scope="col">Required Evidence</th>
	<th scope="col">Owner</th>
	<th scope="col">Pass Rule</th>
	</tr>
	</thead>
	<tbody>
	<tr><td>Seed lock</td><td><code>seed_queue.status=accepted</code> for 50 items</td><td>Louis</td><td>50 eligible cases, no duplicates</td></tr>
	<tr><td>Source sync</td><td>Signals and action items synced after latest review date</td><td>iron</td><td>No stale source over 7 days</td></tr>
	<tr><td>Batch labels</td><td><code>calibration_run_item.match_label</code> populated</td><td>PLS worker</td><td>hit/miss/partial/unknown for all cases</td></tr>
	<tr><td>Reviewer sample</td><td>At least 5 sampled decisions with reviewer agreement</td><td>zihrou</td><td>Agreement >=80% or disputed cases routed</td></tr>
	<!-- The literal "<" in text content is written as &lt; so it is never tokenized as the start of a tag. -->
	<tr><td>Unknown control</td><td>Unknown count and reason taxonomy</td><td>Louis</td><td>Unknown &lt;25%; otherwise source gap blocks release</td></tr>
	<tr><td>Correction routing</td><td>Every miss/source gap has owner, due, and next action</td><td>Louis</td><td>100% routed before complete</td></tr>
	</tbody>
	</table>
	</section>

	<section class="grid cols-3">
	<div class="panel">
	<h2>Data and API Contract</h2>
	<p>Primary entities: <code>calibration_run</code>, <code>calibration_run_item</code>, <code>match_label</code>, <code>reviewer_sample_result</code>, <code>correction_route</code>, <code>run_scorecard</code>.</p>
	<p>Worker API: <code>POST /ai-prediction/calibration-runs</code>, then under <code>/ai-prediction/calibration-runs/:id</code>: <code>POST /items/:item_id/label</code>, <code>POST /samples</code>, <code>POST /routes</code>, <code>GET /scorecard</code>.</p>
	</div>
	<div class="panel">
	<h2>Permissions and Audit</h2>
	<p>Only project owner can start or close a run. Reviewers can update sample outcomes. Worker writes must include evidence source ids, timestamp, model version, and decision-record reference.</p>
	</div>
	<div class="panel">
	<h2>Adoption Upgrade</h2>
	<p>Once D7 passes, expose weekly scorecard in PLS project backend. If it fails, dispatch D14 source adapter and rubric correction tasks before dashboard productization.</p>
	</div>
	</section>

	<section class="panel">
	<h2>People Sync Draft</h2>
	<p>AI 預測驗證已從 Reviewer Inbox 推進到 D7 calibration run。Louis 負責 2026-05-31 前啟動 50-case batch；zihrou/iron 抽樣至少 5 件。驗收是 unknown &lt;25%、reviewer agreement >=80%、所有 miss/source gap 都有 owner/due/next action。未通過不得做 dashboard 產品化。</p>
	</section>
	</main>
	</body>
	</html>
Field	Type	Required	Notes
id	uuid	yes	Primary run id.
project_id	uuid	yes	PLS project id.
seed_queue_id	uuid	yes	Accepted 50-case seed queue.
status	enum	yes	`draft`, `ready`, `running`, `sample_review`, `passed`, `failed`, `blocked`.
owner_user_id	uuid	yes	Louis for this run.
due_at	datetime	yes	2026-05-31 for D7.
model_version	text	yes	AI prediction model or worker version used.
source_snapshot_at	datetime	yes	Evidence sync boundary.
decision_record_ref	text	yes	Link to decision record.
created_at / updated_at	datetime	yes	Audit timestamps.
Field	Type	Required	Notes
id	uuid	yes	Primary item id.
calibration_run_id	uuid	yes	Parent run.
prediction_id	uuid	yes	Prior review prediction.
reviewer_decision_id	uuid	yes	Accepted reviewer inbox decision.
evidence_refs	jsonb	yes	Signals, action items, review notes.
match_label	enum	yes	`hit`, `miss`, `partial`, `unknown`.
match_confidence	decimal	yes	0 to 1.
miss_reason	enum	no	`bad_prediction`, `late_signal`, `missing_source`, `ambiguous_owner`, `changed_scope`, `other`.
label_rationale	text	yes	Short evidence-based reason.
Field	Type	Required	Notes
id	uuid	yes	Primary sample id.
calibration_run_item_id	uuid	yes	Sampled item.
reviewer_user_id	uuid	yes	zihrou or iron.
reviewer_label	enum	yes	`agree`, `disagree`, `needs_more_evidence`.
reviewer_note	text	no	Dispute or confirmation note.
reviewed_at	datetime	yes	Audit timestamp.
Field	Type	Required	Notes
id	uuid	yes	Primary route id.
calibration_run_item_id	uuid	yes	Miss or unknown item.
route_type	enum	yes	`rubric_fix`, `source_adapter_gap`, `owner_followup`, `model_prompt_fix`, `ignore_with_reason`.
owner_user_id	uuid	yes	Assigned action owner.
due_at	datetime	yes	Due date for correction.
status	enum	yes	`open`, `in_progress`, `verified`, `closed`.
next_action	text	yes	Concrete follow-up.
Endpoint	Method	Purpose
`/ai-prediction/calibration-runs`	POST	Create D7 run from accepted seed queue.
`/ai-prediction/calibration-runs/:id/items/:item_id/label`	POST	Write hit/miss/partial/unknown with evidence refs.
`/ai-prediction/calibration-runs/:id/samples`	POST	Assign reviewer sample set.
`/ai-prediction/calibration-runs/:id/routes`	POST	Route miss and unknown correction work.
`/ai-prediction/calibration-runs/:id/scorecard`	GET	Return scorecard for PLS backend and LINE summary.
	{
	"job_id": "7a70ab5d-bfd5-486d-9e96-17fe81064ead",
	"project_topic": "AI prediction verification module for signals and action-item evidence",
	"current_artifact": "D7 Calibration Run Control Tower",
	"owner": "Louis",
	"reviewers": ["zihrou", "iron"],
	"due": "2026-05-31",
	"next_worker_rule": {
	"if_no_calibration_run_exists": "Create D7 calibration_run from 50 accepted reviewer-inbox seed cases.",
	"if_run_status_ready_or_running": "Execute labels for all 50 cases and calculate unknown rate.",
	"if_unknown_rate_gte_25_percent": "Do not build dashboard. Dispatch source_adapter_gap correction tasks.",
	"if_reviewer_sample_lt_5": "Request reviewer sampling from zihrou or iron before completion.",
	"if_run_passed": "Move to D14 correction loop and weekly scorecard backend/dashboard design."
	},
	"acceptance_gate": {
	"total_items": 50,
	"unknown_rate_max": 0.25,
	"reviewer_sample_min": 5,
	"reviewer_agreement_target": 0.8,
	"unrouted_non_hit_count": 0
	},
	"do_not_repeat": [
	"Do not create another generic AI prediction verification concept pack.",
	"Do not complete with only text summary.",
	"Do not productize dashboard before D7 run has pass evidence."
	],
	"artifact_files": [
	"d7-calibration-run-control-tower.html",
	"production-brief.md",
	"data-model.md",
	"acceptance-tests.md",
	"decision-record.md",
	"sources.md",
	"artifact-url-or-pr.md"
	]
	}
Day	Outcome	Acceptance
D1	Confirm reviewer inbox and source readiness. Lock 50 accepted seed cases.	50 eligible cases, no duplicates, source freshness checked.
D7	Execute calibration batch and publish hit/miss/partial/unknown scorecard.	Unknown <25%, reviewer sample >=10%, all misses/gaps routed.
D14	Correct repeated miss patterns and source adapter gaps.	Correction tasks have owner, due, evidence, and re-run cohort.
D30	Weekly scorecard becomes the operating gate for AI review productization.	Threshold history, adoption owner, and dashboard sync are live.
Layer	Production Choice
Context framework	Prediction verification uses reviewer decisions, signals, action items, and evidence timestamps.
Workflow	D1 readiness -> D7 batch labels -> reviewer sample -> correction routing -> scorecard.
Data / DB model	`calibration_run`, `calibration_run_item`, `match_label`, `reviewer_sample_result`, `correction_route`, `run_scorecard`.
Operable tool	HTML control tower plus structured schema and acceptance tests.
Acceptance indicators	50 cases, unknown <25%, reviewer agreement >=80%, 100% miss/gap routing.
Adoption and upgrade	Pass moves to weekly PLS backend scorecard; fail dispatches source adapter or rubric correction tasks.