|
"""Reproduce the Social Security microdata regression timeline. |
|
|
|
Requirements: |
|
- local clones of PolicyEngine/policyengine-us-data and PolicyEngine/policyengine-us |
|
- git, gh, jq |
|
- network access for PyPI metadata lookups, sdist downloads, and the GitHub API calls made by gh
|
|
|
Set US_DATA_REPO and US_REPO if your clones are in different locations. |
|
""" |
|
|
|
from __future__ import annotations |
|
|
|
import json |
|
import os |
|
import re |
|
import subprocess |
|
import tarfile |
|
import tempfile |
|
import urllib.request |
|
from datetime import datetime |
|
from pathlib import Path |
|
|
|
|
|
# Locations of the local repository clones. Each can be overridden through its
# environment variable (US_DATA_REPO / US_REPO); otherwise the default path is used.
US_DATA = Path(
    os.getenv("US_DATA_REPO", "/Users/maxghenis/PolicyEngine/policyengine-us-data")
)
US = Path(os.getenv("US_REPO", "/Users/maxghenis/PolicyEngine/policyengine-us"))
|
|
|
|
|
def run(cmd: str, cwd: Path | None = None) -> str:
    """Execute ``cmd`` through the shell and return its combined output.

    Args:
        cmd: Command line handed to the shell unchanged, so pipes and shell
            quoting inside it are honoured.
        cwd: Directory to run the command in; the current working directory
            is used when omitted.

    Returns:
        The captured stdout, with stderr merged into the same stream, stripped
        of leading and trailing whitespace.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    workdir = None if not cwd else str(cwd)
    completed = subprocess.run(
        cmd,
        shell=True,
        cwd=workdir,
        check=True,
        text=True,
        stdout=subprocess.PIPE,
        # stderr is folded into stdout so the caller sees a single stream.
        stderr=subprocess.STDOUT,
    )
    return completed.stdout.strip()
|
|
|
|
|
def git(repo: Path, args: str) -> str:
    """Run ``git <args>`` with ``repo`` as the working directory.

    Returns whatever ``run`` returns for that command line.
    """
    command = "git " + args
    return run(command, cwd=repo)
|
|
|
|
|
def policyengine_us_version_at(data_sha: str) -> str:
    """Return the policyengine-us version recorded in ``uv.lock`` at ``data_sha``.

    The lock file is read from the US_DATA repository with ``git show``. The
    function looks for a line that is exactly ``name = "policyengine-us"``
    (ignoring surrounding whitespace) and returns the first ``version = "..."``
    value found in the seven lines that follow it.

    Returns:
        The version string, or ``"not found"`` when no such name/version pair
        exists in the lock file.
    """
    lock_lines = git(US_DATA, f"show {data_sha}:uv.lock").splitlines()
    version_pattern = re.compile(r'version = "([^"]+)"')
    # ``position`` is the index of the line immediately after the current one.
    for position, entry in enumerate(lock_lines, start=1):
        if entry.strip() != 'name = "policyengine-us"':
            continue
        # Only a short window after the name line is inspected for the version.
        for candidate in lock_lines[position : position + 7]:
            found = version_pattern.search(candidate)
            if found is not None:
                return found.group(1)
    return "not found"
|
|
|
|
|
def pyproject_version_at(repo: Path, sha: str) -> str:
    """Return the version declared in ``pyproject.toml`` of ``repo`` at ``sha``.

    The first line that starts with ``version = `` is used, and the text
    between its first pair of double quotes is returned.

    Returns:
        The version string, or ``"not found"`` when no line starts with
        ``version = ``.
    """
    contents = git(repo, f"show {sha}:pyproject.toml")
    version_line = next(
        (entry for entry in contents.splitlines() if entry.startswith("version = ")),
        None,
    )
    if version_line is None:
        return "not found"
    return version_line.split('"')[1]
|
|
|
|
|
def commit_row(repo: Path, sha: str) -> dict[str, str]:
    """Describe commit ``sha`` of ``repo`` as a table row.

    Asks ``git show`` for the committer date (strict ISO 8601), the
    abbreviated hash and the subject line, separated by tab characters.

    Returns:
        A dict with the keys ``sha`` (abbreviated hash), ``date`` and
        ``subject``.
    """
    fields = git(repo, f"show --format='%cI%x09%h%x09%s' --no-patch {sha}").split("\t", 2)
    # At most two splits, so tabs inside the subject stay part of the subject.
    date, short, subject = fields
    return dict(sha=short, date=date, subject=subject)
|
|
|
|
|
def has_retirement_formula_in_sdist(version: str) -> bool:
    """Report whether a policyengine-us release defines a retirement formula.

    Looks up the release on PyPI, downloads its sdist into a temporary
    directory and inspects ``social_security_retirement.py`` inside it.

    Args:
        version: The policyengine-us release to inspect, e.g. ``"1.643.0"``.

    Returns:
        True when the module source contains the text ``def formula(``.
        Only a method named exactly ``formula`` is detected by this check.

    Raises:
        LookupError: If the release has no sdist, the sdist does not contain
            the module, or the matching archive member is not a regular file.
        urllib.error.URLError: If PyPI cannot be reached or rejects a request.
    """
    target = "policyengine_us/variables/gov/ssa/ss/social_security_retirement.py"
    metadata_url = f"https://pypi.org/pypi/policyengine-us/{version}/json"
    with urllib.request.urlopen(metadata_url, timeout=30) as response:
        payload = json.load(response)

    # Use an explicit default: a bare next() would leak a context-free
    # StopIteration when the release was published without an sdist.
    sdist_url = next(
        (file["url"] for file in payload["urls"] if file["packagetype"] == "sdist"),
        None,
    )
    if sdist_url is None:
        raise LookupError(f"policyengine-us {version} has no sdist on PyPI")

    with tempfile.TemporaryDirectory() as tmp:
        path = Path(tmp) / f"policyengine_us-{version}.tar.gz"
        urllib.request.urlretrieve(sdist_url, path)
        with tarfile.open(path) as tar:
            member = next(
                (m for m in tar.getmembers() if m.name.endswith(target)),
                None,
            )
            if member is None:
                raise LookupError(
                    f"policyengine-us {version} sdist does not contain {target}"
                )
            # extractfile() returns None for members that are not regular files.
            handle = tar.extractfile(member)
            if handle is None:
                raise LookupError(
                    f"{member.name} in policyengine-us {version} sdist is not a regular file"
                )
            with handle:
                text = handle.read().decode()
    return "def formula(" in text
|
|
|
|
|
def duration(start: str, end: str) -> str:
    """Format the time between two ISO 8601 timestamps as ``"<d>d <h>h <m>m"``.

    Args:
        start: Timestamp accepted by ``datetime.fromisoformat``.
        end: Timestamp accepted by ``datetime.fromisoformat``.

    Returns:
        Whole days, hours and minutes from ``start`` to ``end``; leftover
        seconds are dropped. When ``end`` is earlier than ``start`` the
        magnitude is formatted and prefixed with a single ``-``.

    Raises:
        ValueError: If either string is not a valid ISO 8601 timestamp.
        TypeError: If one timestamp carries a UTC offset and the other does not.
    """
    delta = datetime.fromisoformat(end) - datetime.fromisoformat(start)
    # A negative timedelta is normalized to negative days plus positive seconds
    # (minus one hour is days=-1, seconds=82800), which would print as
    # "-1d 23h 0m". Format the absolute value and carry the sign separately.
    sign = "-" if delta.days < 0 else ""
    span = abs(delta)
    hours, remainder = divmod(span.seconds, 3600)
    minutes = remainder // 60
    return f"{sign}{span.days}d {hours}h {minutes}m"
|
|
|
|
|
def print_table(rows: list[dict[str, str]], columns: list[str]) -> None:
    """Print ``rows`` to stdout as a left-aligned, pipe-separated text table.

    Args:
        rows: One dict per table row. Keys missing from a row render as an
            empty cell; values are converted with ``str``.
        columns: Keys to show, in display order. They also serve as headers.

    The output is a header line, a ``-+-`` separated rule, then one line per
    row. Each column is as wide as its longest cell or its header, whichever
    is longer. An empty ``rows`` list prints just the header and the rule.
    """
    cells = [[str(row.get(col, "")) for col in columns] for row in rows]
    # max() gets a single list rather than unpacked arguments: with no rows the
    # unpacked form collapses to max(int), which raises TypeError.
    widths = [
        max([len(col), *(len(line[index]) for line in cells)])
        for index, col in enumerate(columns)
    ]
    print(" | ".join(col.ljust(width) for col, width in zip(columns, widths)))
    print("-+-".join("-" * width for width in widths))
    for line in cells:
        print(" | ".join(cell.ljust(width) for cell, width in zip(line, widths)))
|
|
|
|
|
def main() -> None:
    """Print the regression timeline report to stdout.

    The report has four numbered sections followed by conclusions:

    1. Key commits from both repositories, with the package versions read
       from each commit.
    2. For selected policyengine-us releases, whether the sdist on PyPI
       defines a ``social_security_retirement`` formula.
    3. The incident windows and their durations.
    4. ``Run Pipeline`` workflow runs listed by ``gh`` for 2026-05-12..13.

    Needs the local clones, ``git``, ``gh``, ``jq`` and network access; any
    failing command or request propagates as an exception.
    """
    print("Repositories")
    print(f"policyengine-us-data: {US_DATA}")
    print(f"policyengine-us: {US}")
    print()

    print("1. Key source-control events")
    # (repository label, clone path, commit, why the commit matters)
    key_events = (
        ("policyengine-us-data", US_DATA, "7025f6a2", "merged formula/adds/subtracts export pruning"),
        ("policyengine-us-data", US_DATA, "6216d02c", "first data package update after #554"),
        ("policyengine-us-data", US_DATA, "dd3455a0", "QRF-impute CPS-only variables including SS subcomponents"),
        ("policyengine-us", US, "c5f1b5da1b", "made social_security_retirement formula-backed"),
        ("policyengine-us-data", US_DATA, "491ac09c", "first bad dependency lock, failed pipeline"),
        ("policyengine-us-data", US_DATA, "a2f3bb36", "first completed bad generated data run"),
        ("policyengine-us-data", US_DATA, "f14931eb", "first promoted/live bad data run found"),
        ("policyengine-us", US, "06a52825cf", "restored social_security_retirement as canonical input"),
        ("policyengine-us-data", US_DATA, "61a43e95", "first data package update after the fix"),
    )
    timeline = []
    for repo_name, repo, sha, note in key_events:
        entry = commit_row(repo, sha)
        entry["repo"] = repo_name
        entry["note"] = note
        from_data_repo = repo_name == "policyengine-us-data"
        # Data-repo commits report their own version plus the locked
        # policyengine-us version; policyengine-us commits report only theirs.
        entry["data_version"] = pyproject_version_at(repo, sha) if from_data_repo else ""
        entry["policyengine_us"] = (
            policyengine_us_version_at(sha)
            if from_data_repo
            else pyproject_version_at(repo, sha)
        )
        timeline.append(entry)
    print_table(timeline, ["date", "repo", "sha", "data_version", "policyengine_us", "note"])
    print()

    print("2. PyPI check: first policyengine-us release where social_security_retirement had a formula")
    formula_rows = [
        {
            "policyengine_us": release,
            "retirement_has_formula": str(has_retirement_formula_in_sdist(release)),
        }
        for release in ("1.642.0", "1.643.0", "1.644.0", "1.691.3")
    ]
    print_table(formula_rows, ["policyengine_us", "retirement_has_formula"])
    print()

    print("3. Incident windows used in the postmortem")
    # (label, start, end, basis); the duration is derived from start and end.
    window_specs = (
        (
            "total social_security direct-input risk",
            "2026-03-04T15:23:42+00:00",
            "2026-03-14T13:41:06+00:00",
            "#554 data release -> #589 source fix",
        ),
        (
            "retirement, generated artifacts",
            "2026-04-30T20:51:13+00:00",
            "2026-05-12T17:02:15+00:00",
            "first completed bad run -> first fixed Run Pipeline",
        ),
        (
            "retirement, live/promoted artifacts",
            "2026-05-04T03:08:33+00:00",
            "2026-05-12T17:02:15+00:00",
            "first promoted bad run found -> first fixed Run Pipeline",
        ),
    )
    windows = [
        {
            "window": label,
            "start": begin,
            "end": finish,
            "duration": duration(begin, finish),
            "basis": basis,
        }
        for label, begin, finish, basis in window_specs
    ]
    print_table(windows, ["window", "start", "end", "duration", "basis"])
    print()

    print("4. GitHub Actions evidence for first fixed data update")
    # One shell pipeline: gh emits JSON, jq flattens each run to a TSV line.
    gh_command = " ".join(
        [
            "gh run list --repo PolicyEngine/policyengine-us-data",
            "--workflow 'Run Pipeline' --created '2026-05-12..2026-05-13'",
            "--limit 10 --json createdAt,conclusion,headSha,url",
            "| jq -r '.[] | [.createdAt,.conclusion,.headSha,.url] | @tsv'",
        ]
    )
    print(run(gh_command))
    print()

    print("Conclusions")
    conclusions = (
        "- social_security was at risk when policyengine-us-data stopped exporting computed variables in #554.",
        "- The safer data contract is: export leaf inputs; calculate formula/adds/subtracts variables in policyengine-us.",
        "- That contract failed again when policyengine-us made social_security_retirement formula-backed without preserving the canonical input path.",
        "- The May 12 fix restored social_security_retirement as the canonical input and moved reported/data-only paths out of public formulas.",
    )
    for conclusion in conclusions:
        print(conclusion)
|
|
|
|
|
# Produce the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()