Skip to content

Instantly share code, notes, and snippets.

@Vilin97
Created February 1, 2026 17:17
Show Gist options
  • Select an option

  • Save Vilin97/d8e9aaf703d062f44b05e7800f09eeb6 to your computer and use it in GitHub Desktop.

Select an option

Save Vilin97/d8e9aaf703d062f44b05e7800f09eeb6 to your computer and use it in GitHub Desktop.
Get arXiv papers using Lean
#%% Imports and setup
import json
import re

# Path to the Kaggle arXiv metadata dump: one JSON object per line.
DATA_PATH = "arxiv-metadata-oai-snapshot.json"

#%% Load all papers that contain the word "lean" (case-insensitive) anywhere in title/abstract/comments
# Compile once outside the loop: this pattern runs against every line of a
# multi-gigabyte file, so per-line re.search(pattern_string, ...) lookups add up.
LEAN_WORD_RE = re.compile(r"\blean\b", re.IGNORECASE)

papers_with_lean = []
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        try:
            paper = json.loads(line)
        except json.JSONDecodeError:
            # Best-effort scan: skip malformed lines instead of aborting.
            continue
        # Fields may be missing or explicitly null; normalize to "".
        title = paper.get("title") or ""
        abstract = paper.get("abstract") or ""
        comments = paper.get("comments") or ""
        text = title + " " + abstract + " " + comments
        if LEAN_WORD_RE.search(text):
            papers_with_lean.append(paper)
print(f"Total papers containing the word 'lean': {len(papers_with_lean)}")
#%% For each paper, extract and print the sentence(s) containing "lean"
# This lets us manually inspect which uses refer to the Lean theorem prover.
# Precompile both patterns: they run once per sentence per paper.
_lean_word_re = re.compile(r"\blean\b", re.IGNORECASE)
_sentence_split_re = re.compile(r"(?<=[.!?])\s+")  # rough sentence-boundary heuristic

for paper in papers_with_lean:
    pid = paper.get("id", "?")
    # Flatten the hard-wrapped arXiv fields to single lines.
    title = (paper.get("title") or "").replace("\n", " ").strip()
    abstract = (paper.get("abstract") or "").replace("\n", " ").strip()
    comments = (paper.get("comments") or "").replace("\n", " ").strip()
    # Split into sentences (rough heuristic)
    full_text = title + ". " + abstract + " " + comments
    sentences = _sentence_split_re.split(full_text)
    matching_sentences = [
        s.strip() for s in sentences if _lean_word_re.search(s)
    ]
    print(f"\n[{pid}] {title}")
    for s in matching_sentences:
        print(f" >>> {s}")
#%% Now filter: keep only papers where "lean" refers to the Lean theorem prover
# Strategy: match Lean 3, Lean 4, Lean prover, Lean proof assistant,
# or Lean near formalization/mathlib/theorem-proving keywords.
# All patterns are matched against already-lowercased text, so no IGNORECASE flag.
# Precompiled once here instead of inside the per-paper loop.
_PROVER_PATTERNS = [
    re.compile(r"\blean\s*[34]\b"),                 # explicit "Lean 3" / "Lean 4"
    re.compile(r"\blean\s+(theorem\s+)?prover\b"),
    re.compile(r"\blean\s+proof\s+assistant\b"),
    re.compile(r"\bmathlib\b"),                     # Lean's mathematics library
    re.compile(r"\blean'?s?\s+math"),
    # "in Lean" / "using Lean" within 80 chars of a formalization keyword,
    # in either order:
    re.compile(r"(formal|verif|proof|theorem|tactic|interactive).{0,80}\blean\b"),
    re.compile(r"\blean\b.{0,80}(formal|verif|proof|theorem|tactic|interactive)"),
]
# Exclude known false positives: "lean manufacturing", "lean body", "lean mass",
# "lean tree", "lean and", "lean mixture", etc.
_FALSE_POSITIVE_RE = re.compile(
    r"\blean\s+(manufactur|body|mass|mixture|burn|tree|decomposition|startup|management|enterprise|agile|six\s*sigma|production|supply|process|operation|principle|thinking|approach(?!\s+to\s+(formal|verif|proof)))"
)

lean_prover_papers = []
for paper in papers_with_lean:
    title = (paper.get("title") or "").replace("\n", " ")
    abstract = (paper.get("abstract") or "").replace("\n", " ")
    comments = (paper.get("comments") or "").replace("\n", " ")
    text = (title + " " + abstract + " " + comments).lower()
    # Keep the paper if ANY prover pattern matches and NO false-positive
    # pattern matches (same boolean structure as an OR-chain of searches).
    is_lean_prover = any(p.search(text) for p in _PROVER_PATTERNS)
    is_false_positive = bool(_FALSE_POSITIVE_RE.search(text))
    if is_lean_prover and not is_false_positive:
        lean_prover_papers.append(paper)
print(f"\nPapers using Lean (theorem prover) for formalization: {len(lean_prover_papers)}")
#%% Print all Lean prover papers with their matching sentences for verification
_lean_re = re.compile(r"\blean\b", re.IGNORECASE)
_sent_split_re = re.compile(r"(?<=[.!?])\s+")  # rough sentence-boundary heuristic

for paper in lean_prover_papers:
    pid = paper.get("id", "?")
    title = (paper.get("title") or "").replace("\n", " ").strip()
    abstract = (paper.get("abstract") or "").replace("\n", " ").strip()
    comments = (paper.get("comments") or "").replace("\n", " ").strip()
    categories = paper.get("categories", "")
    update_date = paper.get("update_date", "")
    full_text = title + ". " + abstract + " " + comments
    sentences = _sent_split_re.split(full_text)
    matching_sentences = [
        s.strip() for s in sentences if _lean_re.search(s)
    ]
    print(f"\n[{pid}] {title}")
    print(f" categories: {categories} date: {update_date}")
    for s in matching_sentences:
        print(f" >>> {s}")
#%% Summary: count by year (math papers only)
from collections import Counter
import matplotlib.pyplot as plt

year_counts_all = Counter()
year_counts_math = Counter()
for paper in lean_prover_papers:
    date = paper.get("update_date", "")
    if not date:
        # No date at all: the paper can't be binned by year.
        continue
    # Assumes update_date starts with a 4-digit year (e.g. ISO "YYYY-MM-DD")
    # — TODO confirm against the dataset schema.
    year = date[:4]
    year_counts_all[year] += 1
    cats = (paper.get("categories") or "").split()
    if any(c.startswith("math.") for c in cats):
        year_counts_math[year] += 1

print("\nLean prover papers by year (math.* only):")
for year in sorted(year_counts_math):
    print(f" {year}: {year_counts_math[year]}")
print(f" TOTAL: {sum(year_counts_math.values())}")
print("\nLean prover papers by year (all):")
for year in sorted(year_counts_all):
    print(f" {year}: {year_counts_all[year]}")
print(f" TOTAL: {sum(year_counts_all.values())}")
#%% Plot: Lean formalization papers by year
# Union of years so both series align on the same x-axis even if one
# counter is missing a year.
all_years = sorted(set(year_counts_all) | set(year_counts_math))
counts_all = [year_counts_all[y] for y in all_years]
counts_math = [year_counts_math[y] for y in all_years]

fig, ax = plt.subplots(figsize=(8, 3), dpi=300)
bar_width = 0.35
x = range(len(all_years))
# Grouped bars: "all" shifted left of each tick, "math only" shifted right.
ax.bar([i - bar_width / 2 for i in x], counts_all, bar_width, label=f"All Lean prover papers ({sum(counts_all)})")
ax.bar([i + bar_width / 2 for i in x], counts_math, bar_width, label=f"math.* papers only ({sum(counts_math)})")
ax.set_xlabel("Year")
ax.set_ylabel("Number of papers")
ax.set_title("ArXiv papers using Lean for formalization, by year")
ax.set_xticks(list(x))
ax.set_xticklabels(all_years)
ax.legend()
# Numeric labels just above each bar.
for i, (a, m) in enumerate(zip(counts_all, counts_math)):
    ax.text(i - bar_width / 2, a + 1, str(a), ha="center", fontsize=8)
    ax.text(i + bar_width / 2, m + 1, str(m), ha="center", fontsize=8)
plt.tight_layout()
plt.show()
#%% Category breakdown: math vs CS
math_only = 0
cs_only = 0
both_math_cs = 0
neither = 0
for paper in lean_prover_papers:
    cats = (paper.get("categories") or "").split()
    has_math = any(c.startswith("math.") for c in cats)
    has_cs = any(c.startswith("cs.") for c in cats)
    if has_math and has_cs:
        both_math_cs += 1
    elif has_math:
        math_only += 1
    elif has_cs:
        cs_only += 1
    else:
        neither += 1

# NOTE(review): total == 0 would raise ZeroDivisionError below; assumed
# non-empty because earlier cells found matching papers.
total = len(lean_prover_papers)
print("\nCategory breakdown of Lean prover papers:")
print(f" math.* only: {math_only}")
print(f" cs.* only: {cs_only}")
print(f" Both math.* & cs.*: {both_math_cs}")
print(f" Neither: {neither}")
print()
print(f" Has any math.* category: {math_only + both_math_cs} ({100*(math_only + both_math_cs)/total:.1f}%)")
print(f" Has any cs.* category: {cs_only + both_math_cs} ({100*(cs_only + both_math_cs)/total:.1f}%)")
#%% Math subcategory distribution
# Count every math.* category a paper carries (a paper may contribute to
# several subcategories).
math_subcats = Counter()
for paper in lean_prover_papers:
    cats = (paper.get("categories") or "").split()
    for c in cats:
        if c.startswith("math."):
            math_subcats[c] += 1

print("\nMath subcategories among Lean prover papers:")
for cat, count in math_subcats.most_common():
    print(f" {cat}: {count}")
#%% Cross-check: verify known Lean formalization papers are captured
# These are known published Lean papers (from conferences like ITP, CPP, CICM, journals, etc.)
# We search by key title words to see which ones have arxiv preprints in our dataset.
# Each entry is (label, terms); terms are REGEX fragments matched against the
# lowercased title+abstract (note e.g. "gromov.hausdorff" and ".{0,5}").
known_papers = [
    ("Brasca et al. - Fermat regular primes (2025)", ["fermat", "regular", "lean"]),
    ("Loeffler & Stoll - zeta L-functions (2025)", ["zeta", "l-function", "lean"]),
    ("Mercuri - local compactness adele (2025)", ["local compactness", "adele"]),
    ("Riou - derived categories (2025)", ["derived categor", "lean"]),
    ("Asgeirsson et al. - condensed mathematics (2024)", ["condensed", "formal"]),
    ("Asgeirsson - Nobeling (2024)", ["solid abelian"]),
    ("Basold et al. - directed Van Kampen (2024)", ["van kampen", "lean"]),
    ("Bhat et al. - peephole SSA (2024)", ["peephole", "ssa"]),
    ("Clune et al. - Duper superposition (2024)", ["duper", "superposition"]),
    ("Frutos-Fernandez & Nuccio - discrete valuation (2024)", ["discrete valuation", "local field"]),
    ("Ezeh - graphical rewriting monoidal (2024)", ["graphical rewriting", "monoidal"]),
    ("Massot - teaching lean (2024)", ["teaching", "lean", "controlled natural"]),
    ("Obendrauf et al. - coalition logic (2024)", ["coalition logic", "lean"]),
    ("Subercaseaux et al. - empty hexagon (2024)", ["empty hexagon"]),
    ("Doorn & Macbeth - Gagliardo-Nirenberg (2024)", ["gagliardo"]),
    ("Avigad et al. - blockchain compiler (2023)", ["proof-producing", "blockchain"]),
    ("Angdinata & Xu - Weierstrass elliptic (2023)", ["weierstrass", "elliptic", "group law"]),
    ("Baanen et al. - class group Mordell (2023)", ["class group", "mordell"]),
    ("Best et al. - Fermat regular ITP2023 (2023)", ["fermat", "regular"]),
    ("Clune - Keller conjecture (2023)", ["keller", "conjecture", "formal"]),
    ("Frutos-Fernandez - norm extensions (2023)", ["norm extension"]),
    ("Doorn et al. - h-principle sphere eversion (2023)", ["h-principle", "sphere eversion"]),
    ("Dvorak & Blanchette - closure grammars (2023)", ["closure", "grammar", "formal"]),
    ("Livingston - group cohomology (2023)", ["group cohomology", "lean"]),
    ("Mehta - Sharkovsky (2023)", ["sharkovsky"]),
    ("Nash - Gallagher ergodic (2023)", ["gallagher", "ergodic"]),
    ("Porncharoenwase et al. - pretty printer (2023)", ["pretty", "printer"]),
    ("Wieser - multiple inheritance (2023)", ["multiple.{0,5}inheritance", "algebraic hierarch"]),
    ("Ying & Degenne - Doob martingale (2023)", ["doob", "martingale"]),
    ("Zhang - Proj construction (2023)", ["proj construction", "lean"]),
    ("Baanen et al. - Dedekind domains JAR (2022)", ["dedekind domain", "class group"]),
    ("Browning & Lutz - Galois theory (2022)", ["galois theory", "formal"]),
    ("Buzzard et al. - Schemes (2022)", ["schemes", "lean"]),
    ("Frutos-Fernandez - adeles (2022)", ["ring of ad", "lean"]),
    ("Dillies & Mehta - Szemeredi (2022)", ["szemer", "regularity"]),
    ("Dupuis et al. - semilinear (2022)", ["semilinear", "functional analysis"]),
    ("Gouezel - change of variables (2022)", ["change of variable", "mathlib"]),
    ("Kjos-Hanssen et al. - Tversky (2022)", ["tversky", "jaccard"]),
    ("Kudryashov - divergence Cauchy (2022)", ["divergence theorem", "cauchy integral"]),
    ("Mehta - Kruskal-Katona (2022)", ["kruskal", "katona"]),
    ("Porncharoenwase et al. - symbolic evaluation (2022)", ["symbolic evaluation", "merging"]),
    ("Wieser & Song - geometric algebra (2022)", ["geometric algebra", "lean"]),
    ("Wieser & Zhang - graded rings (2022)", ["graded ring", "lean"]),
    ("Baanen et al. - Dedekind domains ITP (2021)", ["dedekind domain", "class group"]),
    ("Bordg & Cavalleri - differential geometry (2021)", ["differential geometry", "lean"]),
    ("Commelin & Lewis - Witt vectors (2021)", ["witt vector"]),
    ("van Doorn - Haar measure (2021)", ["haar measure", "formal"]),
    ("Gouezel - Gromov-Hausdorff (2021)", ["gromov.hausdorff", "formal"]),
    ("Gusakov et al. - Hall marriage (2021)", ["hall", "marriage", "lean"]),
    ("Kudryashov - rotation number (2021)", ["rotation number", "lean"]),
    ("Tassarotti et al. - PAC learnability (2021)", ["pac learn", "decision stump"]),
    ("Buzzard et al. - perfectoid (2020)", ["perfectoid"]),
    ("Han & van Doorn - continuum hypothesis (2020)", ["continuum hypothesis"]),
    ("Carneiro - computability (2019)", ["computability", "partial recursive"]),
    ("Dahmen et al. - cap set (2019)", ["cap set"]),
    ("Mir - schemes 2019 (2019)", ["schemes", "lean"]),
    ("Han & van Doorn - forcing (2019)", ["forcing", "continuum hypothesis"]),
    ("Lewis - Hensel p-adic (2019)", ["hensel", "p-adic"]),
    ("Strickland & Bellumat - chromatic localisation (2019)", ["chromatic locali"]),
]

lean_prover_ids = {p.get("id") for p in lean_prover_papers}
found_count = 0
missed_count = 0
not_on_arxiv_count = 0
for label, terms in known_papers:
    # Collect every dataset paper whose title+abstract matches ALL terms.
    matches = []
    for paper in papers_with_lean:
        title = (paper.get("title") or "").replace("\n", " ").lower()
        abstract = (paper.get("abstract") or "").replace("\n", " ").lower()
        text = title + " " + abstract
        if all(re.search(t, text) for t in terms):
            matches.append(paper)
    if matches:
        # A query counts as "captured" if at least one of its matches made it
        # through the prover filter.
        any_captured = False
        for m in matches:
            pid = m.get("id")
            t = (m.get("title") or "").replace("\n", " ").strip()
            captured = pid in lean_prover_ids
            status = "CAPTURED" if captured else "MISSED"
            if captured:
                any_captured = True
            print(f" {status:>8} {label}")
            print(f" -> [{pid}] {t}")
        if any_captured:
            found_count += 1
        else:
            missed_count += 1
    else:
        not_on_arxiv_count += 1
        print(f" NOT ON ARXIV {label}")

print(f"\n--- Cross-check summary (per query, {len(known_papers)} total) ---")
print(f"Captured by our filter: {found_count}")
print(f"On arxiv but MISSED: {missed_count}")
print(f"Not found on arxiv: {not_on_arxiv_count}")
#%% Spot-check specific arxiv IDs
spot_check_ids = [
    ("2410.01466", "Fermat's Last Theorem for regular primes (2024, Lean4)"),
    ("2405.19270", "Formalising the Local Compactness of the Adele Ring (2024, Lean4)"),
    ("2503.00959", "Formalizing zeta and L-functions in Lean (2025, Lean/Mathlib)"),
    ("2507.05327", "A Formalization of Divided Powers in Lean (2025)"),
    ("2305.08955", "Fermat's Last Theorem for regular primes (2023, Lean)"),
    ("2302.10640", "Elementary Formal Proof of Group Law on Elliptic Curves (2023, Lean3)"),
    ("2210.07746", "Formalising the h-principle and sphere eversion (2022, Lean3)"),
    ("2101.02602", "Schemes in Lean (2021/2022, Lean)"),
]

# Build the ID set ONCE: the original rebuilt this set comprehension inside
# the loop, re-scanning all of papers_with_lean for every spot-check ID.
papers_with_lean_ids = {p.get("id") for p in papers_with_lean}

print("\nSpot-check by arxiv ID:")
for arxiv_id, label in spot_check_ids:
    in_dataset = arxiv_id in papers_with_lean_ids
    captured = arxiv_id in lean_prover_ids
    if not in_dataset:
        status = "NOT IN DATASET"
    elif captured:
        status = "CAPTURED"
    else:
        status = "IN DATASET BUT MISSED"
    print(f" {status:<22} [{arxiv_id}] {label}")
# %%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment