Created
February 1, 2026 17:17
-
-
Save Vilin97/d8e9aaf703d062f44b05e7800f09eeb6 to your computer and use it in GitHub Desktop.
Get arXiv papers using Lean
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#%% Imports and setup
import json
import re

# arXiv metadata snapshot (newline-delimited JSON; parsed line-by-line below).
# NOTE(review): presumably the Kaggle OAI-harvest dump — confirm provenance.
DATA_PATH = "arxiv-metadata-oai-snapshot.json"
#%% Load all papers that contain the word "lean" (case-insensitive) anywhere in title/abstract/comments
# Hoisted out of the per-line loop: the pattern is loop-invariant.
LEAN_WORD_RE = re.compile(r"\blean\b", re.IGNORECASE)

papers_with_lean = []
# Explicit encoding: the snapshot is UTF-8; relying on the platform default
# encoding breaks on non-UTF-8 locales (e.g. Windows cp1252).
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        try:
            paper = json.loads(line)
        except json.JSONDecodeError:
            # Skip malformed lines rather than aborting the whole scan.
            continue
        # Fields may be absent or null; coerce to "" so concatenation is safe.
        title = paper.get("title") or ""
        abstract = paper.get("abstract") or ""
        comments = paper.get("comments") or ""
        text = title + " " + abstract + " " + comments
        if LEAN_WORD_RE.search(text):
            papers_with_lean.append(paper)
print(f"Total papers containing the word 'lean': {len(papers_with_lean)}")
#%% For each paper, extract and print the sentence(s) containing "lean"
# This lets us manually inspect which uses refer to the Lean theorem prover
for paper in papers_with_lean:
    pid = paper.get("id", "?")
    # Normalize each field: collapse embedded newlines, trim edges.
    title, abstract, comments = (
        (paper.get(key) or "").replace("\n", " ").strip()
        for key in ("title", "abstract", "comments")
    )
    combined = title + ". " + abstract + " " + comments
    # Rough heuristic: a sentence ends at . ! or ? followed by whitespace.
    hits = [
        chunk.strip()
        for chunk in re.split(r"(?<=[.!?])\s+", combined)
        if re.search(r"\blean\b", chunk, re.IGNORECASE)
    ]
    print(f"\n[{pid}] {title}")
    for hit in hits:
        print(f" >>> {hit}")
#%% Now filter: keep only papers where "lean" refers to the Lean theorem prover
# Strategy: match Lean 3, Lean 4, Lean prover, Lean proof assistant,
# or Lean near formalization/mathlib/theorem-proving keywords.
# All patterns are compiled once here instead of being re-evaluated for every
# paper inside the loop (they are loop-invariant).
PROVER_PATTERNS = [
    re.compile(r"\blean\s*[34]\b"),                 # "Lean 3" / "Lean4"
    re.compile(r"\blean\s+(theorem\s+)?prover\b"),
    re.compile(r"\blean\s+proof\s+assistant\b"),
    re.compile(r"\bmathlib\b"),                     # Lean's mathematics library
    re.compile(r"\blean'?s?\s+math"),
    # "in Lean" or "using Lean" within 80 chars of formalization keywords
    re.compile(r"(formal|verif|proof|theorem|tactic|interactive).{0,80}\blean\b"),
    re.compile(r"\blean\b.{0,80}(formal|verif|proof|theorem|tactic|interactive)"),
]
# Exclude known false positives: "lean manufacturing", "lean body", "lean mass",
# "lean tree", "lean and", "lean mixture", etc.
FALSE_POSITIVE_RE = re.compile(
    r"\blean\s+(manufactur|body|mass|mixture|burn|tree|decomposition|startup|management|enterprise|agile|six\s*sigma|production|supply|process|operation|principle|thinking|approach(?!\s+to\s+(formal|verif|proof)))"
)

lean_prover_papers = []
for paper in papers_with_lean:
    title = (paper.get("title") or "").replace("\n", " ")
    abstract = (paper.get("abstract") or "").replace("\n", " ")
    comments = (paper.get("comments") or "").replace("\n", " ")
    text = (title + " " + abstract + " " + comments).lower()
    # Direct or contextual mentions of Lean as a prover.
    is_lean_prover = any(p.search(text) for p in PROVER_PATTERNS)
    is_false_positive = bool(FALSE_POSITIVE_RE.search(text))
    if is_lean_prover and not is_false_positive:
        lean_prover_papers.append(paper)
print(f"\nPapers using Lean (theorem prover) for formalization: {len(lean_prover_papers)}")
#%% Print all Lean prover papers with their matching sentences for verification
for paper in lean_prover_papers:
    pid = paper.get("id", "?")
    # Normalized copies of the three free-text fields.
    cleaned = {
        key: (paper.get(key) or "").replace("\n", " ").strip()
        for key in ("title", "abstract", "comments")
    }
    categories = paper.get("categories", "")
    update_date = paper.get("update_date", "")
    # Same rough sentence heuristic used for manual inspection earlier.
    blob = cleaned["title"] + ". " + cleaned["abstract"] + " " + cleaned["comments"]
    hits = [
        part.strip()
        for part in re.split(r"(?<=[.!?])\s+", blob)
        if re.search(r"\blean\b", part, re.IGNORECASE)
    ]
    print(f"\n[{pid}] {cleaned['title']}")
    print(f" categories: {categories} date: {update_date}")
    for hit in hits:
        print(f" >>> {hit}")
#%% Summary: count by year (math papers only)
from collections import Counter
import matplotlib.pyplot as plt

year_counts_all = Counter()
year_counts_math = Counter()
for paper in lean_prover_papers:
    date = paper.get("update_date", "")
    if not date:
        continue
    # Year is the leading 4 characters of the update_date string.
    yr = date[:4]
    year_counts_all[yr] += 1
    categories = (paper.get("categories") or "").split()
    if any(cat.startswith("math.") for cat in categories):
        year_counts_math[yr] += 1

print("\nLean prover papers by year (math.* only):")
for yr in sorted(year_counts_math):
    print(f" {yr}: {year_counts_math[yr]}")
print(f" TOTAL: {sum(year_counts_math.values())}")
print("\nLean prover papers by year (all):")
for yr in sorted(year_counts_all):
    print(f" {yr}: {year_counts_all[yr]}")
print(f" TOTAL: {sum(year_counts_all.values())}")
#%% Plot: Lean formalization papers by year
all_years = sorted(set(year_counts_all) | set(year_counts_math))
counts_all = [year_counts_all[y] for y in all_years]
counts_math = [year_counts_math[y] for y in all_years]

fig, ax = plt.subplots(figsize=(8, 3), dpi=300)
bar_width = 0.35
positions = list(range(len(all_years)))
# Side-by-side bars: shift each series half a bar width off the tick.
left = [p - bar_width / 2 for p in positions]
right = [p + bar_width / 2 for p in positions]
ax.bar(left, counts_all, bar_width, label=f"All Lean prover papers ({sum(counts_all)})")
ax.bar(right, counts_math, bar_width, label=f"math.* papers only ({sum(counts_math)})")
ax.set_xlabel("Year")
ax.set_ylabel("Number of papers")
ax.set_title("ArXiv papers using Lean for formalization, by year")
ax.set_xticks(positions)
ax.set_xticklabels(all_years)
ax.legend()
# Annotate every bar with its count just above the bar top.
for xpos, count in zip(left + right, counts_all + counts_math):
    ax.text(xpos, count + 1, str(count), ha="center", fontsize=8)
plt.tight_layout()
plt.show()
#%% Category breakdown: math vs CS
# Partition the Lean prover papers by whether they carry math.* and/or cs.*
# arXiv categories (the "categories" field is a space-separated string).
math_only = 0
cs_only = 0
both_math_cs = 0
neither = 0
for paper in lean_prover_papers:
    cats = (paper.get("categories") or "").split()
    has_math = any(c.startswith("math.") for c in cats)
    has_cs = any(c.startswith("cs.") for c in cats)
    if has_math and has_cs:
        both_math_cs += 1
    elif has_math:
        math_only += 1
    elif has_cs:
        cs_only += 1
    else:
        neither += 1
total = len(lean_prover_papers)
print("\nCategory breakdown of Lean prover papers:")
print(f" math.* only: {math_only}")
print(f" cs.* only: {cs_only}")
print(f" Both math.* & cs.*: {both_math_cs}")
print(f" Neither: {neither}")
print()
if total:
    print(f" Has any math.* category: {math_only + both_math_cs} ({100*(math_only + both_math_cs)/total:.1f}%)")
    print(f" Has any cs.* category: {cs_only + both_math_cs} ({100*(cs_only + both_math_cs)/total:.1f}%)")
else:
    # Guard: the original divided by total unconditionally and raised
    # ZeroDivisionError whenever the filter matched no papers.
    print(" No Lean prover papers found; percentage breakdown skipped.")
#%% Math subcategory distribution
math_subcats = Counter()
for paper in lean_prover_papers:
    # Counter.update with a generator tallies every math.* tag in one pass.
    math_subcats.update(
        cat
        for cat in (paper.get("categories") or "").split()
        if cat.startswith("math.")
    )
print("\nMath subcategories among Lean prover papers:")
for cat, count in math_subcats.most_common():
    print(f" {cat}: {count}")
#%% Cross-check: verify known Lean formalization papers are captured
# These are known published Lean papers (from conferences like ITP, CPP, CICM, journals, etc.)
# We search by key title words to see which ones have arxiv preprints in our dataset.
# Each entry is (label, [regex fragments]); a paper matches only if ALL fragments hit.
known_papers = [
    ("Brasca et al. - Fermat regular primes (2025)", ["fermat", "regular", "lean"]),
    ("Loeffler & Stoll - zeta L-functions (2025)", ["zeta", "l-function", "lean"]),
    ("Mercuri - local compactness adele (2025)", ["local compactness", "adele"]),
    ("Riou - derived categories (2025)", ["derived categor", "lean"]),
    ("Asgeirsson et al. - condensed mathematics (2024)", ["condensed", "formal"]),
    ("Asgeirsson - Nobeling (2024)", ["solid abelian"]),
    ("Basold et al. - directed Van Kampen (2024)", ["van kampen", "lean"]),
    ("Bhat et al. - peephole SSA (2024)", ["peephole", "ssa"]),
    ("Clune et al. - Duper superposition (2024)", ["duper", "superposition"]),
    ("Frutos-Fernandez & Nuccio - discrete valuation (2024)", ["discrete valuation", "local field"]),
    ("Ezeh - graphical rewriting monoidal (2024)", ["graphical rewriting", "monoidal"]),
    ("Massot - teaching lean (2024)", ["teaching", "lean", "controlled natural"]),
    ("Obendrauf et al. - coalition logic (2024)", ["coalition logic", "lean"]),
    ("Subercaseaux et al. - empty hexagon (2024)", ["empty hexagon"]),
    ("Doorn & Macbeth - Gagliardo-Nirenberg (2024)", ["gagliardo"]),
    ("Avigad et al. - blockchain compiler (2023)", ["proof-producing", "blockchain"]),
    ("Angdinata & Xu - Weierstrass elliptic (2023)", ["weierstrass", "elliptic", "group law"]),
    ("Baanen et al. - class group Mordell (2023)", ["class group", "mordell"]),
    ("Best et al. - Fermat regular ITP2023 (2023)", ["fermat", "regular"]),
    ("Clune - Keller conjecture (2023)", ["keller", "conjecture", "formal"]),
    ("Frutos-Fernandez - norm extensions (2023)", ["norm extension"]),
    ("Doorn et al. - h-principle sphere eversion (2023)", ["h-principle", "sphere eversion"]),
    ("Dvorak & Blanchette - closure grammars (2023)", ["closure", "grammar", "formal"]),
    ("Livingston - group cohomology (2023)", ["group cohomology", "lean"]),
    ("Mehta - Sharkovsky (2023)", ["sharkovsky"]),
    ("Nash - Gallagher ergodic (2023)", ["gallagher", "ergodic"]),
    ("Porncharoenwase et al. - pretty printer (2023)", ["pretty", "printer"]),
    ("Wieser - multiple inheritance (2023)", ["multiple.{0,5}inheritance", "algebraic hierarch"]),
    ("Ying & Degenne - Doob martingale (2023)", ["doob", "martingale"]),
    ("Zhang - Proj construction (2023)", ["proj construction", "lean"]),
    ("Baanen et al. - Dedekind domains JAR (2022)", ["dedekind domain", "class group"]),
    ("Browning & Lutz - Galois theory (2022)", ["galois theory", "formal"]),
    ("Buzzard et al. - Schemes (2022)", ["schemes", "lean"]),
    ("Frutos-Fernandez - adeles (2022)", ["ring of ad", "lean"]),
    ("Dillies & Mehta - Szemeredi (2022)", ["szemer", "regularity"]),
    ("Dupuis et al. - semilinear (2022)", ["semilinear", "functional analysis"]),
    ("Gouezel - change of variables (2022)", ["change of variable", "mathlib"]),
    ("Kjos-Hanssen et al. - Tversky (2022)", ["tversky", "jaccard"]),
    ("Kudryashov - divergence Cauchy (2022)", ["divergence theorem", "cauchy integral"]),
    ("Mehta - Kruskal-Katona (2022)", ["kruskal", "katona"]),
    ("Porncharoenwase et al. - symbolic evaluation (2022)", ["symbolic evaluation", "merging"]),
    ("Wieser & Song - geometric algebra (2022)", ["geometric algebra", "lean"]),
    ("Wieser & Zhang - graded rings (2022)", ["graded ring", "lean"]),
    ("Baanen et al. - Dedekind domains ITP (2021)", ["dedekind domain", "class group"]),
    ("Bordg & Cavalleri - differential geometry (2021)", ["differential geometry", "lean"]),
    ("Commelin & Lewis - Witt vectors (2021)", ["witt vector"]),
    ("van Doorn - Haar measure (2021)", ["haar measure", "formal"]),
    ("Gouezel - Gromov-Hausdorff (2021)", ["gromov.hausdorff", "formal"]),
    ("Gusakov et al. - Hall marriage (2021)", ["hall", "marriage", "lean"]),
    ("Kudryashov - rotation number (2021)", ["rotation number", "lean"]),
    ("Tassarotti et al. - PAC learnability (2021)", ["pac learn", "decision stump"]),
    ("Buzzard et al. - perfectoid (2020)", ["perfectoid"]),
    ("Han & van Doorn - continuum hypothesis (2020)", ["continuum hypothesis"]),
    ("Carneiro - computability (2019)", ["computability", "partial recursive"]),
    ("Dahmen et al. - cap set (2019)", ["cap set"]),
    ("Mir - schemes 2019 (2019)", ["schemes", "lean"]),
    ("Han & van Doorn - forcing (2019)", ["forcing", "continuum hypothesis"]),
    ("Lewis - Hensel p-adic (2019)", ["hensel", "p-adic"]),
    ("Strickland & Bellumat - chromatic localisation (2019)", ["chromatic locali"]),
]
# Precompute each candidate's lowercased title+abstract ONCE. The original
# re-fetched, re-joined and re-lowercased every paper for every query,
# doing that cleanup O(queries * papers) times.
searchable = []
for paper in papers_with_lean:
    title = (paper.get("title") or "").replace("\n", " ").lower()
    abstract = (paper.get("abstract") or "").replace("\n", " ").lower()
    searchable.append((paper, title + " " + abstract))
lean_prover_ids = {p.get("id") for p in lean_prover_papers}
found_count = 0
missed_count = 0
not_on_arxiv_count = 0
for label, terms in known_papers:
    matches = [paper for paper, text in searchable if all(re.search(t, text) for t in terms)]
    if matches:
        # A query counts as "found" if at least one of its arXiv matches
        # was captured by the prover filter above.
        any_captured = False
        for m in matches:
            pid = m.get("id")
            t = (m.get("title") or "").replace("\n", " ").strip()
            captured = pid in lean_prover_ids
            status = "CAPTURED" if captured else "MISSED"
            if captured:
                any_captured = True
            print(f" {status:>8} {label}")
            print(f" -> [{pid}] {t}")
        if any_captured:
            found_count += 1
        else:
            missed_count += 1
    else:
        not_on_arxiv_count += 1
        print(f" NOT ON ARXIV {label}")
print(f"\n--- Cross-check summary (per query, {len(known_papers)} total) ---")
print(f"Captured by our filter: {found_count}")
print(f"On arxiv but MISSED: {missed_count}")
print(f"Not found on arxiv: {not_on_arxiv_count}")
#%% Spot-check specific arxiv IDs
# Hand-picked IDs of papers we know formalize mathematics in Lean.
spot_check_ids = [
    ("2410.01466", "Fermat's Last Theorem for regular primes (2024, Lean4)"),
    ("2405.19270", "Formalising the Local Compactness of the Adele Ring (2024, Lean4)"),
    ("2503.00959", "Formalizing zeta and L-functions in Lean (2025, Lean/Mathlib)"),
    ("2507.05327", "A Formalization of Divided Powers in Lean (2025)"),
    ("2305.08955", "Fermat's Last Theorem for regular primes (2023, Lean)"),
    ("2302.10640", "Elementary Formal Proof of Group Law on Elliptic Curves (2023, Lean3)"),
    ("2210.07746", "Formalising the h-principle and sphere eversion (2022, Lean3)"),
    ("2101.02602", "Schemes in Lean (2021/2022, Lean)"),
]
print("\nSpot-check by arxiv ID:")
# Build the membership set once; the original rebuilt this O(n) set
# comprehension on every loop iteration.
papers_with_lean_ids = {p.get("id") for p in papers_with_lean}
for arxiv_id, label in spot_check_ids:
    if arxiv_id not in papers_with_lean_ids:
        status = "NOT IN DATASET"
    elif arxiv_id in lean_prover_ids:
        status = "CAPTURED"
    else:
        status = "IN DATASET BUT MISSED"
    print(f" {status:<22} [{arxiv_id}] {label}")
| # %% |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment