Skip to content

Instantly share code, notes, and snippets.

@Vilin97
Created February 1, 2026 17:17
Show Gist options
  • Select an option

  • Save Vilin97/d8e9aaf703d062f44b05e7800f09eeb6 to your computer and use it in GitHub Desktop.

Select an option

Save Vilin97/d8e9aaf703d062f44b05e7800f09eeb6 to your computer and use it in GitHub Desktop.
Get arXiv papers using Lean
#%% Imports and setup
import json
import re

# Path to the Kaggle arXiv metadata dump: one JSON object per line.
DATA_PATH = "arxiv-metadata-oai-snapshot.json"

#%% Load all papers that contain the word "lean" (case-insensitive) anywhere in title/abstract/comments
# Compile once outside the loop: this pattern runs against every line of a
# multi-gigabyte file, so per-line re.search(pattern_string, ...) lookups add up.
LEAN_WORD_RE = re.compile(r"\blean\b", re.IGNORECASE)

papers_with_lean = []
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        try:
            paper = json.loads(line)
        except json.JSONDecodeError:
            # Best-effort scan: skip malformed lines instead of aborting.
            continue
        # Fields may be missing or explicitly null; normalize to "".
        title = paper.get("title") or ""
        abstract = paper.get("abstract") or ""
        comments = paper.get("comments") or ""
        text = title + " " + abstract + " " + comments
        if LEAN_WORD_RE.search(text):
            papers_with_lean.append(paper)
print(f"Total papers containing the word 'lean': {len(papers_with_lean)}")
#%% For each paper, extract and print the sentence(s) containing "lean"
# This lets us manually inspect which uses refer to the Lean theorem prover.
# Precompile both patterns: they run once per sentence per paper.
_lean_word_re = re.compile(r"\blean\b", re.IGNORECASE)
_sentence_split_re = re.compile(r"(?<=[.!?])\s+")  # rough sentence-boundary heuristic

for paper in papers_with_lean:
    pid = paper.get("id", "?")
    # Flatten the hard-wrapped arXiv fields to single lines.
    title = (paper.get("title") or "").replace("\n", " ").strip()
    abstract = (paper.get("abstract") or "").replace("\n", " ").strip()
    comments = (paper.get("comments") or "").replace("\n", " ").strip()
    # Split into sentences (rough heuristic)
    full_text = title + ". " + abstract + " " + comments
    sentences = _sentence_split_re.split(full_text)
    matching_sentences = [
        s.strip() for s in sentences if _lean_word_re.search(s)
    ]
    print(f"\n[{pid}] {title}")
    for s in matching_sentences:
        print(f" >>> {s}")
#%% Now filter: keep only papers where "lean" refers to the Lean theorem prover
# Strategy: match Lean 3, Lean 4, Lean prover, Lean proof assistant,
# or Lean near formalization/mathlib/theorem-proving keywords.
# All patterns are matched against already-lowercased text, so no IGNORECASE flag.
# Precompiled once here instead of inside the per-paper loop.
_PROVER_PATTERNS = [
    re.compile(r"\blean\s*[34]\b"),                 # explicit "Lean 3" / "Lean 4"
    re.compile(r"\blean\s+(theorem\s+)?prover\b"),
    re.compile(r"\blean\s+proof\s+assistant\b"),
    re.compile(r"\bmathlib\b"),                     # Lean's mathematics library
    re.compile(r"\blean'?s?\s+math"),
    # "in Lean" / "using Lean" within 80 chars of a formalization keyword,
    # in either order:
    re.compile(r"(formal|verif|proof|theorem|tactic|interactive).{0,80}\blean\b"),
    re.compile(r"\blean\b.{0,80}(formal|verif|proof|theorem|tactic|interactive)"),
]
# Exclude known false positives: "lean manufacturing", "lean body", "lean mass",
# "lean tree", "lean and", "lean mixture", etc.
_FALSE_POSITIVE_RE = re.compile(
    r"\blean\s+(manufactur|body|mass|mixture|burn|tree|decomposition|startup|management|enterprise|agile|six\s*sigma|production|supply|process|operation|principle|thinking|approach(?!\s+to\s+(formal|verif|proof)))"
)

lean_prover_papers = []
for paper in papers_with_lean:
    title = (paper.get("title") or "").replace("\n", " ")
    abstract = (paper.get("abstract") or "").replace("\n", " ")
    comments = (paper.get("comments") or "").replace("\n", " ")
    text = (title + " " + abstract + " " + comments).lower()
    # Keep the paper if ANY prover pattern matches and NO false-positive
    # pattern matches (same boolean structure as an OR-chain of searches).
    is_lean_prover = any(p.search(text) for p in _PROVER_PATTERNS)
    is_false_positive = bool(_FALSE_POSITIVE_RE.search(text))
    if is_lean_prover and not is_false_positive:
        lean_prover_papers.append(paper)
print(f"\nPapers using Lean (theorem prover) for formalization: {len(lean_prover_papers)}")
#%% Print all Lean prover papers with their matching sentences for verification
_lean_re = re.compile(r"\blean\b", re.IGNORECASE)
_sent_split_re = re.compile(r"(?<=[.!?])\s+")  # rough sentence-boundary heuristic

for paper in lean_prover_papers:
    pid = paper.get("id", "?")
    title = (paper.get("title") or "").replace("\n", " ").strip()
    abstract = (paper.get("abstract") or "").replace("\n", " ").strip()
    comments = (paper.get("comments") or "").replace("\n", " ").strip()
    categories = paper.get("categories", "")
    update_date = paper.get("update_date", "")
    full_text = title + ". " + abstract + " " + comments
    sentences = _sent_split_re.split(full_text)
    matching_sentences = [
        s.strip() for s in sentences if _lean_re.search(s)
    ]
    print(f"\n[{pid}] {title}")
    print(f" categories: {categories} date: {update_date}")
    for s in matching_sentences:
        print(f" >>> {s}")
#%% Summary: count by year (math papers only)
from collections import Counter
import matplotlib.pyplot as plt

year_counts_all = Counter()
year_counts_math = Counter()
for paper in lean_prover_papers:
    date = paper.get("update_date", "")
    if not date:
        # No date at all: the paper can't be binned by year.
        continue
    # Assumes update_date starts with a 4-digit year (e.g. ISO "YYYY-MM-DD")
    # — TODO confirm against the dataset schema.
    year = date[:4]
    year_counts_all[year] += 1
    cats = (paper.get("categories") or "").split()
    if any(c.startswith("math.") for c in cats):
        year_counts_math[year] += 1

print("\nLean prover papers by year (math.* only):")
for year in sorted(year_counts_math):
    print(f" {year}: {year_counts_math[year]}")
print(f" TOTAL: {sum(year_counts_math.values())}")
print("\nLean prover papers by year (all):")
for year in sorted(year_counts_all):
    print(f" {year}: {year_counts_all[year]}")
print(f" TOTAL: {sum(year_counts_all.values())}")
#%% Plot: Lean formalization papers by year
# Union of years so both series align on the same x-axis even if one
# counter is missing a year.
all_years = sorted(set(year_counts_all) | set(year_counts_math))
counts_all = [year_counts_all[y] for y in all_years]
counts_math = [year_counts_math[y] for y in all_years]

fig, ax = plt.subplots(figsize=(8, 3), dpi=300)
bar_width = 0.35
x = range(len(all_years))
# Grouped bars: "all" shifted left of each tick, "math only" shifted right.
ax.bar([i - bar_width / 2 for i in x], counts_all, bar_width, label=f"All Lean prover papers ({sum(counts_all)})")
ax.bar([i + bar_width / 2 for i in x], counts_math, bar_width, label=f"math.* papers only ({sum(counts_math)})")
ax.set_xlabel("Year")
ax.set_ylabel("Number of papers")
ax.set_title("ArXiv papers using Lean for formalization, by year")
ax.set_xticks(list(x))
ax.set_xticklabels(all_years)
ax.legend()
# Numeric labels just above each bar.
for i, (a, m) in enumerate(zip(counts_all, counts_math)):
    ax.text(i - bar_width / 2, a + 1, str(a), ha="center", fontsize=8)
    ax.text(i + bar_width / 2, m + 1, str(m), ha="center", fontsize=8)
plt.tight_layout()
plt.show()
#%% Category breakdown: math vs CS
math_only = 0
cs_only = 0
both_math_cs = 0
neither = 0
for paper in lean_prover_papers:
    cats = (paper.get("categories") or "").split()
    has_math = any(c.startswith("math.") for c in cats)
    has_cs = any(c.startswith("cs.") for c in cats)
    if has_math and has_cs:
        both_math_cs += 1
    elif has_math:
        math_only += 1
    elif has_cs:
        cs_only += 1
    else:
        neither += 1

# NOTE(review): total == 0 would raise ZeroDivisionError below; assumed
# non-empty because earlier cells found matching papers.
total = len(lean_prover_papers)
print("\nCategory breakdown of Lean prover papers:")
print(f" math.* only: {math_only}")
print(f" cs.* only: {cs_only}")
print(f" Both math.* & cs.*: {both_math_cs}")
print(f" Neither: {neither}")
print()
print(f" Has any math.* category: {math_only + both_math_cs} ({100*(math_only + both_math_cs)/total:.1f}%)")
print(f" Has any cs.* category: {cs_only + both_math_cs} ({100*(cs_only + both_math_cs)/total:.1f}%)")
#%% Math subcategory distribution
# Count every math.* category a paper carries (a paper may contribute to
# several subcategories).
math_subcats = Counter()
for paper in lean_prover_papers:
    cats = (paper.get("categories") or "").split()
    for c in cats:
        if c.startswith("math."):
            math_subcats[c] += 1

print("\nMath subcategories among Lean prover papers:")
for cat, count in math_subcats.most_common():
    print(f" {cat}: {count}")
#%% Cross-check: verify known Lean formalization papers are captured
# These are known published Lean papers (from conferences like ITP, CPP, CICM, journals, etc.)
# We search by key title words to see which ones have arxiv preprints in our dataset.
# Each entry is (label, terms); terms are REGEX fragments matched against the
# lowercased title+abstract (note e.g. "gromov.hausdorff" and ".{0,5}").
known_papers = [
    ("Brasca et al. - Fermat regular primes (2025)", ["fermat", "regular", "lean"]),
    ("Loeffler & Stoll - zeta L-functions (2025)", ["zeta", "l-function", "lean"]),
    ("Mercuri - local compactness adele (2025)", ["local compactness", "adele"]),
    ("Riou - derived categories (2025)", ["derived categor", "lean"]),
    ("Asgeirsson et al. - condensed mathematics (2024)", ["condensed", "formal"]),
    ("Asgeirsson - Nobeling (2024)", ["solid abelian"]),
    ("Basold et al. - directed Van Kampen (2024)", ["van kampen", "lean"]),
    ("Bhat et al. - peephole SSA (2024)", ["peephole", "ssa"]),
    ("Clune et al. - Duper superposition (2024)", ["duper", "superposition"]),
    ("Frutos-Fernandez & Nuccio - discrete valuation (2024)", ["discrete valuation", "local field"]),
    ("Ezeh - graphical rewriting monoidal (2024)", ["graphical rewriting", "monoidal"]),
    ("Massot - teaching lean (2024)", ["teaching", "lean", "controlled natural"]),
    ("Obendrauf et al. - coalition logic (2024)", ["coalition logic", "lean"]),
    ("Subercaseaux et al. - empty hexagon (2024)", ["empty hexagon"]),
    ("Doorn & Macbeth - Gagliardo-Nirenberg (2024)", ["gagliardo"]),
    ("Avigad et al. - blockchain compiler (2023)", ["proof-producing", "blockchain"]),
    ("Angdinata & Xu - Weierstrass elliptic (2023)", ["weierstrass", "elliptic", "group law"]),
    ("Baanen et al. - class group Mordell (2023)", ["class group", "mordell"]),
    ("Best et al. - Fermat regular ITP2023 (2023)", ["fermat", "regular"]),
    ("Clune - Keller conjecture (2023)", ["keller", "conjecture", "formal"]),
    ("Frutos-Fernandez - norm extensions (2023)", ["norm extension"]),
    ("Doorn et al. - h-principle sphere eversion (2023)", ["h-principle", "sphere eversion"]),
    ("Dvorak & Blanchette - closure grammars (2023)", ["closure", "grammar", "formal"]),
    ("Livingston - group cohomology (2023)", ["group cohomology", "lean"]),
    ("Mehta - Sharkovsky (2023)", ["sharkovsky"]),
    ("Nash - Gallagher ergodic (2023)", ["gallagher", "ergodic"]),
    ("Porncharoenwase et al. - pretty printer (2023)", ["pretty", "printer"]),
    ("Wieser - multiple inheritance (2023)", ["multiple.{0,5}inheritance", "algebraic hierarch"]),
    ("Ying & Degenne - Doob martingale (2023)", ["doob", "martingale"]),
    ("Zhang - Proj construction (2023)", ["proj construction", "lean"]),
    ("Baanen et al. - Dedekind domains JAR (2022)", ["dedekind domain", "class group"]),
    ("Browning & Lutz - Galois theory (2022)", ["galois theory", "formal"]),
    ("Buzzard et al. - Schemes (2022)", ["schemes", "lean"]),
    ("Frutos-Fernandez - adeles (2022)", ["ring of ad", "lean"]),
    ("Dillies & Mehta - Szemeredi (2022)", ["szemer", "regularity"]),
    ("Dupuis et al. - semilinear (2022)", ["semilinear", "functional analysis"]),
    ("Gouezel - change of variables (2022)", ["change of variable", "mathlib"]),
    ("Kjos-Hanssen et al. - Tversky (2022)", ["tversky", "jaccard"]),
    ("Kudryashov - divergence Cauchy (2022)", ["divergence theorem", "cauchy integral"]),
    ("Mehta - Kruskal-Katona (2022)", ["kruskal", "katona"]),
    ("Porncharoenwase et al. - symbolic evaluation (2022)", ["symbolic evaluation", "merging"]),
    ("Wieser & Song - geometric algebra (2022)", ["geometric algebra", "lean"]),
    ("Wieser & Zhang - graded rings (2022)", ["graded ring", "lean"]),
    ("Baanen et al. - Dedekind domains ITP (2021)", ["dedekind domain", "class group"]),
    ("Bordg & Cavalleri - differential geometry (2021)", ["differential geometry", "lean"]),
    ("Commelin & Lewis - Witt vectors (2021)", ["witt vector"]),
    ("van Doorn - Haar measure (2021)", ["haar measure", "formal"]),
    ("Gouezel - Gromov-Hausdorff (2021)", ["gromov.hausdorff", "formal"]),
    ("Gusakov et al. - Hall marriage (2021)", ["hall", "marriage", "lean"]),
    ("Kudryashov - rotation number (2021)", ["rotation number", "lean"]),
    ("Tassarotti et al. - PAC learnability (2021)", ["pac learn", "decision stump"]),
    ("Buzzard et al. - perfectoid (2020)", ["perfectoid"]),
    ("Han & van Doorn - continuum hypothesis (2020)", ["continuum hypothesis"]),
    ("Carneiro - computability (2019)", ["computability", "partial recursive"]),
    ("Dahmen et al. - cap set (2019)", ["cap set"]),
    ("Mir - schemes 2019 (2019)", ["schemes", "lean"]),
    ("Han & van Doorn - forcing (2019)", ["forcing", "continuum hypothesis"]),
    ("Lewis - Hensel p-adic (2019)", ["hensel", "p-adic"]),
    ("Strickland & Bellumat - chromatic localisation (2019)", ["chromatic locali"]),
]

lean_prover_ids = {p.get("id") for p in lean_prover_papers}
found_count = 0
missed_count = 0
not_on_arxiv_count = 0
for label, terms in known_papers:
    # Collect every dataset paper whose title+abstract matches ALL terms.
    matches = []
    for paper in papers_with_lean:
        title = (paper.get("title") or "").replace("\n", " ").lower()
        abstract = (paper.get("abstract") or "").replace("\n", " ").lower()
        text = title + " " + abstract
        if all(re.search(t, text) for t in terms):
            matches.append(paper)
    if matches:
        # A query counts as "captured" if at least one of its matches made it
        # through the prover filter.
        any_captured = False
        for m in matches:
            pid = m.get("id")
            t = (m.get("title") or "").replace("\n", " ").strip()
            captured = pid in lean_prover_ids
            status = "CAPTURED" if captured else "MISSED"
            if captured:
                any_captured = True
            print(f" {status:>8} {label}")
            print(f" -> [{pid}] {t}")
        if any_captured:
            found_count += 1
        else:
            missed_count += 1
    else:
        not_on_arxiv_count += 1
        print(f" NOT ON ARXIV {label}")

print(f"\n--- Cross-check summary (per query, {len(known_papers)} total) ---")
print(f"Captured by our filter: {found_count}")
print(f"On arxiv but MISSED: {missed_count}")
print(f"Not found on arxiv: {not_on_arxiv_count}")
#%% Spot-check specific arxiv IDs
spot_check_ids = [
    ("2410.01466", "Fermat's Last Theorem for regular primes (2024, Lean4)"),
    ("2405.19270", "Formalising the Local Compactness of the Adele Ring (2024, Lean4)"),
    ("2503.00959", "Formalizing zeta and L-functions in Lean (2025, Lean/Mathlib)"),
    ("2507.05327", "A Formalization of Divided Powers in Lean (2025)"),
    ("2305.08955", "Fermat's Last Theorem for regular primes (2023, Lean)"),
    ("2302.10640", "Elementary Formal Proof of Group Law on Elliptic Curves (2023, Lean3)"),
    ("2210.07746", "Formalising the h-principle and sphere eversion (2022, Lean3)"),
    ("2101.02602", "Schemes in Lean (2021/2022, Lean)"),
]

# Build the ID set ONCE: the original rebuilt this set comprehension inside
# the loop, re-scanning all of papers_with_lean for every spot-check ID.
papers_with_lean_ids = {p.get("id") for p in papers_with_lean}

print("\nSpot-check by arxiv ID:")
for arxiv_id, label in spot_check_ids:
    in_dataset = arxiv_id in papers_with_lean_ids
    captured = arxiv_id in lean_prover_ids
    if not in_dataset:
        status = "NOT IN DATASET"
    elif captured:
        status = "CAPTURED"
    else:
        status = "IN DATASET BUT MISSED"
    print(f" {status:<22} [{arxiv_id}] {label}")
# %%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment