arXiv link to Markdown via Mistral OCR (h/t @simonw), by @willccbb
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "click",
# "mistralai",
# "markdown",
# "requests",
# "beautifulsoup4",
# ]
# ///
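
# Usage (a sketch): the PEP 723 block above lets `uv run arxiv_ocr.py <arxiv-url>` resolve
# the dependencies automatically (assuming uv is installed and MISTRAL_API_KEY is set);
# plain `python arxiv_ocr.py <arxiv-url>` also works once the dependencies are installed.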
import os
import json
import base64
import re
import requests
import tempfile
from pathlib import Path
import unicodedata
from urllib.parse import urlparse
import click
import markdown
from bs4 import BeautifulSoup
from mistralai import Mistral
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
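
# Overall flow: extract the arXiv ID from the URL, fetch metadata and BibTeX, download the
# PDF, upload it to Mistral for OCR, then write markdown/HTML plus any extracted images
# into papers/<sanitized-title>/.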


def get_arxiv_pdf(arxiv_id):
    """
    Download the PDF for an arXiv ID

    Args:
        arxiv_id (str): arXiv ID

    Returns:
        bytes: PDF content
    """
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(pdf_url, headers=headers)
    if response.status_code != 200:
        raise ValueError(f"Could not download PDF. Status code: {response.status_code}")
    return response.content


def extract_arxiv_id(url):
    """
    Extract arXiv ID from an arXiv URL (abs, PDF, or HTML)

    Args:
        url (str): arXiv URL

    Returns:
        str: arXiv ID
    """
    # Parse the URL
    parsed_url = urlparse(url)

    # Check if it's an arXiv URL
    if 'arxiv.org' not in parsed_url.netloc:
        raise ValueError("Not an arXiv URL")

    # Extract the arXiv ID
    path_parts = parsed_url.path.strip('/').split('/')

    # Handle different URL formats
    arxiv_id = None
    if 'abs' in path_parts:
        # Format: arxiv.org/abs/1234.56789
        idx = path_parts.index('abs')
        if idx + 1 < len(path_parts):
            arxiv_id = path_parts[idx + 1]
    elif 'pdf' in path_parts:
        # Format: arxiv.org/pdf/1234.56789.pdf
        idx = path_parts.index('pdf')
        if idx + 1 < len(path_parts):
            arxiv_id = path_parts[idx + 1].replace('.pdf', '')
    elif 'html' in path_parts:
        # Format: arxiv.org/html/1234.56789
        idx = path_parts.index('html')
        if idx + 1 < len(path_parts):
            arxiv_id = path_parts[idx + 1]
    else:
        # Try to find the ID in the last part of the path
        last_part = path_parts[-1]
        if re.match(r'\d+\.\d+', last_part):
            arxiv_id = last_part

    if not arxiv_id:
        raise ValueError("Could not extract arXiv ID from URL")

    return arxiv_id
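
# For illustration: extract_arxiv_id("https://arxiv.org/abs/1706.03762") returns
# "1706.03762"; the /pdf/ and /html/ URL forms are handled as well.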


def get_arxiv_bibtex(arxiv_id):
    """
    Download the BibTeX citation for an arXiv ID

    Args:
        arxiv_id (str): arXiv ID

    Returns:
        str: BibTeX citation text or None if not available
    """
    # arXiv's BibTeX endpoint
    bibtex_url = f"https://arxiv.org/bibtex/{arxiv_id}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(bibtex_url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to get BibTeX: HTTP {response.status_code}")
            return None

        # Extract BibTeX content from the response
        soup = BeautifulSoup(response.text, 'html.parser')

        # First try to find the textarea that usually contains the BibTeX
        textarea = soup.find('textarea')
        if textarea:
            return textarea.get_text().strip()

        # If no textarea, try to extract pre-formatted text
        pre = soup.find('pre')
        if pre:
            return pre.get_text().strip()

        # Last resort: generate a basic BibTeX entry ourselves
        print("Could not find BibTeX on the page, generating a basic entry")

        # We'll need the title and authors
        abs_url = f"https://arxiv.org/abs/{arxiv_id}"
        abs_response = requests.get(abs_url, headers=headers)
        if abs_response.status_code == 200:
            abs_soup = BeautifulSoup(abs_response.text, 'html.parser')

            # Extract title
            title_elem = abs_soup.find('h1', class_='title')
            title = title_elem.get_text().replace('Title:', '').strip() if title_elem else "Unknown Title"

            # Extract authors
            authors_elem = abs_soup.find('div', class_='authors')
            authors = authors_elem.get_text().replace('Authors:', '').strip() if authors_elem else "Unknown Authors"

            # Extract year (fall back to a fixed year if the dateline can't be parsed)
            year = "2023"
            date_elem = abs_soup.find('div', class_='dateline')
            if date_elem:
                date_match = re.search(r'\b(19|20)\d{2}\b', date_elem.get_text())
                if date_match:
                    year = date_match.group(0)

            # Generate a simple BibTeX entry
            bibtex = f"""@article{{{arxiv_id},
  title = {{{title}}},
  author = {{{authors}}},
  journal = {{arXiv preprint arXiv:{arxiv_id}}},
  year = {{{year}}},
  url = {{https://arxiv.org/abs/{arxiv_id}}},
}}"""
            return bibtex
    except Exception as e:
        print(f"Error fetching BibTeX: {str(e)}")
        return None

    return None


def get_paper_metadata(arxiv_id):
    """
    Get the paper title and submission date from the arXiv abstract page

    Args:
        arxiv_id (str): arXiv ID

    Returns:
        tuple: (Paper title, Submission date)
    """
    abs_url = f"https://arxiv.org/abs/{arxiv_id}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(abs_url, headers=headers)
    if response.status_code != 200:
        return "unknown-paper", None

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract title
    title = "unknown-paper"
    title_element = soup.find('h1', class_='title')
    if title_element:
        title = title_element.get_text().replace('Title:', '').strip()

    # Extract submission date
    submission_date = None
    submission_element = soup.find('div', class_='submission-history')
    if submission_element:
        # Look for the first submission date
        submission_text = submission_element.get_text()
        match = re.search(r'\[v1\]\s+(.+?)\s+\(', submission_text)
        if not match:
            # Try alternative pattern without version
            match = re.search(r'Submitted\s+(.+?)\s+\(', submission_text)
        if match:
            submission_date = match.group(1).strip()

    return title, submission_date
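
# Returns the scraped (title, first-submission date) pair; if the abs page cannot be
# fetched, it falls back to ("unknown-paper", None).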


def sanitize_filename(title):
    """
    Convert title to a clean filename format

    Args:
        title (str): Paper title

    Returns:
        str: Sanitized filename
    """
    # Convert to lowercase (spaces and other characters become hyphens below)
    filename = title.lower()

    # Remove accents
    filename = ''.join(c for c in unicodedata.normalize('NFKD', filename)
                       if not unicodedata.combining(c))

    # Replace non-alphanumeric characters with hyphens
    filename = re.sub(r'[^a-z0-9]', '-', filename)

    # Replace multiple hyphens with a single hyphen
    filename = re.sub(r'-+', '-', filename)

    # Remove leading and trailing hyphens
    filename = filename.strip('-')

    return filename
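
# For illustration: sanitize_filename("Attention Is All You Need") -> "attention-is-all-you-need",
# which becomes the papers/ subdirectory name.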


@click.command()
@click.argument("arxiv_url")
@click.option(
    "--api-key",
    help="Mistral API key. If not provided, will use MISTRAL_API_KEY environment variable.",
    envvar="MISTRAL_API_KEY",
)
@click.option(
    "--model",
    help="Mistral OCR model to use.",
    default="mistral-ocr-latest",
)
@click.option(
    "--json/--no-json",
    "-j/-J",
    "json_output",
    is_flag=True,
    default=False,
    help="Return raw JSON instead of markdown text/Return markdown text (default).",
)
@click.option(
    "--html/--no-html",
    "-h/-H",
    is_flag=True,
    default=False,
    help="Convert markdown to HTML/Keep as markdown (default).",
)
@click.option(
    "--inline-images/--no-inline-images",
    "-i/-I",
    is_flag=True,
    default=False,
    help="Include images inline as data URIs/Don't include inline images (default).",
)
@click.option(
    "--extract-images/--no-extract-images",
    "-e/-E",
    is_flag=True,
    default=True,  # Extract images by default
    help="Extract images as separate files (default)/Skip extracting images.",
)
@click.option(
    "--silent/--verbose",
    "-s/-v",
    is_flag=True,
    default=False,
    help="Suppress all output except for the requested data/Show detailed progress (default).",
)
@click.option(
    "--pages",
    type=int,
    default=20,
    help="Limit processing to the first N pages (default: 20).",
)
def arxiv_to_markdown(
    arxiv_url,
    api_key,
    model,
    json_output,
    html,
    inline_images,
    extract_images,
    silent,
    pages,
):
    """Process an arXiv paper (given its URL) and convert it to markdown using Mistral OCR.

    ARXIV_URL is the URL of the arXiv paper (abs, PDF, or HTML format).

    The script will download the PDF version, process it with OCR, and save the result
    in the papers/ directory with a sanitized filename based on the paper title.

    \b
    Examples:
        python arxiv_ocr.py https://arxiv.org/abs/1706.03762 --api-key YOUR_API_KEY
        python arxiv_ocr.py https://arxiv.org/abs/1706.03762 --pages 5 --html
    """
    # Validate API key
    if not api_key:
        raise click.ClickException("No API key provided and MISTRAL_API_KEY environment variable not set.")

    try:
        # Check if papers directory exists, create if not
        papers_dir = Path("papers")
        papers_dir.mkdir(exist_ok=True)

        # Extract arXiv ID from URL
        if not silent:
            click.echo(f"Extracting arXiv ID from URL: {arxiv_url}", err=True)
        arxiv_id = extract_arxiv_id(arxiv_url)
        if not silent:
            click.echo(f"Found arXiv ID: {arxiv_id}", err=True)

        # Get paper metadata
        if not silent:
            click.echo("Fetching paper metadata...", err=True)
        paper_title, submission_date = get_paper_metadata(arxiv_id)
        sanitized_title = sanitize_filename(paper_title)
        if not silent:
            click.echo(f"Paper title: {paper_title}", err=True)
            if submission_date:
                click.echo(f"Submission date: {submission_date}", err=True)
            click.echo(f"Sanitized filename: {sanitized_title}", err=True)

        # Download PDF
        if not silent:
            click.echo("Downloading PDF from arXiv...", err=True)
        pdf_content = get_arxiv_pdf(arxiv_id)

        # Download BibTeX citation
        if not silent:
            click.echo("Downloading BibTeX citation...", err=True)
        bibtex_content = get_arxiv_bibtex(arxiv_id)
        if bibtex_content:
            if not silent:
                click.echo("BibTeX citation retrieved successfully", err=True)
        else:
            if not silent:
                click.echo("Could not retrieve BibTeX citation", err=True)

        # Create temp file for PDF
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            temp_pdf_path = Path(temp_pdf.name)
            temp_pdf.write(pdf_content)

        try:
            if not silent:
                click.echo(f"Downloaded PDF to {temp_pdf_path}", err=True)

            # Process PDF with Mistral OCR
            client = Mistral(api_key=api_key)
            uploaded_file = None
            try:
                # Upload PDF to Mistral
                if not silent:
                    click.echo("Uploading file to Mistral...", err=True)
                uploaded_file = client.files.upload(
                    file={
                        "file_name": f"{arxiv_id}.pdf",
                        "content": pdf_content,
                    },
                    purpose="ocr",
                )

                # Get signed URL
                signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

                # Process with OCR
                if not silent:
                    click.echo(f"Processing with OCR model: {model}...", err=True)

                # Prepare OCR processing parameters
                ocr_params = {
                    "document": DocumentURLChunk(document_url=signed_url.url),
                    "model": model,
                    "include_image_base64": True,  # Always request images
                }

                # Add pages parameter if limited pages are requested
                if pages > 0:
                    ocr_params["pages"] = list(range(pages))
                    if not silent:
                        click.echo(f"Limiting processing to first {pages} pages", err=True)

                pdf_response = client.ocr.process(**ocr_params)

                # Parse response
                response_dict = json.loads(pdf_response.model_dump_json())

                # Define output paths - always create a directory structure
                output_dir = papers_dir / sanitized_title
                output_dir.mkdir(exist_ok=True)
                output_file = output_dir / "README.md"
                bibtex_file = output_dir / f"{sanitized_title}.bib"

                # For HTML output, use index.html instead of README.md
                if html:
                    output_file = output_dir / "index.html"

                # Save BibTeX citation if available
                if bibtex_content:
                    try:
                        bibtex_file.write_text(bibtex_content)
                        if not silent:
                            click.echo(f"BibTeX citation saved to {bibtex_file}", err=True)
                    except Exception as e:
                        if not silent:
                            click.echo(f"Error saving BibTeX file: {str(e)}", err=True)

                # Process images if needed
                image_map = {}
                if extract_images or inline_images:
                    image_count = 0

                    # For extract_images, we need a directory
                    if extract_images:
                        image_dir = output_dir

                    # Look for images in the OCR response
                    for page in response_dict.get("pages", []):
                        for img in page.get("images", []):
                            if "id" in img and "image_base64" in img:
                                image_id = img["id"]
                                image_data = img["image_base64"]

                                # Sometimes the base64 data has a data URI prefix, sometimes not
                                if image_data.startswith("data:image/"):
                                    # Extract the mime type and base64 data
                                    mime_type = image_data.split(";")[0].split(":")[1]
                                    base64_data = image_data.split(",", 1)[1]
                                else:
                                    # Determine mime type from file extension or default to jpeg
                                    ext = image_id.split(".")[-1].lower() if "." in image_id else "jpeg"
                                    mime_type = f"image/{ext}"
                                    base64_data = image_data

                                # For extracted images, save to disk
                                if extract_images:
                                    # Create a suitable filename if it doesn't have an extension
                                    if "." not in image_id:
                                        ext = mime_type.split("/")[1]
                                        image_filename = f"{image_id}.{ext}"
                                    else:
                                        image_filename = image_id

                                    image_path = image_dir / image_filename
                                    try:
                                        with open(image_path, "wb") as img_file:
                                            img_file.write(base64.b64decode(base64_data))
                                        # Map image_id to relative path for referencing
                                        image_map[image_id] = image_filename
                                        image_count += 1
                                    except Exception as e:
                                        if not silent:
                                            click.echo(f"Warning: Failed to save image {image_id}: {str(e)}", err=True)

                                # For inline images, prepare data URIs
                                elif inline_images:
                                    # Ensure it has the data URI prefix
                                    if not image_data.startswith("data:"):
                                        image_data = f"data:{mime_type};base64,{base64_data}"
                                    image_map[image_id] = image_data

                    if not silent and extract_images and image_count > 0:
                        click.echo(f"Extracted {image_count} images to {image_dir}", err=True)

                # Generate output content
                if json_output:
                    result = json.dumps(response_dict, indent=4)
                else:
                    # Concatenate markdown content from all pages
                    markdown_contents = [
                        page.get("markdown", "") for page in response_dict.get("pages", [])
                    ]

                    # Add metadata at the top
                    markdown_text = f"# {paper_title}\n\n"

                    # Add source information near the top
                    markdown_text += f"*Source: [arXiv:{arxiv_id}](https://arxiv.org/abs/{arxiv_id})*\n\n"

                    # Add submission date if available
                    if submission_date:
                        markdown_text += f"*[Submitted on {submission_date}]*\n\n"

                    # Add the content (markdown from all pages, joined with blank lines)
                    content_text = "\n\n".join(markdown_contents)
                    markdown_text += content_text

                    # Add link to BibTeX at the bottom, if available
                    if bibtex_content:
                        markdown_text += f"\n\n---\n*[BibTeX citation]({sanitized_title}.bib)*\n"

                    # Post-processing: Remove duplicate title if present
                    lines = markdown_text.split('\n')
                    if len(lines) >= 3:
                        if lines[0].strip().startswith('#') and lines[2].strip().startswith('#') and lines[0].strip() == lines[2].strip():
                            # Remove duplicate title
                            lines.pop(2)
                            markdown_text = '\n'.join(lines)

                    # Handle image references
                    for img_id, img_src in image_map.items():
                        pattern = r"!\[(.*?)\]\(\s*" + re.escape(img_id) + r"\s*\)"
                        replacement = r"![\1](" + img_src + r")"
                        markdown_text = re.sub(pattern, replacement, markdown_text)

                    if html:
                        # Convert markdown to HTML
                        md = markdown.Markdown(extensions=["tables"])
                        html_content = md.convert(markdown_text)

                        # Add HTML wrapper with basic styling
                        result = f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{paper_title}</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            line-height: 1.6;
            margin: 0 auto;
            max-width: 800px;
            padding: 20px;
        }}
        img {{ max-width: 100%; height: auto; }}
        h1, h2, h3 {{ margin-top: 1.5em; }}
        p {{ margin: 1em 0; }}
    </style>
</head>
<body>
{html_content}
</body>
</html>"""
                    else:  # markdown
                        result = markdown_text

                # Write output to file
                output_file.write_text(result)
                if not silent:
                    click.echo(f"Results saved to {output_file}", err=True)
                    if bibtex_content:
                        click.echo(f"BibTeX citation saved to {bibtex_file}", err=True)
                    click.echo(f"Original arXiv URL: {arxiv_url}", err=True)
                    click.echo(f"PDF URL: https://arxiv.org/pdf/{arxiv_id}.pdf", err=True)
            finally:
                # Clean up uploaded file
                if uploaded_file:
                    try:
                        client.files.delete(file_id=uploaded_file.id)
                        if not silent:
                            click.echo("Temporary Mistral file deleted", err=True)
                    except Exception as e:
                        if not silent:
                            click.echo(f"Warning: Could not delete temporary Mistral file: {str(e)}", err=True)
        finally:
            # Clean up temp file
            if temp_pdf_path.exists():
                os.unlink(temp_pdf_path)
                if not silent:
                    click.echo("Temporary PDF file deleted", err=True)
    except Exception as e:
        raise click.ClickException(f"Error: {str(e)}")


if __name__ == "__main__":
    arxiv_to_markdown()