Skip to content

Instantly share code, notes, and snippets.

@rollwagen
Created February 26, 2025 16:21
Show Gist options
  • Save rollwagen/2ab2fb8f7a1f3756570e319795c89c70 to your computer and use it in GitHub Desktop.
Save rollwagen/2ab2fb8f7a1f3756570e319795c89c70 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Convert PDF documents to markdown format using OCR.
This module provides tools to extract text from PDF files using OCR
technology and convert it to markdown format.
"""
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "bs4",
# "pdf2image",
# "pytesseract",
# ]
# ///
import argparse
import sys
from collections.abc import Iterator
from pathlib import Path
from typing import TypeAlias
try:
import pytesseract
from bs4 import BeautifulSoup
from pdf2image import convert_from_path
from PIL.Image import Image
except ImportError as e:
print(f"Error importing dependencies: {e}")
print("Run 'uv pip install bs4 pdf2image pytesseract' to install required packages")
print("Run 'uv run pdf_img_to_markdown.py' to run script directly")
sys.exit(1)
# Type aliases: plain str/bytes at runtime; the names only document which
# stage of the OCR pipeline a value belongs to.
OCRText: TypeAlias = str  # text extracted by OCR (what the converters return)
HOCRBytes: TypeAlias = bytes  # raw hOCR document bytes, as passed to BeautifulSoup
def pdf_to_images(pdf_path: str | Path) -> list[Image]:
    """Rasterize a PDF into PIL images via pdf2image.

    Args:
        pdf_path: Location of the PDF file to convert.

    Returns:
        The list of PIL Image objects produced by ``convert_from_path``,
        requested in TIFF format.
    """
    page_images = convert_from_path(pdf_path, fmt="tiff")
    return page_images
def image_to_hocr(image: Image, lang: str = "eng") -> HOCRBytes:
    """Run OCR on a single image and return the result as hOCR.

    Args:
        image: PIL Image object to recognize.
        lang: Language handed to pytesseract (default: "eng").

    Returns:
        The hOCR document as bytes.
    """
    hocr_data = pytesseract.image_to_pdf_or_hocr(
        image,
        lang=lang,
        extension="hocr",
    )
    return hocr_data
def hocr_to_markdown(hocr: HOCRBytes) -> OCRText:
    """Flatten an hOCR document into newline-separated text.

    Each ``span.ocr_line`` element becomes one output line, built by joining
    the text of its ``span.ocrx_word`` descendants with single spaces. No
    markdown markup is added; the result is plain text.

    Args:
        hocr: hOCR content as bytes.

    Returns:
        The recognized text, one OCR line per output line.
    """
    document = BeautifulSoup(hocr, "html.parser")
    text_lines: list[str] = []
    for line_span in document.find_all("span", class_="ocr_line"):
        word_spans = line_span.find_all("span", class_="ocrx_word")
        text_lines.append(" ".join(span.get_text() for span in word_spans))
    return "\n".join(text_lines)
def pdf_to_markdown(pdf_path: str | Path, lang: str = "deu") -> OCRText:
    """OCR a whole PDF and return its text.

    The PDF is converted to images first; each image is then run through
    OCR and flattened to text, in order.

    Args:
        pdf_path: Path to the PDF file.
        lang: Language for OCR (default: "deu").

    Returns:
        The text of all images, separated from each other by a blank line.
    """
    page_texts: list[OCRText] = []
    for page_image in pdf_to_images(pdf_path):
        page_hocr = image_to_hocr(page_image, lang=lang)
        page_texts.append(hocr_to_markdown(page_hocr))
    # A blank line marks the boundary between consecutive images.
    return "\n\n".join(page_texts)
def main() -> None:
    """Process command line arguments and run the conversion.

    Command line:
        input: Path to the input PDF file.
        --output / -o: Path to the output markdown file (default: output.md).
        --lang / -l: OCR language (default: deu).

    Raises:
        SystemExit: With status 1 when the input path is missing or is not a
            regular file, so shells and calling scripts can detect the
            failure (argparse itself exits with status 2 on bad arguments).
    """
    parser = argparse.ArgumentParser(description="Convert PDF to Markdown with OCR")
    parser.add_argument("input", type=str, help="Path to input PDF file")
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default="output.md",
        help="Path to output markdown file (default: output.md)",
    )
    parser.add_argument(
        "--lang", "-l", type=str, default="deu", help="OCR language (default: deu)"
    )
    args = parser.parse_args()

    # Convert the raw argument strings to Path objects.
    pdf_path = Path(args.input)
    output_path = Path(args.output)

    # Validate the input before doing any OCR work. is_file() also rejects
    # directories, which would otherwise fail later inside the PDF converter.
    if not pdf_path.is_file():
        # Errors go to stderr and produce a non-zero exit status, matching
        # the import-failure path at the top of the script.
        print(f"Error: Input file '{pdf_path}' not found", file=sys.stderr)
        sys.exit(1)

    # Process the PDF and write the output.
    markdown_output = pdf_to_markdown(pdf_path, lang=args.lang)
    output_path.write_text(markdown_output, encoding="utf-8")
    print(f"Conversion complete. Output written to: {output_path}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment