Created
February 26, 2025 16:21
-
-
Save rollwagen/2ab2fb8f7a1f3756570e319795c89c70 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Convert PDF documents to markdown format using OCR. | |
This module provides tools to extract text from PDF files using OCR | |
technology and convert it to markdown format. | |
""" | |
# /// script | |
# requires-python = ">=3.12" | |
# dependencies = [ | |
# "bs4", | |
# "pdf2image", | |
# "pytesseract", | |
# ] | |
# /// | |
import argparse | |
import sys | |
from collections.abc import Iterator | |
from pathlib import Path | |
from typing import TypeAlias | |
try: | |
import pytesseract | |
from bs4 import BeautifulSoup | |
from pdf2image import convert_from_path | |
from PIL.Image import Image | |
except ImportError as e: | |
print(f"Error importing dependencies: {e}") | |
print("Run 'uv pip install bs4 pdf2image pytesseract' to install required packages") | |
print("Run 'uv run pdf_img_to_markdown.py' to run script directly") | |
sys.exit(1) | |
# Type aliases used in the function signatures of this module
OCRText: TypeAlias = str  # text extracted by OCR
HOCRBytes: TypeAlias = bytes  # hOCR document as raw bytes
def pdf_to_images(pdf_path: str | Path) -> list[Image]:
    """Render a PDF file into images using pdf2image.

    Args:
        pdf_path: Location of the PDF file to render.

    Returns:
        The list of PIL Image objects returned by ``convert_from_path``.
    """
    # Pages are requested from pdf2image with the TIFF format option.
    page_images = convert_from_path(pdf_path, fmt="tiff")
    return page_images
def image_to_hocr(image: Image, lang: str = "eng") -> HOCRBytes:
    """Run OCR on a single image and return the result in hOCR format.

    Args:
        image: PIL Image object to recognise.
        lang: OCR language code handed to pytesseract (default: "eng").

    Returns:
        The hOCR document as bytes.
    """
    hocr_document = pytesseract.image_to_pdf_or_hocr(
        image,
        lang=lang,
        extension="hocr",
    )
    return hocr_document
def hocr_to_markdown(hocr: HOCRBytes) -> OCRText:
    """Extract the recognised text from an hOCR document, one line per row.

    Each line-level span in the hOCR markup becomes one output line whose
    words (``ocrx_word`` spans) are joined by single spaces. No further
    markdown structure (headings, lists, ...) is generated.

    Args:
        hocr: hOCR content as bytes.

    Returns:
        The recognised text with lines separated by ``"\\n"``; an empty
        string when the document contains no line spans.
    """
    # Besides "ocr_line", hOCR defines further line-level classes that
    # Tesseract emits for lines inside heading, caption and pull-out blocks.
    # Matching only "ocr_line" would silently drop that text.
    line_classes = ["ocr_line", "ocr_header", "ocr_caption", "ocr_textfloat"]

    soup = BeautifulSoup(hocr, "html.parser")
    lines = [
        " ".join(
            word.get_text() for word in line.find_all("span", class_="ocrx_word")
        )
        # A list passed as class_ matches a span carrying any of the classes;
        # find_all keeps document order, so reading order is preserved.
        for line in soup.find_all("span", class_=line_classes)
    ]
    return "\n".join(lines)
def pdf_to_markdown(pdf_path: str | Path, lang: str = "deu") -> OCRText:
    """Run the full PDF -> images -> hOCR -> text pipeline for one file.

    Args:
        pdf_path: Path to the PDF file.
        lang: Language for OCR (default: "deu").

    Returns:
        The text of every page image, with pages separated by a blank line.
    """
    page_texts: list[OCRText] = []
    for page_image in pdf_to_images(pdf_path):
        page_hocr = image_to_hocr(page_image, lang=lang)
        page_texts.append(hocr_to_markdown(page_hocr))

    # A double newline between page chunks.
    return "\n\n".join(page_texts)
def main() -> None:
    """Process command line arguments and run the conversion.

    Reads the input PDF path plus the optional ``--output`` and ``--lang``
    options from the command line, converts the PDF and writes the resulting
    text to the output file as UTF-8.

    Raises:
        SystemExit: With status 1 when the input path is not an existing
            file, so shells and calling scripts can detect the failure.
    """
    parser = argparse.ArgumentParser(description="Convert PDF to Markdown with OCR")
    parser.add_argument("input", type=str, help="Path to input PDF file")
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default="output.md",
        help="Path to output markdown file (default: output.md)",
    )
    parser.add_argument(
        "--lang", "-l", type=str, default="deu", help="OCR language (default: deu)"
    )
    args = parser.parse_args()

    pdf_path = Path(args.input)
    output_path = Path(args.output)

    # is_file() also rejects directories, which would otherwise only fail
    # later inside the PDF renderer. Report on stderr and exit non-zero so
    # the failure is not mistaken for a successful run.
    if not pdf_path.is_file():
        print(f"Error: Input file '{pdf_path}' not found", file=sys.stderr)
        sys.exit(1)

    # Process PDF and write output
    markdown_output = pdf_to_markdown(pdf_path, lang=args.lang)
    output_path.write_text(markdown_output, encoding="utf-8")
    print(f"Conversion complete. Output written to: {output_path}")
# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment