rollwagen · February 26, 2025 16:21
diff --git a/pdf_img_to_markdown.py b/pdf_img_to_markdown.py
 #!/usr/bin/env python3
 """
 Convert PDF documents to markdown format using OCR.

 This module provides tools to extract text from PDF files using OCR
 technology and convert it to markdown format.
 """
 # /// script
 # requires-python = ">=3.12"
 # dependencies = [
 #     "bs4",
 #     "pdf2image",
 #     "pytesseract",
 # ]
 # ///
 import argparse
 import sys
 from collections.abc import Iterator
 from pathlib import Path
 from typing import TypeAlias

 try:
    import pytesseract
    from bs4 import BeautifulSoup
    from pdf2image import convert_from_path
    from PIL.Image import Image
 except ImportError as e:
    print(f"Error importing dependencies: {e}")
    print("Run 'uv pip install bs4 pdf2image pytesseract' to install required packages")
    print("Run 'uv run pdf_img_to_markdown.py' to run script directly")
    sys.exit(1)

 # Type aliases: descriptive names for the two payload kinds used in the signatures below.
 OCRText: TypeAlias = str  # text recovered by OCR (return type of hocr_to_markdown / pdf_to_markdown)
 HOCRBytes: TypeAlias = bytes  # hOCR document as bytes (return type of image_to_hocr)


 def pdf_to_images(pdf_path: str | Path) -> list[Image]:
    """Render a PDF file into images via pdf2image.

    The conversion is delegated to ``convert_from_path`` with TIFF
    requested as the image format.

    Args:
        pdf_path: Location of the PDF file to render

    Returns:
        List of PIL Image objects produced by the conversion
    """
    rendered_pages = convert_from_path(pdf_path, fmt="tiff")
    return rendered_pages


 def image_to_hocr(image: Image, lang: str = "eng") -> HOCRBytes:
    """Run OCR on a single image and return the result as hOCR.

    Args:
        image: PIL Image object to recognise
        lang: OCR language code handed to pytesseract (default: "eng")

    Returns:
        The hOCR document as bytes
    """
    hocr_document = pytesseract.image_to_pdf_or_hocr(
        image,
        lang=lang,
        extension="hocr",
    )
    return hocr_document


 def hocr_to_markdown(hocr: HOCRBytes) -> OCRText:
    """Convert hOCR content to plain markdown text, one output line per OCR line.

    Each recognised text line becomes one line of output, built by joining
    the text of its ``ocrx_word`` spans with single spaces.

    Args:
        hocr: hOCR content as bytes

    Returns:
        The recognised lines joined with newlines, in document order
        (an empty string when no text line is found)
    """
    # Tesseract labels a text line after the kind of block it belongs to, so
    # headings, captions and pull-out text do not use the plain "ocr_line"
    # class. Matching only "ocr_line" would silently drop those lines.
    line_classes = ["ocr_line", "ocr_header", "ocr_caption", "ocr_textfloat"]

    soup = BeautifulSoup(hocr, "html.parser")

    lines = [
        " ".join(word.get_text() for word in line.find_all("span", class_="ocrx_word"))
        # A list passed as class_ matches a span carrying any of the classes.
        for line in soup.find_all("span", class_=line_classes)
    ]

    return "\n".join(lines)


 def pdf_to_markdown(pdf_path: str | Path, lang: str = "deu") -> OCRText:
    """OCR a whole PDF file and return its text.

    The PDF is rendered to images, every image is run through OCR, and
    the per-image texts are concatenated.

    Args:
        pdf_path: Path to the PDF file
        lang: Language for OCR (default: "deu")

    Returns:
        Text of all images, separated from each other by a blank line
    """
    page_texts: list[OCRText] = []
    for page_image in pdf_to_images(pdf_path):
        page_hocr = image_to_hocr(page_image, lang=lang)
        page_texts.append(hocr_to_markdown(page_hocr))

    # A double newline puts one blank line between consecutive chunks
    return "\n\n".join(page_texts)


 def main() -> None:
    """Process command line arguments and run the conversion.

    Reads the input PDF path plus the optional ``--output`` and ``--lang``
    options, converts the PDF and writes the result as UTF-8 text.

    Raises:
        SystemExit: With status 1 when the input path is not an existing
            file, so shells and calling scripts can detect the failure.
    """
    parser = argparse.ArgumentParser(description="Convert PDF to Markdown with OCR")
    parser.add_argument("input", type=str, help="Path to input PDF file")
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default="output.md",
        help="Path to output markdown file (default: output.md)",
    )
    parser.add_argument(
        "--lang", "-l", type=str, default="deu", help="OCR language (default: deu)"
    )

    args = parser.parse_args()

    # Convert the string arguments to Path objects
    pdf_path = Path(args.input)
    output_path = Path(args.output)

    # Validate the input before doing any work. is_file() also rejects a
    # directory, which exists() would have let through to the converter.
    if not pdf_path.is_file():
        # Diagnostics go to stderr and the exit status is non-zero, matching
        # the exit code used when the dependency imports fail.
        print(f"Error: Input file '{pdf_path}' not found", file=sys.stderr)
        sys.exit(1)

    # Process PDF and write output
    markdown_output = pdf_to_markdown(pdf_path, lang=args.lang)
    output_path.write_text(markdown_output, encoding="utf-8")
    print(f"Conversion complete. Output written to: {output_path}")


 # Run the command line interface only when this file is executed as a
 # script; importing it as a module must not start a conversion.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	Convert PDF documents to markdown format using OCR.

	This module provides tools to extract text from PDF files using OCR
	technology and convert it to markdown format.
	"""
	# /// script
	# requires-python = ">=3.12"
	# dependencies = [
	# "bs4",
	# "pdf2image",
	# "pytesseract",
	# ]
	# ///
	import argparse
	import sys
	from collections.abc import Iterator
	from pathlib import Path
	from typing import TypeAlias

	try:
	import pytesseract
	from bs4 import BeautifulSoup
	from pdf2image import convert_from_path
	from PIL.Image import Image
	except ImportError as e:
	print(f"Error importing dependencies: {e}")
	print("Run 'uv pip install bs4 pdf2image pytesseract' to install required packages")
	print("Run 'uv run pdf_img_to_markdown.py' to run script directly")
	sys.exit(1)

	# Type aliases: descriptive names for the two payload kinds used in the signatures below.
	OCRText: TypeAlias = str # text recovered by OCR (return type of hocr_to_markdown / pdf_to_markdown)
	HOCRBytes: TypeAlias = bytes # hOCR document as bytes (return type of image_to_hocr)


	def pdf_to_images(pdf_path: str | Path) -> list[Image]:
	    """Convert PDF to images using pdf2image.

	    The conversion is delegated to ``convert_from_path`` with TIFF
	    requested as the image format.

	    Args:
	        pdf_path: Path to the PDF file

	    Returns:
	        List of PIL Image objects
	    """
	    return convert_from_path(pdf_path, fmt="tiff")


	def image_to_hocr(image: Image, lang: str = "eng") -> HOCRBytes:
	    """Convert image to hOCR format using pytesseract.

	    Args:
	        image: PIL Image object to process
	        lang: Language for OCR (default: "eng")

	    Returns:
	        hOCR content as bytes
	    """
	    return pytesseract.image_to_pdf_or_hocr(image, extension="hocr", lang=lang)


	def hocr_to_markdown(hocr: HOCRBytes) -> OCRText:
	    """Convert hOCR content to plain markdown text, one output line per OCR line.

	    Each recognised text line becomes one line of output, built by joining
	    the text of its ``ocrx_word`` spans with single spaces.

	    Args:
	        hocr: hOCR content as bytes

	    Returns:
	        The recognised lines joined with newlines, in document order
	        (an empty string when no text line is found)
	    """
	    # Tesseract labels a text line after the kind of block it belongs to, so
	    # headings, captions and pull-out text do not use the plain "ocr_line"
	    # class. Matching only "ocr_line" would silently drop those lines.
	    line_classes = ["ocr_line", "ocr_header", "ocr_caption", "ocr_textfloat"]

	    soup = BeautifulSoup(hocr, "html.parser")

	    lines = [
	        " ".join(word.get_text() for word in line.find_all("span", class_="ocrx_word"))
	        # A list passed as class_ matches a span carrying any of the classes.
	        for line in soup.find_all("span", class_=line_classes)
	    ]

	    return "\n".join(lines)


	def pdf_to_markdown(pdf_path: str | Path, lang: str = "deu") -> OCRText:
	    """Convert PDF file to markdown text.

	    The PDF is rendered to images, every image is run through OCR, and
	    the per-image texts are concatenated.

	    Args:
	        pdf_path: Path to the PDF file
	        lang: Language for OCR (default: "deu")

	    Returns:
	        Markdown formatted text of the PDF content, with the text of
	        consecutive images separated by a blank line
	    """
	    images = pdf_to_images(pdf_path)

	    # Lazily OCR one image at a time; str.join consumes the generator
	    markdown_chunks: Iterator[OCRText] = (
	        hocr_to_markdown(image_to_hocr(image, lang=lang)) for image in images
	    )

	    # Join all chunks with double newlines
	    return "\n\n".join(markdown_chunks)


	def main() -> None:
	    """Process command line arguments and run the conversion.

	    Reads the input PDF path plus the optional ``--output`` and ``--lang``
	    options, converts the PDF and writes the result as UTF-8 text.

	    Raises:
	        SystemExit: With status 1 when the input path is not an existing
	            file, so shells and calling scripts can detect the failure.
	    """
	    parser = argparse.ArgumentParser(description="Convert PDF to Markdown with OCR")
	    parser.add_argument("input", type=str, help="Path to input PDF file")
	    parser.add_argument(
	        "--output",
	        "-o",
	        type=str,
	        default="output.md",
	        help="Path to output markdown file (default: output.md)",
	    )
	    parser.add_argument(
	        "--lang", "-l", type=str, default="deu", help="OCR language (default: deu)"
	    )

	    args = parser.parse_args()

	    # Convert the string arguments to Path objects
	    pdf_path = Path(args.input)
	    output_path = Path(args.output)

	    # Validate the input before doing any work. is_file() also rejects a
	    # directory, which exists() would have let through to the converter.
	    if not pdf_path.is_file():
	        # Diagnostics go to stderr and the exit status is non-zero, matching
	        # the exit code used when the dependency imports fail.
	        print(f"Error: Input file '{pdf_path}' not found", file=sys.stderr)
	        sys.exit(1)

	    # Process PDF and write output
	    markdown_output = pdf_to_markdown(pdf_path, lang=args.lang)
	    output_path.write_text(markdown_output, encoding="utf-8")
	    print(f"Conversion complete. Output written to: {output_path}")


	# Run the command line interface only when this file is executed as a
	# script; importing it as a module must not start a conversion.
	if __name__ == "__main__":
	    main()