Skip to content

Instantly share code, notes, and snippets.

@waynegraham
Last active September 18, 2025 13:23
Show Gist options
  • Select an option

  • Save waynegraham/9fa49970eb75ca1f23397414374fad09 to your computer and use it in GitHub Desktop.

Select an option

Save waynegraham/9fa49970eb75ca1f23397414374fad09 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import os
import subprocess
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing
# Where the source PDFs are read from; point this at your own folder.
INPUT_DIR = "data"
# Where the OCRed copies are written.
OUTPUT_DIR = "ocr_output"
# Create the output folder (and any missing parents) up front; a folder
# that already exists is left as it is.
Path(OUTPUT_DIR).mkdir(exist_ok=True, parents=True)
def ocr_pdf(input_path, output_path):
    """Run ``ocrmypdf`` on one PDF and report the outcome as a string.

    Args:
        input_path: Location of the source PDF; converted with ``str()``.
        output_path: Location the OCRed PDF should be written to;
            converted with ``str()``.

    Returns:
        A status line that starts with "✅" when the command exited
        successfully and with "❌" when it failed or could not be started.
        Failures are returned, not raised, so a caller collecting results
        from worker processes always receives a printable message.
    """
    command = [
        "ocrmypdf",
        "-l", "ara",  # Arabic OCR
        # Optional: "--optimize", "3"  (aggressive optimization)
        "--output-type", "pdfa",  # archive-friendly PDF
        # Optional: "--skip-text"  (skip OCR if a text layer already exists)
        str(input_path),
        str(output_path),
    ]
    try:
        # Output is captured so stderr can be included in the error message.
        subprocess.run(command, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        stderr_text = (e.stderr or b"").decode(errors="ignore")
        return f"❌ Error processing {input_path}: {stderr_text}"
    except OSError as e:
        # The executable itself could not be launched (for example it is
        # not installed or not on PATH). Report it like any other failure
        # instead of letting the exception escape the worker.
        return f"❌ Error processing {input_path}: {e}"
    return f"✅ OCR complete: {output_path}"
def main():
    """OCR every PDF found directly in ``INPUT_DIR`` using a process pool.

    Each matching file is submitted as one ``ocr_pdf`` call whose output
    path is the same file name inside ``OUTPUT_DIR``. Results are printed
    as jobs finish, so the print order follows completion order.

    Prints a short message and returns early when ``INPUT_DIR`` does not
    exist or contains no ``.pdf`` files.
    """
    source_dir = Path(INPUT_DIR)
    if not source_dir.is_dir():
        print(f"Input directory not found: {source_dir}")
        return

    # Regular files only (a directory named "x.pdf" is skipped), sorted so
    # jobs are submitted in a reproducible order.
    pdf_files = sorted(
        entry.name
        for entry in source_dir.iterdir()
        if entry.is_file() and entry.name.lower().endswith(".pdf")
    )
    if not pdf_files:
        print("No PDF files found.")
        return

    # Use all CPUs, but never more workers than there are files.
    # Set this lower if needed, e.g. multiprocessing.cpu_count() - 2.
    # NOTE(review): ocrmypdf may itself use several cores per job; confirm
    # whether a smaller pool gives better throughput on your machine.
    max_workers = min(multiprocessing.cpu_count(), len(pdf_files))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its file name so a failure can be
        # attributed to the right input.
        futures = {
            executor.submit(
                ocr_pdf, source_dir / filename, Path(OUTPUT_DIR) / filename
            ): filename
            for filename in pdf_files
        }
        for future in as_completed(futures):
            try:
                print(future.result())
            except Exception as e:
                # Top-level boundary: report the failed job and keep
                # printing the results of the remaining ones.
                print(f"❌ Error processing {futures[future]}: {e}")
if __name__ == "__main__":
main()
#! /usr/bin/env python3
import os
import subprocess
from pathlib import Path
# Folder the PDFs are read from.
INPUT_DIR = "data"
# Folder the OCRed PDFs are written to.
OUTPUT_DIR = "ocr_output"
# Create the output folder (and any missing parents) at import time; a
# folder that already exists is left as it is.
Path(OUTPUT_DIR).mkdir(exist_ok=True, parents=True)
def ocr_pdf(input_path, output_path):
    """Run ``ocrmypdf`` on one PDF and print whether it succeeded.

    Args:
        input_path: Location of the source PDF; converted with ``str()``.
        output_path: Location the OCRed PDF should be written to;
            converted with ``str()``.

    Returns:
        None. A line starting with "✅" is printed on success and a line
        starting with "❌" is printed when the command fails or cannot be
        started; no exception is raised for either kind of failure.
    """
    command = [
        "ocrmypdf",
        "-l", "ara",  # Arabic OCR
        "--optimize", "3",  # aggressive optimization
        "--output-type", "pdfa",  # archive-friendly PDF
        # Optional: "--skip-text"  (skip OCR if a text layer already exists)
        str(input_path),
        str(output_path),
    ]
    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the case where the executable cannot be launched
        # at all (for example it is not installed), so one bad call does
        # not abort a caller that is looping over many files.
        print(f"❌ Error processing {input_path}: {e}")
    else:
        print(f"✅ OCR complete: {output_path}")
def main():
    """OCR every PDF found directly in ``INPUT_DIR``, one after another.

    Each matching file is passed to ``ocr_pdf`` with an output path of the
    same file name inside ``OUTPUT_DIR``.

    Prints a short message and returns early when ``INPUT_DIR`` does not
    exist or contains no ``.pdf`` files.
    """
    source_dir = Path(INPUT_DIR)
    if not source_dir.is_dir():
        print(f"Input directory not found: {source_dir}")
        return

    # Regular files only (a directory named "x.pdf" is skipped), sorted so
    # files are processed in a reproducible order.
    pdf_names = sorted(
        entry.name
        for entry in source_dir.iterdir()
        if entry.is_file() and entry.name.lower().endswith(".pdf")
    )
    if not pdf_names:
        print("No PDF files found.")
        return

    for filename in pdf_names:
        ocr_pdf(source_dir / filename, Path(OUTPUT_DIR) / filename)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment