Last active
September 18, 2025 13:23
-
-
Save waynegraham/9fa49970eb75ca1f23397414374fad09 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import os | |
| import subprocess | |
| from pathlib import Path | |
| from concurrent.futures import ProcessPoolExecutor, as_completed | |
| import multiprocessing | |
# Where the source PDFs live (change this to your own directory of PDFs).
INPUT_DIR = "data"
# Where the OCR'd copies are written, one per input file.
OUTPUT_DIR = "ocr_output"

# Create the output directory (and any missing parents) at import time so
# every worker can write to it immediately; an existing directory is fine.
os.makedirs(OUTPUT_DIR, exist_ok=True)
def ocr_pdf(input_path, output_path):
    """Run ``ocrmypdf`` (Arabic, PDF/A output) on one file and describe the result.

    The outcome is returned instead of printed so the caller decides how to
    report it. This function never raises for a failed OCR run or for a
    failure to launch the tool; both are turned into an error message.

    Args:
        input_path: Location of the PDF to process (anything ``str()`` accepts).
        output_path: Location the OCR'd PDF is written to.

    Returns:
        A one-line status string: "✅ OCR complete: ..." on success, or
        "❌ Error processing ...: <details>" on failure.
    """
    command = [
        "ocrmypdf",
        "-l", "ara",  # Arabic OCR
        # "--optimize", "3",  # aggressive optimization
        "--output-type", "pdfa",  # archive-friendly PDF
        # "--skip-text",  # skip OCR if text layer already exists
        str(input_path),
        str(output_path),
    ]
    try:
        # Output is captured so the tool's stderr can be embedded in the
        # returned message rather than written straight to the terminal.
        subprocess.run(command, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # Non-zero exit status: surface whatever the tool wrote to stderr.
        details = (e.stderr or b"").decode(errors="ignore")
        return f"❌ Error processing {input_path}: {details}"
    except OSError as e:
        # The process could not be started at all (e.g. the ocrmypdf
        # executable is not installed or not on PATH).
        return f"❌ Error processing {input_path}: {e}"
    return f"✅ OCR complete: {output_path}"
def main():
    """OCR every PDF in ``INPUT_DIR`` in parallel, printing one status line each.

    Output files keep the input file name and are written to ``OUTPUT_DIR``.
    Prints a short notice and returns early when the input directory does not
    exist or contains no ``.pdf`` files (case-insensitive match).
    """
    input_dir = Path(INPUT_DIR)
    if not input_dir.is_dir():
        print(f"Input directory not found: {input_dir}")
        return

    # Regular files only (a directory named "x.pdf" is skipped), sorted so
    # the submission order does not depend on the filesystem.
    pdf_files = sorted(
        entry.name
        for entry in input_dir.iterdir()
        if entry.is_file() and entry.name.lower().endswith(".pdf")
    )
    if not pdf_files:
        print("No PDF files found.")
        return

    # Use all CPUs, but never more workers than there are files
    # (set max_workers to something smaller if needed,
    # e.g. multiprocessing.cpu_count() - 2).
    # NOTE(review): ocrmypdf may itself use several cores per file; confirm
    # against its --jobs option before relying on one worker per CPU.
    max_workers = min(multiprocessing.cpu_count(), len(pdf_files))

    output_dir = Path(OUTPUT_DIR)
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its file name for error reporting.
        futures = {
            executor.submit(ocr_pdf, input_dir / name, output_dir / name): name
            for name in pdf_files
        }
        for future in as_completed(futures):
            try:
                print(future.result())
            except Exception as e:
                # One failing worker must not hide the remaining results.
                print(f"❌ Error processing {input_dir / futures[future]}: {e}")


# The guard is required for ProcessPoolExecutor: worker processes may
# re-import this module, and must not start the batch again when they do.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env python3 | |
| import os | |
| import subprocess | |
| from pathlib import Path | |
# Directory scanned for source PDFs.
INPUT_DIR = "data"
# Directory that receives the OCR'd copies.
OUTPUT_DIR = "ocr_output"

# Create the output directory (and any missing parents) at import time;
# an existing directory is left untouched.
os.makedirs(OUTPUT_DIR, exist_ok=True)
def ocr_pdf(input_path, output_path):
    """Run ``ocrmypdf`` (Arabic, optimized PDF/A output) on one file.

    Prints a "✅" line on success and a "❌" line on failure. The tool's own
    output is not captured, so it goes straight to the terminal. A failure
    never raises, so a caller looping over many files keeps going.

    Args:
        input_path: Location of the PDF to process (anything ``str()`` accepts).
        output_path: Location the OCR'd PDF is written to.
    """
    command = [
        "ocrmypdf",
        "-l", "ara",  # Arabic OCR
        "--optimize", "3",  # aggressive optimization
        "--output-type", "pdfa",  # archive-friendly PDF
        # "--skip-text",  # skip OCR if text layer already exists
        str(input_path),
        str(output_path),
    ]
    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: the tool exited with a non-zero status.
        # OSError: the tool could not be started (e.g. not installed).
        print(f"❌ Error processing {input_path}: {e}")
    else:
        print(f"✅ OCR complete: {output_path}")
def main():
    """OCR every PDF in ``INPUT_DIR`` one after another.

    Output files keep the input file name and are written to ``OUTPUT_DIR``.
    Prints a notice and returns early when the input directory is missing.
    """
    input_dir = Path(INPUT_DIR)
    if not input_dir.is_dir():
        print(f"Input directory not found: {input_dir}")
        return

    output_dir = Path(OUTPUT_DIR)
    # Sorted so files are processed in a stable, predictable order.
    for entry in sorted(input_dir.iterdir()):
        # Regular files only: a directory named "x.pdf" is not a PDF.
        if entry.is_file() and entry.name.lower().endswith(".pdf"):
            ocr_pdf(entry, output_dir / entry.name)


# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment