Created
          November 17, 2024 06:08 
        
      - 
      
- 
        Save ericwastaken/f45b604455242923c0d1a007692bd74a to your computer and use it in GitHub Desktop. 
    A shell script that runs OCRmyPDF with options suitable for document archival in PDFs while supporting text search and file optimization. Meant to be used on any platform that supports Homebrew, but can be adapted to any other Linux/Unix platform so long as the OCRmyPDF and Tesseract dependencies are installed.
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | #!/bin/bash | |
| # | |
| # Runs OCR on a PDF file with OCRmyPDF, deskewing and rotating pages and | |
| # applying lossless optimization. The same path is passed to ocrmypdf as | |
| # both input and output, so the result is written over the input file. | |
| # | |
| # Uses OCRmyPDF - https://github.com/ocrmypdf/OCRmyPDF | |
| # | |
| # Usage: | |
| #   <this-script> <input-pdf-file-path> | |
| # | |
| # Dependencies (install Homebrew first, see https://brew.sh): | |
| # - OCRmyPDF: `brew install ocrmypdf` | |
| # - Tesseract language packs: `brew install tesseract-lang` | |
| #   (all language packs, to support Spanish and others) | |
| # | |
| # Exit status: | |
| #   0 if OCR succeeded; 1 on a usage error, a missing input file, | |
| #   a missing ocrmypdf executable, or an OCR failure. | |
| # | |
| # Check if the correct number of arguments is provided | |
| if [[ $# -ne 1 ]]; then | |
|   echo "Usage: $0 <input-pdf-file-path>" >&2 | |
|   exit 1 | |
| fi | |
| # Input PDF file path (also used as the output path below) | |
| input_pdf="$1" | |
| # Fail early with a clear message instead of relying on ocrmypdf's own error | |
| if [[ ! -f "$input_pdf" ]]; then | |
|   echo "Input file not found: $input_pdf" >&2 | |
|   exit 1 | |
| fi | |
| if ! command -v ocrmypdf >/dev/null 2>&1; then | |
|   echo "ocrmypdf not found. Install it with: brew install ocrmypdf" >&2 | |
|   exit 1 | |
| fi | |
| # OCR options, kept in an array so each one can carry its own comment | |
| ocr_options=( | |
|   -l eng+spa          # Support for both English and Spanish languages | |
|   --output-type pdfa  # Enforce PDF/A for long-term archiving | |
|   --oversample 300    # Oversample to 300 DPI to improve OCR results | |
|   --force-ocr         # Force OCR on pages that already contain text | |
|   -O 1                # Use safe, lossless optimizations | |
|   --deskew            # Straighten pages that are not straight | |
|   --rotate-pages      # Rotate pages that need it | |
| ) | |
| # Test the command directly rather than inspecting $? afterwards | |
| if ocrmypdf "${ocr_options[@]}" "$input_pdf" "$input_pdf"; then | |
|   echo "OCR completed successfully." | |
| else | |
|   # Inside the else branch $? still holds ocrmypdf's exit status | |
|   ocr_status=$? | |
|   echo "OCR encountered an error." >&2 | |
|   echo "ocrmypdf exit status: $ocr_status" >&2 | |
|   exit 1 | |
| fi | |
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment