Last active
October 21, 2024 15:43
-
-
Save USMortality/2627ac00de2ecade2f00c87f50f69eaa to your computer and use it in GitHub Desktop.
OCR PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Turn a scanned PDF into a searchable PDF.
#
# Usage: ocr_pdf.sh INPUT_PDF [TESSERACT_LANG]
#
#   INPUT_PDF       PDF file to OCR.
#   TESSERACT_LANG  Language passed to `tesseract -l` (default: deu).
#
# Pipeline: pdftoppm renders one JPEG per page into ./ocr_output, tesseract
# turns every JPEG into a single-page searchable PDF, and Ghostscript merges
# those pages into ./searchable_output.pdf.
#
# Exit status: 0 on success, 1 on a missing tool / bad input / failed step,
# 2 on a usage error.

# Abort on the first failing command, on unset variables and on failures
# inside pipelines, so a broken step can never end in "Process completed!".
set -euo pipefail

# Unmatched globs expand to nothing instead of the literal pattern.
shopt -s nullglob

# Print a message to stderr and terminate with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Ensure the required tools are installed
command -v pdftoppm >/dev/null 2>&1 || die "pdftoppm is not installed. Exiting."
command -v tesseract >/dev/null 2>&1 || die "Tesseract is not installed. Exiting."
command -v gs >/dev/null 2>&1 || die "Ghostscript is not installed. Exiting."

if (( $# < 1 )); then
  printf 'Usage: %s INPUT_PDF [TESSERACT_LANG]\n' "${0##*/}" >&2
  exit 2
fi

# Input PDF file
INPUT_PDF="$1"
OCR_LANG="${2:-deu}"
OUTPUT_DIR="./ocr_output"
OUTPUT_PDF="searchable_output.pdf"
PAGE_PREFIX="output"

[[ -f "$INPUT_PDF" && -r "$INPUT_PDF" ]] \
  || die "Input PDF '$INPUT_PDF' does not exist or is not readable."

# Keep a file name that starts with '-' from being parsed as an option.
case "$INPUT_PDF" in
  -*) INPUT_PDF="./$INPUT_PDF" ;;
esac

# Create a directory to store images and PDFs
mkdir -p "$OUTPUT_DIR"

# Remove page files left behind by a previous run; otherwise pages of an
# older (possibly longer) document would be OCRed and merged again.
stale_files=("$OUTPUT_DIR/$PAGE_PREFIX"-*.jpg "$OUTPUT_DIR/$PAGE_PREFIX"-*.pdf)
if (( ${#stale_files[@]} > 0 )); then
  rm -f -- "${stale_files[@]}"
fi

echo "Converting PDF to JPG images..."
# Convert PDF to JPG images (one image per page)
pdftoppm -jpeg "$INPUT_PDF" "$OUTPUT_DIR/$PAGE_PREFIX"

# Glob expansion is sorted, so the array order is the order in which the
# pages are merged below.
page_images=("$OUTPUT_DIR/$PAGE_PREFIX"-*.jpg)
(( ${#page_images[@]} > 0 )) \
  || die "pdftoppm produced no page images for '$INPUT_PDF'."

# Process each JPG image with Tesseract to create PDFs
echo "Running OCR on images and creating individual PDFs..."
page_pdfs=()
for IMG in "${page_images[@]}"; do
  BASENAME=$(basename "$IMG" .jpg)
  echo "Processing $IMG..."
  # Tesseract appends ".pdf" to the output base itself.
  tesseract "$IMG" "$OUTPUT_DIR/$BASENAME" -l "$OCR_LANG" pdf
  page_pdfs+=("$OUTPUT_DIR/$BASENAME.pdf")
done

# Combine individual PDFs into a single searchable PDF using Ghostscript.
# Only the pages created in this run are passed, never a blanket *.pdf glob.
echo "Merging individual PDFs into a single searchable PDF with Ghostscript..."
gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile="$OUTPUT_PDF" "${page_pdfs[@]}"

echo "Process completed! Searchable PDF saved as $OUTPUT_PDF."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment