USMortality · October 21, 2024 15:43
diff --git a/pdf-ocr.sh b/pdf-ocr.sh
 #!/bin/bash
 #
 # pdf-ocr.sh - turn a scanned PDF into a searchable PDF.
 #
 # Usage: pdf-ocr.sh INPUT.pdf [TESSERACT_LANG]
 #
 #   INPUT.pdf       PDF to run OCR on (required).
 #   TESSERACT_LANG  Language passed to `tesseract -l` (optional, default: deu).
 #
 # Steps: pdftoppm renders every page to a JPEG, tesseract turns each JPEG
 # into a one-page searchable PDF, and Ghostscript merges those pages into
 # searchable_output.pdf in the current directory. The per-page images and
 # PDFs are kept in ./ocr_output.
 #
 # Exit status: 0 on success, 1 on any failure (missing tool, bad input,
 # or a failing conversion step).

 set -euo pipefail

 # Print a message to stderr and abort with status 1.
 die() {
    printf '%s\n' "$*" >&2
    exit 1
 }

 # Ensure the required tools are installed
 command -v pdftoppm >/dev/null 2>&1 || die "pdftoppm is not installed. Exiting."
 command -v tesseract >/dev/null 2>&1 || die "Tesseract is not installed. Exiting."
 command -v gs >/dev/null 2>&1 || die "Ghostscript is not installed. Exiting."

 # Input PDF file; refuse to continue without one instead of handing an
 # empty argument to pdftoppm.
 [[ $# -ge 1 && -n "$1" ]] || die "Usage: ${0##*/} INPUT.pdf [TESSERACT_LANG]"
 INPUT_PDF="$1"
 # "-" is let through untouched (pdftoppm treats it as stdin on builds that
 # support it); anything else must be a readable path.
 [[ "$INPUT_PDF" == "-" || -r "$INPUT_PDF" ]] || die "Cannot read input PDF: $INPUT_PDF"

 OCR_LANG="${2:-deu}"
 OUTPUT_DIR="./ocr_output"
 OUTPUT_PDF="searchable_output.pdf"

 # Create a directory to store images and PDFs
 mkdir -p "$OUTPUT_DIR"

 # Remove pages generated by an earlier run; otherwise they would be OCR'd
 # again and merged into this run's output. Only this script's own
 # "output-*" files are touched.
 rm -f -- "$OUTPUT_DIR"/output-*.jpg "$OUTPUT_DIR"/output-*.pdf

 # Convert PDF to JPG images (one image per page, named output-<page>.jpg)
 echo "Converting PDF to JPG images..."
 pdftoppm -jpeg "$INPUT_PDF" "$OUTPUT_DIR/output" ||
    die "pdftoppm failed to convert $INPUT_PDF"

 # Collect the rendered pages. nullglob makes an unmatched pattern expand to
 # nothing, so an empty result is detected instead of looping once over the
 # literal pattern. pdftoppm zero-pads page numbers, so glob order is page
 # order.
 shopt -s nullglob
 PAGE_IMAGES=("$OUTPUT_DIR"/output-*.jpg)
 shopt -u nullglob
 (( ${#PAGE_IMAGES[@]} > 0 )) || die "pdftoppm produced no page images for $INPUT_PDF"

 # Process each JPG image with Tesseract to create PDFs
 echo "Running OCR on images and creating individual PDFs..."
 PAGE_PDFS=()
 for IMG in "${PAGE_IMAGES[@]}"; do
    BASENAME=$(basename "$IMG" .jpg)

    echo "Processing $IMG..."
    # The trailing "pdf" config makes tesseract write $BASENAME.pdf next to
    # the image as a searchable PDF.
    tesseract "$IMG" "$OUTPUT_DIR/$BASENAME" -l "$OCR_LANG" pdf ||
       die "Tesseract failed on $IMG"
    PAGE_PDFS+=("$OUTPUT_DIR/$BASENAME.pdf")
 done

 # Combine individual PDFs into a single searchable PDF using Ghostscript.
 # Only the pages produced above are merged, in page order.
 echo "Merging individual PDFs into a single searchable PDF with Ghostscript..."
 gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile="$OUTPUT_PDF" "${PAGE_PDFS[@]}" ||
    die "Ghostscript failed to merge the OCR pages"

 echo "Process completed! Searchable PDF saved as $OUTPUT_PDF."
	#!/bin/bash
	#
	# pdf-ocr.sh - turn a scanned PDF into a searchable PDF.
	#
	# Usage: pdf-ocr.sh INPUT.pdf [TESSERACT_LANG]
	#
	#   INPUT.pdf       PDF to run OCR on (required).
	#   TESSERACT_LANG  Language passed to `tesseract -l` (optional, default: deu).
	#
	# Steps: pdftoppm renders every page to a JPEG, tesseract turns each JPEG
	# into a one-page searchable PDF, and Ghostscript merges those pages into
	# searchable_output.pdf in the current directory. The per-page images and
	# PDFs are kept in ./ocr_output.
	#
	# Exit status: 0 on success, 1 on any failure (missing tool, bad input,
	# or a failing conversion step).

	set -euo pipefail

	# Print a message to stderr and abort with status 1.
	die() {
		printf '%s\n' "$*" >&2
		exit 1
	}

	# Ensure the required tools are installed. The operator must be a real,
	# unescaped "||": an escaped form is passed to `command` as a plain
	# argument and the failure branch then runs unconditionally.
	command -v pdftoppm >/dev/null 2>&1 || die "pdftoppm is not installed. Exiting."
	command -v tesseract >/dev/null 2>&1 || die "Tesseract is not installed. Exiting."
	command -v gs >/dev/null 2>&1 || die "Ghostscript is not installed. Exiting."

	# Input PDF file; refuse to continue without one instead of handing an
	# empty argument to pdftoppm.
	[[ $# -ge 1 && -n "$1" ]] || die "Usage: ${0##*/} INPUT.pdf [TESSERACT_LANG]"
	INPUT_PDF="$1"
	# "-" is let through untouched (pdftoppm treats it as stdin on builds that
	# support it); anything else must be a readable path.
	[[ "$INPUT_PDF" == "-" || -r "$INPUT_PDF" ]] || die "Cannot read input PDF: $INPUT_PDF"

	OCR_LANG="${2:-deu}"
	OUTPUT_DIR="./ocr_output"
	OUTPUT_PDF="searchable_output.pdf"

	# Create a directory to store images and PDFs
	mkdir -p "$OUTPUT_DIR"

	# Remove pages generated by an earlier run; otherwise they would be OCR'd
	# again and merged into this run's output. Only this script's own
	# "output-*" files are touched.
	rm -f -- "$OUTPUT_DIR"/output-*.jpg "$OUTPUT_DIR"/output-*.pdf

	# Convert PDF to JPG images (one image per page, named output-<page>.jpg)
	echo "Converting PDF to JPG images..."
	pdftoppm -jpeg "$INPUT_PDF" "$OUTPUT_DIR/output" ||
		die "pdftoppm failed to convert $INPUT_PDF"

	# Collect the rendered pages. nullglob makes an unmatched pattern expand to
	# nothing, so an empty result is detected instead of looping once over the
	# literal pattern. pdftoppm zero-pads page numbers, so glob order is page
	# order.
	shopt -s nullglob
	PAGE_IMAGES=("$OUTPUT_DIR"/output-*.jpg)
	shopt -u nullglob
	(( ${#PAGE_IMAGES[@]} > 0 )) || die "pdftoppm produced no page images for $INPUT_PDF"

	# Process each JPG image with Tesseract to create PDFs
	echo "Running OCR on images and creating individual PDFs..."
	PAGE_PDFS=()
	for IMG in "${PAGE_IMAGES[@]}"; do
		BASENAME=$(basename "$IMG" .jpg)

		echo "Processing $IMG..."
		# The trailing "pdf" config makes tesseract write $BASENAME.pdf next to
		# the image as a searchable PDF.
		tesseract "$IMG" "$OUTPUT_DIR/$BASENAME" -l "$OCR_LANG" pdf ||
			die "Tesseract failed on $IMG"
		PAGE_PDFS+=("$OUTPUT_DIR/$BASENAME.pdf")
	done

	# Combine individual PDFs into a single searchable PDF using Ghostscript.
	# Only the pages produced above are merged, in page order.
	echo "Merging individual PDFs into a single searchable PDF with Ghostscript..."
	gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile="$OUTPUT_PDF" "${PAGE_PDFS[@]}" ||
		die "Ghostscript failed to merge the OCR pages"

	echo "Process completed! Searchable PDF saved as $OUTPUT_PDF."