wcaleb · November 6, 2013 14:41 · scruss · Jun 4, 2018 · ramack19 · Oct 17, 2021
diff --git a/ocrpdf.sh b/ocrpdf.sh
 #!/bin/sh
 #
 # ocrpdf.sh - take a PDF, OCR it, and add the OCR text as a background layer
 # to each original page so that the PDF becomes searchable.
 #
 # Usage: ocrpdf.sh FILE.pdf
 #   FILE.pdf is rewritten in place once every page has been processed; a copy
 #   of the untouched input is kept as FILE.pdf.bak.
 #
 # Hacked together using tips from these websites:
 #      http://www.jlaundry.com/2012/ocr-a-scanned-pdf-with-tesseract/
 #      http://askubuntu.com/questions/27097/how-to-print-a-regular-file-to-pdf-from-command-line
 # Dependencies: pdftk, tesseract, imagemagick (convert), enscript, ps2pdf
 # Would be nice to use hocr2pdf instead so that the text lines up with the PDF image.
 #      http://www.exactcode.com/site/open_source/exactimage/hocr2pdf/

 set -eu

 # Print an error message on stderr and abort.
 die() {
   printf 'ocrpdf: %s\n' "$*" >&2
   exit 1
 }

 # Render the text file $2 to a PDF and put it behind the page PDF $1,
 # writing the combined page to new-$1. Returns non-zero if any step fails.
 add_text_layer() {
   enscript -B -o - "$2" | ps2pdf - "ocr-$1" &&
     pdftk "$1" background "ocr-$1" output "new-$1"
 }

 if [ "$#" -ne 1 ]; then
   printf 'Usage: %s FILE.pdf\n' "${0##*/}" >&2
   exit 2
 fi
 [ -f "$1" ] || die "not a regular file: $1"

 for tool in pdftk tesseract convert enscript ps2pdf mktemp; do
   command -v "$tool" >/dev/null 2>&1 || die "missing dependency: $tool"
 done

 # The per-page work happens inside a scratch directory, so the input must be
 # addressed by an absolute path.
 case $1 in
   /*) src=$1 ;;
   *) src=$PWD/$1 ;;
 esac

 # A private scratch directory keeps the intermediate files (and the cleanup
 # of them) away from unrelated files in the caller's working directory.
 workdir=$(mktemp -d) || die "cannot create scratch directory"
 trap 'rm -rf -- "$workdir"' EXIT
 trap 'exit 1' HUP INT TERM

 cp -- "$src" "$src.bak"
 cd "$workdir" || die "cannot enter scratch directory: $workdir"

 # Zero-pad to six digits: the globs below sort lexically, and a two-digit
 # pad would place tesspage_100 before tesspage_11 in documents over 99 pages.
 pdftk "$src" burst output tesspage_%06d.pdf

 for page_pdf in tesspage_*.pdf; do
   # An unmatched glob stays literal; treat that as "no pages were produced".
   [ -e "$page_pdf" ] || die "no pages were extracted from $src"
   page=${page_pdf%.pdf}

   # Convert the PDF page into a TIFF file
   convert -monochrome -density 600 "$page_pdf" "$page.tif"
   # OCR the TIFF file and save text to $page.txt
   tesseract "$page.tif" "$page"
   # Turn the text file output by tesseract into a PDF, then put it in the
   # background of the original page. If that fails, keep the page without a
   # text layer rather than silently dropping it from the result.
   if ! add_text_layer "$page_pdf" "$page.txt"; then
     printf 'ocrpdf: warning: no text layer added to %s\n' "$page_pdf" >&2
     cp -- "$page_pdf" "new-$page_pdf"
   fi

   # Clean up this page's intermediates
   rm -f -- "$page.tif" "$page.txt" "ocr-$page_pdf" "$page_pdf"
 done

 # Reassemble into a scratch file first so the input is only replaced once the
 # merge has succeeded.
 pdftk new-tesspage_*.pdf cat output merged.pdf
 cp -- merged.pdf "$src"
	#!/bin/sh
	#
	# ocrpdf.sh - take a PDF, OCR it, and add the OCR text as a background layer
	# to each original page so that the PDF becomes searchable.
	#
	# Usage: ocrpdf.sh FILE.pdf
	#   FILE.pdf is rewritten in place once every page has been processed; a copy
	#   of the untouched input is kept as FILE.pdf.bak.
	#
	# Hacked together using tips from these websites:
	# http://www.jlaundry.com/2012/ocr-a-scanned-pdf-with-tesseract/
	# http://askubuntu.com/questions/27097/how-to-print-a-regular-file-to-pdf-from-command-line
	# Dependencies: pdftk, tesseract, imagemagick (convert), enscript, ps2pdf
	# Would be nice to use hocr2pdf instead so that the text lines up with the PDF image.
	# http://www.exactcode.com/site/open_source/exactimage/hocr2pdf/

	set -eu

	# Print an error message on stderr and abort.
	die() {
	  printf 'ocrpdf: %s\n' "$*" >&2
	  exit 1
	}

	# Render the text file $2 to a PDF and put it behind the page PDF $1,
	# writing the combined page to new-$1. Returns non-zero if any step fails.
	# The pipe must be a real shell pipe: an escaped "\|" would hand a literal
	# "|" to enscript as an argument and ps2pdf would never run as a filter.
	add_text_layer() {
	  enscript -B -o - "$2" | ps2pdf - "ocr-$1" &&
	    pdftk "$1" background "ocr-$1" output "new-$1"
	}

	if [ "$#" -ne 1 ]; then
	  printf 'Usage: %s FILE.pdf\n' "${0##*/}" >&2
	  exit 2
	fi
	[ -f "$1" ] || die "not a regular file: $1"

	for tool in pdftk tesseract convert enscript ps2pdf mktemp; do
	  command -v "$tool" >/dev/null 2>&1 || die "missing dependency: $tool"
	done

	# The per-page work happens inside a scratch directory, so the input must be
	# addressed by an absolute path.
	case $1 in
	  /*) src=$1 ;;
	  *) src=$PWD/$1 ;;
	esac

	# A private scratch directory keeps the intermediate files (and the cleanup
	# of them) away from unrelated files in the caller's working directory.
	workdir=$(mktemp -d) || die "cannot create scratch directory"
	trap 'rm -rf -- "$workdir"' EXIT
	trap 'exit 1' HUP INT TERM

	cp -- "$src" "$src.bak"
	cd "$workdir" || die "cannot enter scratch directory: $workdir"

	# Zero-pad to six digits: the globs below sort lexically, and a two-digit
	# pad would place tesspage_100 before tesspage_11 in documents over 99 pages.
	pdftk "$src" burst output tesspage_%06d.pdf

	for page_pdf in tesspage_*.pdf; do
	  # An unmatched glob stays literal; treat that as "no pages were produced".
	  [ -e "$page_pdf" ] || die "no pages were extracted from $src"
	  page=${page_pdf%.pdf}

	  # Convert the PDF page into a TIFF file
	  convert -monochrome -density 600 "$page_pdf" "$page.tif"
	  # OCR the TIFF file and save text to $page.txt
	  tesseract "$page.tif" "$page"
	  # Turn the text file output by tesseract into a PDF, then put it in the
	  # background of the original page. If that fails, keep the page without a
	  # text layer rather than silently dropping it from the result.
	  if ! add_text_layer "$page_pdf" "$page.txt"; then
	    printf 'ocrpdf: warning: no text layer added to %s\n' "$page_pdf" >&2
	    cp -- "$page_pdf" "new-$page_pdf"
	  fi

	  # Clean up this page's intermediates
	  rm -f -- "$page.tif" "$page.txt" "ocr-$page_pdf" "$page_pdf"
	done

	# Reassemble into a scratch file first so the input is only replaced once the
	# merge has succeeded.
	pdftk new-tesspage_*.pdf cat output merged.pdf
	cp -- merged.pdf "$src"