kueda · February 14, 2024 08:27
diff --git a/img2pdf b/img2pdf
 #!/bin/sh
 #
 # img2pdf
 #
 # OS X bash script that turns a collection of images into an OCR'd PDF
 #
 # Adapted from http://apple.stackexchange.com/questions/128384/ocr-on-pdfs-in-os-x-with-free-open-source-tools, 
 # where it was in turn adapted from
 # http://www.morethantechnical.com/2013/11/21/creating-a-searchable-pdf-with-opensource-tools-ghostscript-hocr2pdf-and-tesseract-ocr/
 # from http://www.ehow.com/how_6874571_merge-pdf-files-ghostscript.html
 # bash tut: http://linuxconfig.org/bash-scripting-tutorial
 # Linux PDF,OCR: http://blog.konradvoelkel.de/2013/03/scan-to-pdfa/
 # Dealing w/ alpha: http://stackoverflow.com/questions/5083492/problem-with-tesseract-and-tiff-format
 #
 # Install
 #   brew install tesseract --HEAD
 #   brew install imagemagick
 #   brew install ghostscript
 #   chmod +x img2pdf
 #
 # Usage
 #   ./img2pdf *.gif
 #
 # If you have a mix of extensions:
 #   ./img2pdf *.{gif,jpeg}
 #

 # Print a message on stderr and abort with a failure status.
 die() {
   printf '%s\n' "$*" >&2
   exit 1
 }

 # The output name is derived from the first image, so at least one is required.
 if [ "$#" -eq 0 ]; then
   printf 'Usage: %s image...\n' "${0##*/}" >&2
   exit 2
 fi

 # Absolute path of the first image (used for the banner and the output name).
 case "$1" in
   /*) y=$1 ;;
   *) y="$(pwd)/$1" ;;
 esac
 printf 'Creating a searchable PDF for %s\n' "$y"

 # The combined PDF is named after the first image, minus its extension,
 # and is written to the current directory.
 x=$(basename "$y")
 name=${x%.*}

 # All intermediate files live in a private scratch directory so that nothing
 # next to the input images (e.g. an existing page.txt or page.pdf) is ever
 # overwritten or deleted, and so that cleanup happens on every exit path.
 tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/img2pdf.XXXXXX") \
   || die "Could not create a temporary directory"
 trap 'rm -rf -- "$tmpdir"' EXIT
 trap 'exit 1' HUP INT TERM

 # process each page
 page=0
 for f in "$@"; do
   printf '%s\n' "$f"
   [ -f "$f" ] || die "No such image: $f"

   # Zero-padded page numbers keep the glob below in argument order.
   page=$((page + 1))
   base=$(printf '%s/page-%05d' "$tmpdir" "$page")

   # Keep a file name that starts with a dash from being parsed as an option.
   case "$f" in
     -*) src=./$f ;;
     *) src=$f ;;
   esac

   printf '\tConverting to TIFF...\n'
   # Flatten onto white and drop the alpha channel, which tesseract cannot
   # handle. "-alpha off" is the current spelling of the legacy "+matte".
   convert "$src" -background white -flatten -alpha off "$base.tiff" \
     || die "convert failed for $f"

   printf '\tTesseract OCR...\n'
   # Page segmentation mode 3 (fully automatic) is tesseract's default, so the
   # flag is omitted; its spelling differs between releases (-psm vs --psm).
   # Output is silenced because tesseract is chatty; failures are still caught
   # through the exit status.
   tesseract "$base.tiff" "$base" -l eng pdf >/dev/null 2>&1 \
     || die "tesseract failed for $f"

   printf '\tCleanup...\n'
   # Some tesseract versions also emit a .txt file alongside the PDF.
   rm -f -- "$base.tiff" "$base.txt"
 done

 printf 'Combining all pages into a single PDF...\n'
 gs -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -q -sDEVICE=pdfwrite \
   "-sOutputFile=${name}.searchable.pdf" "$tmpdir"/page-*.pdf \
   || die "gs failed to combine the pages"
 printf 'Created %s\n' "$name.searchable.pdf"
	#!/bin/sh
	#
	# img2pdf
	#
	# OS X bash script that turns a collection of images into an OCR'd PDF
	#
	# Adapted from http://apple.stackexchange.com/questions/128384/ocr-on-pdfs-in-os-x-with-free-open-source-tools,
	# where it was in turn adapted from
	# http://www.morethantechnical.com/2013/11/21/creating-a-searchable-pdf-with-opensource-tools-ghostscript-hocr2pdf-and-tesseract-ocr/
	# from http://www.ehow.com/how_6874571_merge-pdf-files-ghostscript.html
	# bash tut: http://linuxconfig.org/bash-scripting-tutorial
	# Linux PDF,OCR: http://blog.konradvoelkel.de/2013/03/scan-to-pdfa/
	# Dealing w/ alpha: http://stackoverflow.com/questions/5083492/problem-with-tesseract-and-tiff-format
	#
	# Install
	# brew install tesseract --HEAD
	# brew install imagemagick
	# brew install ghostscript
	# chmod +x img2pdf
	#
	# Usage
	# ./img2pdf *.gif
	#
	# If you have a mix of extensions:
	# ./img2pdf *.{gif,jpeg}
	#

	# Print a message on stderr and abort with a failure status.
	die() {
		printf '%s\n' "$*" >&2
		exit 1
	}

	# The output name is derived from the first image, so at least one is required.
	if [ "$#" -eq 0 ]; then
		printf 'Usage: %s image...\n' "${0##*/}" >&2
		exit 2
	fi

	# Absolute path of the first image (used for the banner and the output name).
	case "$1" in
		/*) y=$1 ;;
		*) y="$(pwd)/$1" ;;
	esac
	printf 'Creating a searchable PDF for %s\n' "$y"

	# The combined PDF is named after the first image, minus its extension,
	# and is written to the current directory.
	x=$(basename "$y")
	name=${x%.*}

	# All intermediate files live in a private scratch directory so that nothing
	# next to the input images (e.g. an existing page.txt or page.pdf) is ever
	# overwritten or deleted, and so that cleanup happens on every exit path.
	tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/img2pdf.XXXXXX") \
		|| die "Could not create a temporary directory"
	trap 'rm -rf -- "$tmpdir"' EXIT
	trap 'exit 1' HUP INT TERM

	# process each page
	page=0
	for f in "$@"; do
		printf '%s\n' "$f"
		[ -f "$f" ] || die "No such image: $f"

		# Zero-padded page numbers keep the glob below in argument order.
		page=$((page + 1))
		base=$(printf '%s/page-%05d' "$tmpdir" "$page")

		# Keep a file name that starts with a dash from being parsed as an option.
		case "$f" in
			-*) src=./$f ;;
			*) src=$f ;;
		esac

		printf '\tConverting to TIFF...\n'
		# Flatten onto white and drop the alpha channel, which tesseract cannot
		# handle. "-alpha off" is the current spelling of the legacy "+matte".
		convert "$src" -background white -flatten -alpha off "$base.tiff" \
			|| die "convert failed for $f"

		printf '\tTesseract OCR...\n'
		# Page segmentation mode 3 (fully automatic) is tesseract's default, so the
		# flag is omitted; its spelling differs between releases (-psm vs --psm).
		# Output is silenced because tesseract is chatty; failures are still caught
		# through the exit status.
		tesseract "$base.tiff" "$base" -l eng pdf >/dev/null 2>&1 \
			|| die "tesseract failed for $f"

		printf '\tCleanup...\n'
		# Some tesseract versions also emit a .txt file alongside the PDF.
		rm -f -- "$base.tiff" "$base.txt"
	done

	printf 'Combining all pages into a single PDF...\n'
	gs -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -q -sDEVICE=pdfwrite \
		"-sOutputFile=${name}.searchable.pdf" "$tmpdir"/page-*.pdf \
		|| die "gs failed to combine the pages"
	printf 'Created %s\n' "$name.searchable.pdf"