gtfierro · May 11, 2022 16:57 · jon4thin · May 11, 2022
diff --git a/run_ocr.sh b/run_ocr.sh
 #!/bin/bash

 # Batch-OCRs every PDF found under the current directory.
 # Usage: run_ocr.sh [num_cores]
 #
 # requires ghostscript (http://www.ghostscript.com/)
 # requires ImageMagick
 # requires tesseract (https://code.google.com/p/tesseract-ocr/)
 # requires GNU parallel (https://www.gnu.org/software/parallel/)
 # all of these are typically available through yum/apt/brew/etc.

 # number of cores over which the process will be parallelized.
 # When no argument is given, fall back to 100% (GNU parallel's own default
 # job count) rather than handing an empty value to -j.
 num_cores=${1:-100%}

 # converts each of the PDFs into TIFF images so that tesseract can interact with them.
 # The [0-19] suffix is ImageMagick's frame selector (pages 0 through 19 of each PDF);
 # it is quoted so the invoking shell never treats it as a glob pattern.
 find . -name '*.pdf' | parallel --gnu -j "$num_cores" convert -depth 8 -density 200 '{}[0-19]' '{}.tif'

 # runs OCR on the found TIFF files and converts them to text. Assumes English, but you can supply
 # extra arguments to tesseract. The second {} is tesseract's output base name, so the
 # text lands next to each TIFF. --gnu is passed here as well to match the step above.
 find . -name '*.tif' | parallel --gnu -j "$num_cores" tesseract -l eng '{}' '{}'
	#!/bin/bash

	# Batch-OCRs every PDF found under the current directory.
	# Usage: run_ocr.sh [num_cores]
	#
	# requires ghostscript (http://www.ghostscript.com/)
	# requires ImageMagick
	# requires tesseract (https://code.google.com/p/tesseract-ocr/)
	# requires GNU parallel (https://www.gnu.org/software/parallel/)
	# all of these are typically available through yum/apt/brew/etc.

	# number of cores over which the process will be parallelized.
	# When no argument is given, fall back to 100% (GNU parallel's own default
	# job count) rather than handing an empty value to -j.
	num_cores=${1:-100%}

	# converts each of the PDFs into TIFF images so that tesseract can interact with them.
	# The [0-19] suffix is ImageMagick's frame selector (pages 0 through 19 of each PDF);
	# it is quoted so the invoking shell never treats it as a glob pattern.
	# The pipe must be a real, unescaped | so that find's output feeds parallel.
	find . -name '*.pdf' | parallel --gnu -j "$num_cores" convert -depth 8 -density 200 '{}[0-19]' '{}.tif'

	# runs OCR on the found TIFF files and converts them to text. Assumes English, but you can supply
	# extra arguments to tesseract. The second {} is tesseract's output base name, so the
	# text lands next to each TIFF. --gnu is passed here as well to match the step above.
	find . -name '*.tif' | parallel --gnu -j "$num_cores" tesseract -l eng '{}' '{}'