vehrka · January 28, 2015 07:24
diff --git a/ocr.sh b/ocr.sh
 #!/bin/sh
 #
 # OCR a page range of a PDF into a single text file: each page is rasterised
 # with ImageMagick `convert`, recognised with `tesseract`, and the recognised
 # text is appended to OUTPUT. Aborts with a non-zero status as soon as any
 # step fails, so a failed page can never contribute stale text.

 set -u

 STARTPAGE=1 # page number of the first page of the PDF to convert
 ENDPAGE=11 # page number of the last page of the PDF to convert
 SOURCE=source.pdf # file name of the PDF
 OUTPUT=destination.txt # final output file (recognised text is appended to it)
 RESOLUTION=75 # resolution the scanner used (75 suffices for good B/W scans)

 # Print the given message on stderr and abort the script.
 die() {
   printf '%s\n' "$*" >&2
   exit 1
 }

 # Keep intermediate files in a private directory instead of fixed names in
 # /tmp, so concurrent runs cannot clobber each other and nobody else can
 # pre-create the files. The directory is removed on every exit path.
 workdir=$(mktemp -d "${TMPDIR:-/tmp}/ocr.XXXXXX") || die "cannot create temporary directory"
 trap 'rm -rf -- "${workdir:?}"' EXIT
 trap 'exit 1' HUP INT TERM

 touch -- "$OUTPUT" || die "cannot write to $OUTPUT"

 i=$STARTPAGE
 while [ "$i" -le "$ENDPAGE" ]; do
   printf 'extracting page %s\n' "$i"
   # "file[N]" selects a single page of the PDF; 1 is subtracted because the
   # page numbers configured above start at 1.
   convert -monochrome -density "$RESOLUTION" "${SOURCE}[$((i - 1))]" "$workdir/page.tif" \
     || die "convert failed on page $i"

   printf 'processing page %s\n' "$i"
   # tesseract appends ".txt" to the output base name it is given.
   # NOTE(review): newer Tesseract releases spell the page-segmentation
   # option "--psm"; confirm against the installed version.
   tesseract -l spa -psm 3 "$workdir/page.tif" "$workdir/page" \
     || die "tesseract failed on page $i"

   cat "$workdir/page.txt" >> "$OUTPUT" || die "cannot append to $OUTPUT"
   i=$((i + 1))
 done
	#!/bin/sh
	#
	# OCR a page range of a PDF into a single text file: each page is rasterised
	# with ImageMagick `convert`, recognised with `tesseract`, and the recognised
	# text is appended to OUTPUT. Aborts with a non-zero status as soon as any
	# step fails, so a failed page can never contribute stale text.

	set -u

	STARTPAGE=1 # page number of the first page of the PDF to convert
	ENDPAGE=11 # page number of the last page of the PDF to convert
	SOURCE=source.pdf # file name of the PDF
	OUTPUT=destination.txt # final output file (recognised text is appended to it)
	RESOLUTION=75 # resolution the scanner used (75 suffices for good B/W scans)

	# Print the given message on stderr and abort the script.
	die() {
	  printf '%s\n' "$*" >&2
	  exit 1
	}

	# Keep intermediate files in a private directory instead of fixed names in
	# /tmp, so concurrent runs cannot clobber each other and nobody else can
	# pre-create the files. The directory is removed on every exit path.
	workdir=$(mktemp -d "${TMPDIR:-/tmp}/ocr.XXXXXX") || die "cannot create temporary directory"
	trap 'rm -rf -- "${workdir:?}"' EXIT
	trap 'exit 1' HUP INT TERM

	touch -- "$OUTPUT" || die "cannot write to $OUTPUT"

	i=$STARTPAGE
	while [ "$i" -le "$ENDPAGE" ]; do
	  printf 'extracting page %s\n' "$i"
	  # "file[N]" selects a single page of the PDF; 1 is subtracted because the
	  # page numbers configured above start at 1.
	  convert -monochrome -density "$RESOLUTION" "${SOURCE}[$((i - 1))]" "$workdir/page.tif" \
	    || die "convert failed on page $i"

	  printf 'processing page %s\n' "$i"
	  # tesseract appends ".txt" to the output base name it is given.
	  # NOTE(review): newer Tesseract releases spell the page-segmentation
	  # option "--psm"; confirm against the installed version.
	  tesseract -l spa -psm 3 "$workdir/page.tif" "$workdir/page" \
	    || die "tesseract failed on page $i"

	  cat "$workdir/page.txt" >> "$OUTPUT" || die "cannot append to $OUTPUT"
	  i=$((i + 1))
	done