kanru · January 19, 2012 08:19
diff --git a/pdf-ocr.sh b/pdf-ocr.sh
 #! /bin/sh
 # Batch OCR pdf files to text files
 #
 # Copyright (C) 2012  Kan-Ru Chen <[email protected]>
 #  
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #  
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #  
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

 # Prerequisites:
 #  - tesseract 3.00
 #  - Chinese (Traditional) language data for Tesseract
 #  - mupdf-tools
 #  - imagemagick
 #
 # You can get the tesseract files from
 # https://code.google.com/p/tesseract-ocr/downloads/list

 # Path to the tesseract executable (relative to the current directory).
 TESSERACT='api/tesseract'
 # Language code handed to tesseract with its -l option.
 LANGDATA='chi_tra'

 # Print the one-line usage synopsis on stdout.
 usage()
 {
     printf '%s\n' 'pdf-ocr.sh [pdf file]'
 }

 # A missing or empty first argument means no PDF was named:
 # show the synopsis and stop with a failure status.
 case "$1" in
     '')
         usage
         exit 1
         ;;
 esac

 # Main pipeline: render every page of the PDF to PNG, convert each PNG
 # to TIFF, OCR each TIFF with tesseract, then append the per-page text
 # (each followed by a blank line) to "<pdf>.txt".
 #
 # Reads:  $1 (PDF path), $TESSERACT, $LANGDATA.
 # Writes: "$PDF.txt" (appended to, never truncated) and a scratch
 #         directory in the current working directory that is removed
 #         on every exit path.
 # Exits non-zero if the scratch directory cannot be created, if no
 # page is rendered, or if any render/convert/OCR step fails.
 PDF="$1"

 TEMP=$(mktemp -d pdf-ocr.XXXXXXXXXX) || exit 1
 # Clean up the scratch directory however the script ends; the signal
 # trap turns HUP/INT/TERM into a normal exit so the EXIT trap runs.
 trap 'rm -rf -- "$TEMP"' EXIT
 trap 'exit 1' HUP INT TERM

 pdfdraw -o "$TEMP/page%d.png" -r 300 "$PDF" || exit 1

 # Collect the rendered page numbers and sort them numerically. A plain
 # glob (or ls) orders names lexicographically, which would put page10
 # before page2 and scramble the text of documents longer than 9 pages.
 PAGES=$(
     for png in "$TEMP"/page*.png; do
         [ -e "$png" ] || continue   # glob matched nothing
         n=${png##*/page}
         printf '%s\n' "${n%.png}"
     done | sort -n
 )
 if [ -z "$PAGES" ]; then
     echo "pdf-ocr.sh: no pages rendered from $PDF" >&2
     exit 1
 fi

 # $PAGES holds only digits and newlines, so the unquoted expansion in
 # the loops below is a deliberate, safe word split.
 for n in $PAGES; do
     convert "$TEMP/page$n.png" "$TEMP/page$n.png.tif" || exit 1
 done
 for n in $PAGES; do
     tif="$TEMP/page$n.png.tif"
     echo "OCRing $tif"
     # The second argument is the output base; the text is read back
     # from "$tif.txt" below.
     "$TESSERACT" "$tif" "$tif" -l "$LANGDATA" || exit 1
 done
 # Text is written only after every page has been OCRed, so a failure
 # above never leaves a partial document appended to the output.
 for n in $PAGES; do
     { cat "$TEMP/page$n.png.tif.txt"; echo; } >> "$PDF.txt"
 done
	#! /bin/sh
	# Batch OCR pdf files to text files
	#
	# Copyright (C) 2012 Kan-Ru Chen <[email protected]>
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <http://www.gnu.org/licenses/>.

	# Prerequisites:
	# - tesseract 3.00
	# - Chinese (Traditional) language data for Tesseract
	# - mupdf-tools
	# - imagemagick
	#
	# You can get the tesseract files from
	# https://code.google.com/p/tesseract-ocr/downloads/list

	# Path to the tesseract executable (relative to the current directory).
	TESSERACT='api/tesseract'
	# Language code handed to tesseract with its -l option.
	LANGDATA='chi_tra'

	# Print the one-line usage synopsis on stdout.
	usage()
	{
	    printf '%s\n' 'pdf-ocr.sh [pdf file]'
	}

	# A missing or empty first argument means no PDF was named:
	# show the synopsis and stop with a failure status.
	case "$1" in
	    '')
	        usage
	        exit 1
	        ;;
	esac

	# Main pipeline: render every page of the PDF to PNG, convert each PNG
	# to TIFF, OCR each TIFF with tesseract, then append the per-page text
	# (each followed by a blank line) to "<pdf>.txt".
	#
	# Reads:  $1 (PDF path), $TESSERACT, $LANGDATA.
	# Writes: "$PDF.txt" (appended to, never truncated) and a scratch
	#         directory in the current working directory that is removed
	#         on every exit path.
	# Exits non-zero if the scratch directory cannot be created, if no
	# page is rendered, or if any render/convert/OCR step fails.
	PDF="$1"

	TEMP=$(mktemp -d pdf-ocr.XXXXXXXXXX) || exit 1
	# Clean up the scratch directory however the script ends; the signal
	# trap turns HUP/INT/TERM into a normal exit so the EXIT trap runs.
	trap 'rm -rf -- "$TEMP"' EXIT
	trap 'exit 1' HUP INT TERM

	pdfdraw -o "$TEMP/page%d.png" -r 300 "$PDF" || exit 1

	# Collect the rendered page numbers and sort them numerically. A plain
	# glob (or ls) orders names lexicographically, which would put page10
	# before page2 and scramble the text of documents longer than 9 pages.
	PAGES=$(
	    for png in "$TEMP"/page*.png; do
	        [ -e "$png" ] || continue   # glob matched nothing
	        n=${png##*/page}
	        printf '%s\n' "${n%.png}"
	    done | sort -n
	)
	if [ -z "$PAGES" ]; then
	    echo "pdf-ocr.sh: no pages rendered from $PDF" >&2
	    exit 1
	fi

	# $PAGES holds only digits and newlines, so the unquoted expansion in
	# the loops below is a deliberate, safe word split.
	for n in $PAGES; do
	    convert "$TEMP/page$n.png" "$TEMP/page$n.png.tif" || exit 1
	done
	for n in $PAGES; do
	    tif="$TEMP/page$n.png.tif"
	    echo "OCRing $tif"
	    # The second argument is the output base; the text is read back
	    # from "$tif.txt" below.
	    "$TESSERACT" "$tif" "$tif" -l "$LANGDATA" || exit 1
	done
	# Text is written only after every page has been OCRed, so a failure
	# above never leaves a partial document appended to the output.
	for n in $PAGES; do
	    { cat "$TEMP/page$n.png.tif.txt"; echo; } >> "$PDF.txt"
	done
No results found