Created
September 15, 2013 18:17
-
-
Save diramazioni/6573112 to your computer and use it in GitHub Desktop.
A script I quickly came up with for my own needs: it extracts images from a PDF, preprocesses them, and extracts the text with tesseract.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Rasterize a range of pages from a scanned PDF, preprocess every scan
# (grayscale, 8-bit depth, brightness/contrast boost, unpaper cleanup) and OCR
# the two halves of each scan with tesseract, appending the recognized text
# to $OUTPUT.
#
# Intermediate images are kept on disk and reused when they already exist, so
# an interrupted run can be resumed. $OUTPUT is only ever appended to; delete
# it by hand before a fresh run if duplicated text is not wanted.
#
# Requires: ImageMagick (convert), netpbm (pngtopnm), unpaper, tesseract.

STARTPAGE=2      # first PDF page to process (1-based)
ENDPAGE=75       # last PDF page to process (inclusive)
SOURCE=book.pdf
OUTPUT=book.txt
RESOLUTION=600   # value passed to convert -density when rasterizing
# "1": cut every scan into two halves and run unpaper with --layout single on
#      each half.
# anything else: run unpaper once with --layout double on the whole scan.
LAYOUT="1"

# Drop any stale tesseract result left behind by an earlier run.
rm -f out.txt

# The intermediate and final images live in these directories; convert does
# not create them on its own.
mkdir -p 8 pages

# Run tesseract on one image and append its text to $OUTPUT under a banner.
# Arguments: $1 - image to OCR
#            $2 - page number printed in the banner
#            $3 - side label printed in the banner (A or B)
ocr_half() {
  local image=$1 page=$2 side=$3

  # Remove the previous result so a failed tesseract run cannot cause the
  # text of the preceding image to be appended a second time.
  rm -f out.txt
  tesseract "$image" out -l deu
  echo "--------------------------------- pdf $page $side ---------------------------------" >>"$OUTPUT"
  if [[ -e out.txt ]]; then
    cat out.txt >>"$OUTPUT"
  else
    echo "warning: no OCR output for $image" >&2
  fi
}

for ((i = STARTPAGE; i <= ENDPAGE; i++)); do
  echo "processing page $i"

  # Rasterize the page; the PDF page index used by convert is 0-based.
  if [[ ! -e "page_$i.png" ]]; then
    convert -colorspace Gray -crop 6704x5408+1894+0 -density "$RESOLUTION" \
      "${SOURCE}[$((i - 1))]" "page_$i.png"
  fi

  # need to convert to 8 color bit in order to make unpaper work
  if [[ ! -e "8/page8_$i.png" ]]; then
    convert -depth 8 -brightness-contrast +30x+100 "page_$i.png" "8/page8_$i.png"
  fi

  if [[ ! -e "pages/page_${i}_1.png" || ! -e "pages/page_${i}_2.png" ]]; then
    echo "**** unpaper $i"
    # unpaper will not overwrite existing output files, so clear leftovers.
    rm -f "page_$i.pnm" "page_${i}_1.pnm" "page_${i}_2.pnm" \
      "page_${i}_1u.pnm" "page_${i}_2u.pnm"

    if [[ "$LAYOUT" == "1" ]]; then
      # Split the scan into its two halves and clean each one separately.
      convert -crop 3352x5408+1894+0 "8/page8_$i.png" "8/page8_${i}_1.png"
      convert -crop 3352x5408+5246+0 "8/page8_$i.png" "8/page8_${i}_2.png"
      pngtopnm "8/page8_${i}_1.png" >"page_${i}_1.pnm"
      pngtopnm "8/page8_${i}_2.png" >"page_${i}_2.pnm"
      unpaper --layout single -dn left -ds 1500 -dd 0.6 -dv 1.1 \
        "page_${i}_1.pnm" "page_${i}_1u.pnm"
      unpaper --layout single -dn right -ds 1500 -dd 0.6 -dv 1.1 \
        "page_${i}_2.pnm" "page_${i}_2u.pnm"
      cleaned_suffix="u"
    else
      # Let unpaper split the double page itself; the %d template makes it
      # write page_<i>_1.pnm and page_<i>_2.pnm (no "u" suffix).
      pngtopnm "8/page8_$i.png" >"page_$i.pnm"
      unpaper --layout double --output-pages 2 -dn left -ds 1500 -dd 0.6 \
        -dv 1.1 -mw 350 "page_$i.pnm" "page_${i}_%d.pnm"
      cleaned_suffix=""
    fi

    # Pick up whichever files the selected layout actually produced.
    convert "page_${i}_1${cleaned_suffix}.pnm" "pages/page_${i}_1.png"
    convert "page_${i}_2${cleaned_suffix}.pnm" "pages/page_${i}_2.png"
  fi

  ocr_half "pages/page_${i}_1.png" "$i" A
  ocr_half "pages/page_${i}_2.png" "$i" B
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The value of LAYOUT controls the mode unpaper works in: with the double layout, unpaper performs the deskew by comparing the left and right sides. I've tried both ways with mixed results; it seems that --deskew-scan-size plays a big role in fixing incorrectly rotated images, so test what works best for your scanned pages.