Last active
November 13, 2019 21:24
-
-
Save Turupawn/88fc893405ce6b2749b18715d14c47e7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# OCR a document into plain text.
#
# Pipeline:
#   1. If ./input.doc exists, convert it to PDF with LibreOffice Writer
#      (lowriter), which is expected to leave input.pdf in the current
#      directory.
#   2. Rasterize input.pdf into one PNG per page under ./image/.
#   3. Run tesseract with the "spa" language on every PNG, writing
#      sequentially numbered, zero-padded text files under ./text/.
#   4. Append every ./text/*.txt file, in glob order, to output.txt.
#
# External tools used: lowriter, pdftoppm, tesseract.
#
# Note: output.txt is appended to, not truncated, and ./image and ./text are
# reused if they already exist, so leftovers from an earlier run are picked
# up again. Start from a clean directory for a fresh result.

set -u
# Unmatched globs expand to nothing instead of the literal pattern, so the
# loops below never hand a bogus "*.png" / "*.txt" path to a tool.
shopt -s nullglob

FILE=./input.doc

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Step 1: the .doc is optional; an existing input.pdf is used when it is absent.
if [[ -f "$FILE" ]]; then
  lowriter --convert-to pdf:writer_pdf_Export "$FILE" \
    || die "error: could not convert $FILE to PDF"
fi

[[ -f input.pdf ]] || die "error: input.pdf not found"

# Step 2: one PNG per page, named image/page-<n>.png by pdftoppm.
mkdir -p image || die "error: could not create ./image"
pdftoppm input.pdf image/page -png \
  || die "error: could not rasterize input.pdf"

# ---------------------------------------------------------------------------
# Optional image pre-processing steps, currently disabled and kept for
# reference. NOTE(review): these look intended for scans where each PDF page
# holds two book pages side by side (duplicate as A/B, then crop each half);
# confirm the pixel offsets against the actual scans before enabling.
# ---------------------------------------------------------------------------

# DUPLICATE
#page=1
#for image_file in ./image/*.png; do
#  additional_zeros=""
#  if [ $page -lt 10 ]
#  then
#    additional_zeros="00"
#  elif [ $page -lt 100 ]
#  then
#    additional_zeros="0"
#  fi
#  cp $image_file "./image/"$additional_zeros$page"A.png"
#  mv $image_file "./image/"$additional_zeros$page"B.png"
#  page=$((page+1))
#done

# CROP NORTH
#for image_file in ./image/*.png; do
#  convert $image_file -gravity North -chop 0x474 $image_file
#done

# CROP SOUTH
#for image_file in ./image/*.png; do
#  convert $image_file -gravity South -chop 0x426 $image_file
#done

# CROP EAST
#for image_file in ./image/*A.png; do
#  convert $image_file -gravity East -chop 825x0 $image_file
#done

# CROP WEST
#for image_file in ./image/*B.png; do
#  convert $image_file -gravity West -chop 825x0 $image_file
#done

# Step 3: OCR each page image. Output names are the running page counter
# padded to at least three digits (001, 002, ..., 100, ...) so that the glob
# in step 4 returns them in page order.
mkdir -p text || die "error: could not create ./text"
page=1
failed=0
for image_file in ./image/*.png; do
  printf -v page_label '%03d' "$page"
  # A single bad page should not discard the rest of the document: warn,
  # remember the failure, and keep going.
  if ! tesseract "$image_file" "./text/$page_label" -l spa; then
    printf 'warning: OCR failed for %s\n' "$image_file" >&2
    failed=$((failed + 1))
  fi
  page=$((page + 1))
done

(( page > 1 )) || die "error: no page images found in ./image"

# Step 4: concatenate the per-page text. The array guard matters because,
# with nullglob, an empty match would leave cat reading from stdin.
text_files=(./text/*.txt)
(( ${#text_files[@]} > 0 )) || die "error: no OCR output found in ./text"
cat -- "${text_files[@]}" >> output.txt \
  || die "error: could not write output.txt"

# Report partial failure through the exit status after writing what we have.
(( failed == 0 )) || die "error: OCR failed on $failed page(s)"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment