Last active
June 27, 2017 08:14
-
-
Save drjwbaker/dbb8396cdf840a3a44c70484659dafad to your computer and use it in GitHub Desktop.
Tesseract/docker script for multiple files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
docker ps -f name=t4re | |
TASK_TMP_DIR=TASK_$$_$(date +"%N") | |
echo "====== TASK $TASK_TMP_DIR started ======" | |
docker exec -it t4re mkdir \-p ./$TASK_TMP_DIR/ | |
docker cp ./ocr-files/. t4re:/home/work/$TASK_TMP_DIR/ | |
docker exec -it t4re /bin/bash -c "mkdir -p ./$TASK_TMP_DIR/out/; cd ./$TASK_TMP_DIR/out/; for file in *.tif ; do outfilename="${file%.*}" ; tesseract $file $outfilename -l eng --psm 1 --oem 2 txt pdf hocr; done" | |
mkdir -p ./ocr-files/output/$TASK_TMP_DIR/ | |
docker cp t4re:/home/work/$TASK_TMP_DIR/out/ ./ocr-files/output/$TASK_TMP_DIR/ | |
docker exec -it t4re rm \-r ./$TASK_TMP_DIR/ | |
docker exec -it t4re ls | |
echo "====== Result files was copied to ./ocr-files/output/$TASK_TMP_DIR/ ======" | |
# This script https://github.com/tesseract-shadow/tesseract-ocr-re/blob/master/scripts/test.sh enables me to run Tesseract through Docker to OCR a single file (whatever is at `/ocr-files/phototest.tif`). | |
# I need a script that lets me run this against multiple files. | |
# Thanks to Ben https://twitter.com/antinomy/status/879444295136292864 and Ben https://twitter.com/benosteen/status/879378121677524992 I've adapted the above script and got as far as the above. | |
# All have done is amend line 6 to change the files copies and everything from `for file in` on line 7. | |
# I get the traceback: | |
# ====== TASK TASK_15811_261629617 started ====== | |
# Tesseract Open Source OCR Engine v4.00.00alpha with Leptonica | |
# Error in fopenReadStream: file not found | |
# Error in findFileFormat: image file not found | |
# Error during processing. | |
# ====== Result files was copied to ./ocr-files/output/TASK_15811_261629617/ ====== | |
# I have also tried - with the same results - the slightly different: | |
#!/bin/bash | |
docker ps -f name=t4re | |
TASK_TMP_DIR=TASK_$$_$(date +"%N") | |
echo "====== TASK $TASK_TMP_DIR started ======" | |
docker exec -it t4re mkdir \-p ./$TASK_TMP_DIR/ | |
docker cp ./ocr-files/. t4re:/home/work/$TASK_TMP_DIR/ | |
docker exec -it t4re /bin/bash -c "mkdir -p ./$TASK_TMP_DIR/out/; cd ./$TASK_TMP_DIR/out/; for file in *.tif ; do tesseract $file outfilename="${file%.*}" -l eng --psm 1 --oem 2 txt pdf hocr; done" | |
mkdir -p ./ocr-files/output/$TASK_TMP_DIR/ | |
docker cp t4re:/home/work/$TASK_TMP_DIR/out/ ./ocr-files/output/$TASK_TMP_DIR/ | |
docker exec -it t4re rm \-r ./$TASK_TMP_DIR/ | |
docker exec -it t4re ls | |
echo "====== Result files was copied to ./ocr-files/output/$TASK_TMP_DIR/ ======" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment