Last active
October 3, 2019 14:07
-
-
Save simonszu/17c4429d5b509d743fc1690f7471dd1c to your computer and use it in GitHub Desktop.
A process for deskewing, cropping, and OCRing scanned documents, just like the Doxie scanner app does - but for the command line.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Emulates the Doxie scanner workflow for every *.JPG found in ./Input:
#   1. Deskew             (ImageMagick)  Input/    -> Deskewed/
#   2. Crop black border  (ImageMagick)  Deskewed/ -> Cropped/
#   3. OCR to PDF         (tesseract)    Cropped/  -> Output/<name>.pdf
# Deskewed/ and Cropped/ are intermediate directories and are removed at the
# end; only Output/ is kept.
#
# Usage: run without arguments from the directory that contains Input/.
# Exit status: 0 on success, 1 if Input/ or a required tool is missing or if
# any file failed in any stage.
#
# Dependencies: imagemagick, tesseract, tesseract-data-deu
# NOTE(review): tesseract is invoked without -l, so it uses its default
# language even though the German data pack is listed above — confirm whether
# "-l deu" was intended.

set -u
# An unmatched glob must expand to nothing instead of the literal "*.JPG".
shopt -s nullglob

# Number of per-file failures across all stages.
failures=0

# Print an error message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Report a failed step for one file and remember that something went wrong.
# Arguments: $1 - stage name, $2 - path of the file that failed
note_failure() {
  printf 'warning: %s failed for %s\n' "$1" "$2" >&2
  failures=$((failures + 1))
}

# Straighten every image in Input/ and write the result to Deskewed/.
deskew_images() {
  local src
  for src in Input/*.JPG; do
    convert "$src" -set option:deskew:auto-crop true -background black \
      -deskew 40% "Deskewed/${src##*/}" || note_failure "deskew" "$src"
  done
}

# Trim the black scanner border from every image in Deskewed/ into Cropped/.
crop_images() {
  local src
  for src in Deskewed/*.JPG; do
    # The 1px black border guarantees -trim has a black edge to key on.
    convert "$src" -bordercolor black -border 1x1 -fuzz 40% -trim \
      "Cropped/${src##*/}" || note_failure "crop" "$src"
  done
}

# OCR every cropped image; tesseract appends ".pdf" to the output base name.
ocr_images() {
  local src stem
  for src in Cropped/*.JPG; do
    # Strip only the final extension so "scan.2019.JPG" becomes "scan.2019"
    # rather than colliding with every other "scan.*" file.
    stem=${src##*/}
    stem=${stem%.JPG}
    tesseract "$src" "Output/${stem}" pdf || note_failure "OCR" "$src"
  done
}

main() {
  local tool
  local -a inputs

  [[ -d Input ]] \
    || die "Input directory not found. Please create Input directory."

  for tool in convert tesseract; do
    command -v "$tool" >/dev/null || die "Required tool not found: $tool"
  done

  inputs=(Input/*.JPG)
  if (( ${#inputs[@]} == 0 )); then
    echo "No .JPG files found in Input. Nothing to do."
    return 0
  fi

  # mkdir -p is a no-op for directories that already exist.
  mkdir -p Deskewed Cropped Output \
    || die "Could not create working directories."

  echo "Deskewing images..."
  deskew_images

  echo "Cropping images..."
  crop_images

  echo "OCR processing..."
  ocr_images

  # Clean up the intermediate folders. All paths are relative to the start
  # directory (the script never changes directory), so this cannot hit an
  # unexpected location.
  rm -r -- Deskewed Cropped

  if (( failures > 0 )); then
    printf '%d step(s) failed; see warnings above.\n' "$failures" >&2
    return 1
  fi
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment