Last active
December 31, 2020 11:55
-
-
Save jovandeginste/c9c03ba24e7502a54e47049e7bd569fd to your computer and use it in GitHub Desktop.
Scan single or multi-page document, post-process it and convert to OCR'ed PDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Usage: scandoc [expected number of pages]
# Eg:
#   scandoc 1   # Scan one page, then process
#   scandoc 5   # Scan 5 pages, then process
#   scandoc     # Scan until page 99 or other action triggers end
#
# Description:
#   Scandoc interactively scans a number (1 or more) of pages, runs
#   post-processing operations on each page, runs OCR on the high quality
#   version and generates a new, lower quality PDF. The OCR from the high
#   quality version and the images from the low quality version are then
#   merged into a single PDF.
#
#   This final PDF should be fairly low size, very readable, and contain
#   accurately recognized text.
#
# Requires:
#   - scanimage    for scanning
#   - unpaper      for rectifying, removing background
#   - ImageMagick  (convert) for merging, resampling and compressing
#   - tesseract    for OCR
#   - qpdf         to merge image and text pdf
#
# Use scanimage -L to get a list of devices.
# e.g. device `genesys:libusb:006:003' is a Canon LiDE 210 flatbed scanner
# then copy/paste genesys:libusb:006:003 into SCANNER below.

# Strict mode is enabled here instead of on the shebang line so that it is
# also in effect when the script is started as `bash scandoc`.
set -eu

PAGES=${1:-99} # Parameter: number of pages; defaults to 99 (which is equivalent to 'unlimited')

# Parameters based on environment variables
# Sensible defaults are set for me
# You can set your own defaults per shell by exporting them, or
# eg. add them to your ~/.bashrc
# The expansions are quoted so the assigned values are never word-split or
# glob-expanded when ':' evaluates its arguments.
: "${SCANNER:=brother4:bus9;dev1}" # My USB scanner
: "${OCR_LANG:=nld}"               # Tesseract language
: "${SCAN_DPI:=300}"               # A good resolution for OCR
: "${FINAL_RES:=1240x}"            # A good resolution for PDF
: "${SHOW:=0}"                     # Show the result
: "${NICE:=10}"                    # Priority of this process - this prevents Tesseract from hogging your machine
: "${UNPAPER:=1}"                  # Whether to run unpaper
: "${FILENAME:=AUTO}"              # If no FILENAME is set, or set to 'AUTO': make one up based on time

if [[ "${FILENAME}" == "AUTO" ]]; then
  # Same as %F_%T but with '-' between the time fields: don't use ':'
  # because of Windows.
  FILENAME="$(date '+%F_%H-%M-%S').pdf"
fi
# Verify that every external tool used by the pipeline is available before
# any scanning starts. All missing tools are reported (on stderr) in one go,
# then the script exits with status 1.
# unpaper is only required when UNPAPER is 1 (the default), because it is
# only invoked in that case.
required_commands=(scanimage convert tesseract qpdf)
if [[ "${UNPAPER:-1}" == "1" ]]; then
  required_commands+=(unpaper)
fi

missing_dependency=0
for required in "${required_commands[@]}"; do
  if ! command -v "${required}" &>/dev/null; then
    echo "Missing command: '${required}'" >&2
    missing_dependency=1
  fi
done

if [[ "${missing_dependency}" == "1" ]]; then
  exit 1
fi
# Check that PAGES is a positive whole number.
re='^[0-9]+$'
if ! [[ "${PAGES}" =~ ${re} ]]; then
  echo "error: Usage: $0 [number_of_pages]" >&2
  exit 1
fi
# Force base 10 so that zero-padded input such as "08" is not treated as an
# (invalid) octal constant by later arithmetic on PAGES.
PAGES=$((10#${PAGES}))
if ((PAGES < 1)); then
  echo "error: number_of_pages must be at least 1" >&2
  exit 1
fi
# Working directory for intermediate scans; removed again on every exit path.
TMP_DIR=$(mktemp -d)

# Remove the working directory (verbosely, so the user sees what was deleted).
# The path is quoted so a TMPDIR containing spaces is handled, and ':?' aborts
# instead of running rm on an empty path.
cleanup_tmp_dir() {
  rm -vrf -- "${TMP_DIR:?}"
}
trap cleanup_tmp_dir EXIT
# Scan one page into TMP_DIR as page_NNN.tiff and, when UNPAPER is 1, start
# unpaper on it in the background so it runs while the next page is handled.
# Globals:   TMP_DIR, SCANNER, SCAN_DPI, UNPAPER (read)
# Arguments: $1 - page number, used zero-padded in the file name
scan_page() {
  local page_file
  printf -v page_file '%s/page_%03d.tiff' "${TMP_DIR}" "$1"
  scanimage -p -d "${SCANNER}" \
    --format=tiff \
    --resolution "${SCAN_DPI}" \
    --output-file "${page_file}"
  if [[ "${UNPAPER}" == "1" ]]; then
    # Tell unpaper the resolution the page was actually scanned at.
    unpaper -q --dpi "${SCAN_DPI}" "${page_file}" "${page_file%.tiff}-unpaper.pnm" &
  fi
}

# Interactive scan loop: prompt before every page until PAGES pages were
# scanned or the user ends the session.
echo "Scanning to: '${FILENAME}'"
PAGE=0
while ((PAGE < PAGES)); do
  PAGE=$((PAGE + 1))
  echo "Please, place page ${PAGE}/${PAGES} on the scanner"
  cat <<EOF
Press:
[enter] to scan,
[q] to stop scanning and start parsing,
[l] to scan one more page and then start parsing,
[m] to scan and increase the total number of pages (${PAGES} -> $((PAGES + 1)))
[ctrl+c] to abort
EOF
  # -r keeps backslashes literal; end-of-input is treated like 'q' so the
  # pages scanned so far are still processed.
  read -r ACTION || ACTION="q"
  if [[ "${ACTION}" == "q" ]]; then
    break
  fi

  echo "Start scanimage..."
  scan_page "${PAGE}"

  case "${ACTION}" in
    l) break ;;                     # that was the last page
    m) PAGES=$((PAGES + 1)) ;;      # one more page than planned
  esac
done
# Post-processing: wait for the background unpaper jobs, merge the pages,
# run OCR and combine image PDF and text PDF into FILENAME.
echo "Waiting for unpaper to finish..."
wait

# Lower the priority of this process before the merge and OCR steps.
echo "Changing priority to '${NICE}' (higher is lower)"
renice -n "${NICE}" $$

# Collect the page files; nullglob makes an unmatched pattern expand to
# nothing instead of the literal pattern.
shopt -s nullglob
scanned_pages=("${TMP_DIR}"/page_*.tiff)
cleaned_pages=("${TMP_DIR}"/page_*-unpaper.pnm)
shopt -u nullglob

if ((${#scanned_pages[@]} == 0)); then
  echo "error: no pages were scanned, nothing to process" >&2
  exit 1
fi

echo "Merging pages..."
if [[ "${UNPAPER}" == "1" ]]; then
  # A plain 'wait' does not report failures of background jobs, so make sure
  # there is one unpaper result per scanned page before merging.
  if ((${#cleaned_pages[@]} != ${#scanned_pages[@]})); then
    echo "error: unpaper produced ${#cleaned_pages[@]} page(s) for ${#scanned_pages[@]} scanned page(s)" >&2
    exit 1
  fi
  convert "${cleaned_pages[@]}" "${TMP_DIR}/combined.tiff"
else
  convert "${scanned_pages[@]}" "${TMP_DIR}/combined.tiff"
fi

# Low-size image PDF: scaled down and JPEG-compressed.
convert "${TMP_DIR}/combined.tiff" -scale "${FINAL_RES}" -compress jpeg -quality 75 "${TMP_DIR}/combined.pdf"

echo "Running OCR..."
# Text-only PDF generated from the full-resolution TIFF.
tesseract -l "${OCR_LANG}" -c textonly_pdf=1 "${TMP_DIR}/combined.tiff" "${TMP_DIR}/ocr" pdf

# Put the OCR text layer underneath the compressed page images.
qpdf "${TMP_DIR}/combined.pdf" --underlay "${TMP_DIR}/ocr.pdf" -- "${FILENAME}"
echo "Scan of '${FILENAME}' is done"

# Quoted, and guarded against an empty path.
rm -rf -- "${TMP_DIR:?}"

if [[ "${SHOW}" == "1" ]]; then
  xdg-open "${FILENAME}"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment