Skip to content

Instantly share code, notes, and snippets.

@jovandeginste
Last active December 31, 2020 11:55
Show Gist options
  • Save jovandeginste/c9c03ba24e7502a54e47049e7bd569fd to your computer and use it in GitHub Desktop.
Save jovandeginste/c9c03ba24e7502a54e47049e7bd569fd to your computer and use it in GitHub Desktop.
Scan single or multi-page document, post-process it and convert to OCR'ed PDF
#!/bin/bash
# Strict mode: abort on any unhandled command failure (-e) and on use of an
# unset variable (-u). It is enabled with `set` instead of on the shebang line
# so that it also applies when the script is started as `bash scandoc`.
set -eu
# Usage: scandoc [expected number of pages]
# Eg:
# scandoc 1 # Scan one page, then process
# scandoc 5 # Scan 5 pages, then process
# scandoc # Scan until page 99 or other action triggers end
#
# Description:
# Scandoc interactively scans a number (1 or more) of pages, runs
# post-processing operations on each page, runs OCR on the high quality
# version and generates a new, lower quality PDF. The OCR from the high
# quality version and the images from the low quality version are then
# merged into a single PDF.
#
# This final PDF should be fairly low size, very readable, and contain
# accurately recognized text.
#
# Requires:
# - scanimage for scanning
# - unpaper for rectifying, removing background
# - imagemagick (convert) for merging, resampling and compressing
# - tesseract for OCR
# - qpdf to merge image and text pdf
#
# Use scanimage -L to get a list of devices.
# e.g. device `genesys:libusb:006:003' is a Canon LiDE 210 flatbed scanner
# then copy/paste genesys:libusb:006:003 into SCANNER below.
PAGES=${1:-99} # Parameter: number of pages; defaults to 99 (which is equivalent to 'unlimited')

# Parameters based on environment variables.
# The defaults below suit the original author's setup. Override them per shell
# by exporting the variable, or persistently by adding it to e.g. ~/.bashrc.
# Each expansion is quoted so that a value containing glob characters or
# whitespace is not expanded or split when handed to the no-op ':' command.
: "${SCANNER:=brother4:bus9;dev1}" # My USB scanner
: "${OCR_LANG:=nld}"               # Tesseract language
: "${SCAN_DPI:=300}"               # A good resolution for OCR
: "${FINAL_RES:=1240x}"            # A good resolution for PDF
: "${SHOW:=0}"                     # Show the result
: "${NICE:=10}"                    # Priority of this process - this prevents Tesseract from hogging your machine
: "${UNPAPER:=1}"                  # Whether to run unpaper
: "${FILENAME:=AUTO}"              # If no FILENAME is set, or set to 'AUTO': make one up based on time

if [[ "${FILENAME}" == "AUTO" ]]; then
  # Timestamped name such as 2020-12-31_11-55-00.pdf. The time fields are
  # joined with '-' rather than ':' because ':' is not usable on Windows.
  FILENAME="$(date "+%F_%H-%M-%S").pdf"
fi
# Verify that every external tool the script relies on is installed before
# anything is scanned. Every missing tool is reported on stderr, and the script
# then exits with status 1. The missing tool itself is never executed.
missing_tools=0
for tool in unpaper scanimage convert tesseract qpdf; do
  if ! command -v "${tool}" &>/dev/null; then
    echo "Missing command: '${tool}'" >&2
    missing_tools=1
  fi
done
if [[ "${missing_tools}" == "1" ]]; then
  exit 1
fi
# PAGES must consist of decimal digits only; anything else is a usage error.
pages_pattern='^[0-9]+$'
if ! [[ ${PAGES} =~ ${pages_pattern} ]]; then
  echo "error: Usage: $0 [number_of_pages]" >&2
  exit 1
fi
# Force base 10 so that a value with leading zeros (e.g. "08") is not
# rejected as an invalid octal number by the arithmetic further down.
PAGES=$((10#${PAGES}))
# Scratch directory holding the intermediate page images and PDFs.
TMP_DIR=$(mktemp -d)

# Verbosely delete the scratch directory. TMP_DIR is expanded (and quoted) when
# the function runs, so a path containing whitespace is removed as one operand.
cleanup_tmp_dir() {
  rm -vrf -- "${TMP_DIR}"
}
# Run the clean-up on every exit path of the script.
trap cleanup_tmp_dir EXIT
# Interactive scan phase.
# For each page (up to PAGES) the user is asked for an action, the page is
# scanned with scanimage into ${TMP_DIR}/page_NNN.tiff and, when UNPAPER is
# "1", unpaper is started in the background on that page so it runs while the
# next page is being scanned.
# Afterwards every unpaper job is awaited individually, so a failed job aborts
# the script instead of silently dropping its page. The process priority is
# then changed to NICE for the steps that follow.
echo "Scanning to: '${FILENAME}'"
PAGE=0
scanned_pages=0
unpaper_pids=()
while [[ "${PAGE}" -lt "${PAGES}" ]]; do
  PAGE=$((PAGE + 1))
  echo "Please, place page ${PAGE}/${PAGES} on the scanner"
  cat <<EOF
Press:
[enter] to scan,
[q] to stop scanning and start parsing,
[l] to scan one more page and then start parsing,
[m] to scan and increase the total number of pages (${PAGES} -> $((PAGES + 1)))
[ctrl+c] to abort
EOF
  # -r: take the typed input literally (no backslash processing).
  read -r ACTION
  [[ "${ACTION}" == "q" ]] && break

  echo "Start scanimage..."
  printf -v page_file '%s/page_%03d.tiff' "${TMP_DIR}" "${PAGE}"
  scanimage -p -d "${SCANNER}" \
    --format=tiff \
    --resolution "${SCAN_DPI}" \
    --output-file "${page_file}"
  scanned_pages=$((scanned_pages + 1))

  if [[ "${UNPAPER}" == "1" ]]; then
    # Tell unpaper the resolution the page was actually scanned at.
    unpaper -q --dpi "${SCAN_DPI}" "${page_file}" "${page_file%.tiff}-unpaper.pnm" &
    unpaper_pids+=("$!")
  fi

  [[ "${ACTION}" == "l" ]] && break
  if [[ "${ACTION}" == "m" ]]; then
    PAGES=$((PAGES + 1))
  fi
done

# Nothing to merge or OCR if the user quit before the first scan.
if [[ "${scanned_pages}" -eq 0 ]]; then
  echo "error: no pages were scanned" >&2
  exit 1
fi

echo "Waiting for unpaper to finish..."
# The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
# older bash versions.
for unpaper_pid in ${unpaper_pids[@]+"${unpaper_pids[@]}"}; do
  if ! wait "${unpaper_pid}"; then
    echo "error: unpaper failed on one of the pages" >&2
    exit 1
  fi
done

echo "Changing priority to '${NICE}' (higher is lower)"
renice -n "${NICE}" "$$"
# Combine the per-page images into one multi-page TIFF (combined.tiff), using
# the unpaper output when UNPAPER is "1" and the raw scans otherwise. That TIFF
# is then scaled to FINAL_RES and JPEG-compressed (quality 75) into
# combined.pdf.
echo "Merging pages..."
case "${UNPAPER}" in
  1) page_images=("${TMP_DIR}"/page_*-unpaper.pnm) ;;
  *) page_images=("${TMP_DIR}"/page_*.tiff) ;;
esac
merged_tiff="${TMP_DIR}/combined.tiff"
convert "${page_images[@]}" "${merged_tiff}"
convert "${merged_tiff}" \
  -scale "${FINAL_RES}" \
  -compress jpeg \
  -quality 75 \
  "${TMP_DIR}/combined.pdf"
# Run tesseract on the unscaled combined TIFF to produce a text-only PDF
# (ocr.pdf), then put that text layer underneath the downscaled image PDF with
# qpdf and write the result to FILENAME.
printf '%s\n' "Running OCR..."
ocr_base="${TMP_DIR}/ocr"
tesseract \
  -l "${OCR_LANG}" \
  -c textonly_pdf=1 \
  "${TMP_DIR}/combined.tiff" \
  "${ocr_base}" \
  pdf
qpdf "${TMP_DIR}/combined.pdf" \
  --underlay "${ocr_base}.pdf" -- \
  "${FILENAME}"
printf "Scan of '%s' is done\n" "${FILENAME}"
# Remove the scratch directory as soon as the final PDF has been written.
# Quoted so a path containing whitespace is passed as a single operand; `--`
# stops option parsing in case the path starts with '-'.
rm -rf -- "${TMP_DIR}"
# Open the finished PDF with the desktop's default viewer, but only when SHOW
# is exactly "1"; any other value leaves the exit status at 0.
case "${SHOW}" in
  1) xdg-open "${FILENAME}" ;;
esac
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment