Skip to content

Instantly share code, notes, and snippets.

@jovandeginste
Created January 5, 2021 11:20
Show Gist options
  • Save jovandeginste/c86f4c421d4e9b96dc6681d44d089860 to your computer and use it in GitHub Desktop.
Save jovandeginste/c86f4c421d4e9b96dc6681d44d089860 to your computer and use it in GitHub Desktop.
Scan and/or convert
#!/bin/bash
#
# Scan paper documents into TIFF pages and convert them into searchable PDFs.
# Every setting below is a default that can be overridden from the environment.

# Strict mode is enabled with `set` rather than on the shebang line so that it
# is also active when the script is started as `bash <script>`.
set -eu

# The defaults are quoted so that the expanded values are never word-split or
# glob-expanded by the `:` no-op command.
: "${SCANNER:=brother4:bus9;dev1}" # My USB scanner
: "${OCR_LANG:=nld}"               # Tesseract language
: "${SCAN_DPI:=300}"               # A good resolution for OCR
: "${FINAL_RES:=1240x}"            # A good resolution for PDF
: "${SHOW:=0}"                     # Show the result (NOTE(review): not referenced by the code below)
: "${NICE:=10}"                    # Priority of this process - this prevents Tesseract from hogging your machine (NOTE(review): not applied by the code below)
: "${UNPAPER:=1}"                  # Whether to run unpaper
: "${FILENAME:=AUTO}"              # If no FILENAME is set, or set to 'AUTO': make one up based on time
: "${VERBOSE:=0}"                  # Whether to print verbose information
: "${ROOT_DIR:=$PWD}"
: "${INPUT_DIR:=${ROOT_DIR}/input}"
: "${INTER_DIR:=${ROOT_DIR}/inter}"
: "${OUTPUT_DIR:=${ROOT_DIR}/output}"

# Colour escape sequences used in messages. tput fails when TERM is unset or
# unknown (cron, CI, ...); fall back to empty strings instead of aborting.
red=$(tput setaf 1 2>/dev/null || true)
green=$(tput setaf 2 2>/dev/null || true)
blue=$(tput setaf 4 2>/dev/null || true)
reset=$(tput sgr0 2>/dev/null || true)
# Print the command-line synopsis and the list of subcommands to stdout.
# Reads: INPUT_DIR, OUTPUT_DIR (shown in the subcommand descriptions).
function usage() {
  printf '%s\n' \
    "Usage: $0 subcommand" \
    "Subcommands:" \
    "- scan: scan documents into '${INPUT_DIR}' folders" \
    "- convert: convert previously scanned documents into '${OUTPUT_DIR}' folders"
}
# Echo all arguments, but only when VERBOSE is exactly "1"; otherwise print
# nothing and succeed.
function echo_verbose() {
  [[ "$VERBOSE" == "1" ]] || return 0
  echo "$@"
}
# Print the current local date and time as YYYY-MM-DD_HH-MM-SS, for use as a
# directory name. The time fields are joined with '-' instead of ':' because
# of Windows.
function timed_dir() {
  date "+%F_%H-%M-%S"
}
# Interactive scan loop.
#
# Repeatedly prompts on stdout and reads one action per line from stdin:
#   [enter] (or anything else)  scan the next page of the current document
#   n                           close the current document and start a new one
#   l                           scan one last page, then close the document
# Each document gets its own sub directory named by timed_dir; pages are
# numbered from 1 within a document.
#
# Returns 1 when stdin reaches end-of-file. Without that check, a closed stdin
# would leave ACTION empty forever and the loop would scan pages endlessly
# whenever `set -e` is not active. Like ctrl+c, EOF leaves the current
# document unfinished (finish_scan is not called for it).
function _scan() {
  # Locals keep the loop state out of the global namespace.
  local page=1
  local sub_dir action
  sub_dir=$(timed_dir)

  while true; do
    printf '%s\n' \
      "Press:" \
      "[enter] to scan," \
      "[n] for new document," \
      "[l] to scan the last page, then start new document," \
      "[ctrl+c] to abort" \
      "Going to scan page '${red}${page}${reset}' into '${green}${sub_dir}${reset}'"

    # -r: take the input literally, do not interpret backslashes.
    if ! read -r action; then
      echo "No more input - stopping." >&2
      return 1
    fi

    case "${action}" in
      n)
        finish_scan "${sub_dir}"
        sub_dir=$(timed_dir)
        page=1
        ;;
      l)
        scan_file "${sub_dir}" "${page}"
        finish_scan "${sub_dir}"
        sub_dir=$(timed_dir)
        page=1
        ;;
      *)
        scan_file "${sub_dir}" "${page}"
        page=$((page + 1))
        ;;
    esac
    echo
  done
}
# Mark a scanned document as complete.
#
# Arguments: $1 - name of the document's sub directory below INPUT_DIR
# Reads:     INPUT_DIR, SCAN_DPI, OCR_LANG, FINAL_RES, UNPAPER
#
# When the directory exists, the current settings are appended to its 'info'
# file as VAR='value' lines and an empty 'done' marker file is created. When
# it does not exist (no page was scanned), nothing is written. A separator is
# printed to stdout in both cases.
function finish_scan() {
  # Locals: the function no longer overwrites the caller's SCAN_SUB_DIR or a
  # global FULL_DIR.
  local sub_dir="$1"
  local full_dir="${INPUT_DIR}/${sub_dir}"

  if [[ -d "${full_dir}" ]]; then
    # printf re-applies the format to every name/value pair.
    printf "%s='%s'\n" \
      SCAN_DPI "${SCAN_DPI}" \
      OCR_LANG "${OCR_LANG}" \
      FINAL_RES "${FINAL_RES}" \
      UNPAPER "${UNPAPER}" >>"${full_dir}/info"
    touch -- "${full_dir}/done"
  fi

  echo
  echo "==================================================================================="
}
# Scan a single page into a document directory.
#
# Arguments: $1 - name of the document's sub directory below INPUT_DIR
#            $2 - page number (written zero-padded to three digits)
# Reads:     INPUT_DIR, SCANNER, SCAN_DPI, red/green/reset
# Output:    progress messages on stdout, errors on stderr
# Returns:   0 on success, 1 when scanimage fails
#
# The directory and an 'info' file are created if needed; the page is written
# to <INPUT_DIR>/<sub dir>/page_NNN.tiff.
function scan_file() {
  local sub_dir="$1"
  local page="$2"
  local full_dir="${INPUT_DIR}/${sub_dir}"
  local file
  printf -v file '%s/page_%03d.tiff' "${full_dir}" "${page}"

  echo "Now scanning page '${red}${page}${reset}' into '${green}${full_dir}${reset}'"
  mkdir -p -- "${full_dir}"
  touch -- "${full_dir}/info"

  # Check the scanner explicitly instead of relying on `set -e`: a failed scan
  # must not be reported as finished, and a partially written page must not
  # end up in the document.
  if ! scanimage -p -d "${SCANNER}" \
    --format=tiff \
    --resolution "${SCAN_DPI}" \
    --output-file "${file}"; then
    echo "Scanning page '${page}' into '${file}' failed" >&2
    rm -f -- "${file}"
    return 1
  fi
  echo "Finished."
}
# Convert every finished document below INPUT_DIR.
#
# A document is a directory directly below INPUT_DIR that contains a 'done'
# marker file. Each one is handed to convert_dir; afterwards its scanned pages
# are removed, but only when the expected PDF
# (<OUTPUT_DIR>/<directory name>.pdf) actually exists, so a conversion that
# produced nothing never costs the original scans.
#
# Reads: INPUT_DIR, OUTPUT_DIR
function _convert() {
  # The loop variable keeps the name `entry`: being a local of this function
  # it remains visible to the functions called from here (dynamic scoping).
  local entry pdf

  mkdir -p -- "${OUTPUT_DIR}"
  # The directory part is quoted so that paths containing spaces or glob
  # characters are not word-split; only the trailing * is a pattern.
  for entry in "${INPUT_DIR}"/*; do
    [[ -d "${entry}" ]] || continue
    [[ -f "${entry}/done" ]] || continue

    # Called as a plain statement so that `set -e`, when active, still aborts
    # on a failing command inside convert_dir.
    convert_dir "${entry}"

    pdf="${OUTPUT_DIR}/${entry##*/}.pdf"
    if [[ -f "${pdf}" ]]; then
      # :? guards against ever running rm -rf on an empty path.
      rm -rf -- "${entry:?}"
    else
      echo "No PDF was produced for '${entry}' - keeping the scanned pages" >&2
    fi
  done
}
# Convert one scanned document directory into a searchable PDF.
#
# Arguments: $1 - document directory (normally <INPUT_DIR>/<name>)
# Reads:     INPUT_DIR, INTER_DIR, OUTPUT_DIR, OCR_LANG, FINAL_RES, SCAN_DPI
# Output:    <OUTPUT_DIR>/<name>.pdf; progress messages on stdout
#
# Steps: optionally clean the pages with unpaper (pre_process), combine them
# into one TIFF, render a rescaled JPEG-compressed image PDF, let tesseract
# produce a text-only PDF, and merge the two with qpdf. Intermediate files
# live in <INTER_DIR>/<name> and are removed at the end. Nothing is done when
# the output PDF already exists.
function convert_dir() {
  local inputdir="$1"
  # The prefix is quoted so INPUT_DIR is matched literally, not as a pattern.
  local subdir="${inputdir#"${INPUT_DIR}"/}"
  local interdir="${INTER_DIR}/${subdir}"
  local output_pdf="${OUTPUT_DIR}/${subdir}.pdf"
  local combined_tiff="${interdir}/combined.tiff"

  # Per-document settings. They are locals so that values loaded from one
  # document's 'info' file cannot leak into the next document or overwrite
  # the global defaults. Functions called from here still see them.
  local UNPAPER=1
  local SCAN_DPI="${SCAN_DPI-}"
  local OCR_LANG="${OCR_LANG}"
  local FINAL_RES="${FINAL_RES}"

  if [[ -f "${output_pdf}" ]]; then
    echo_verbose "Already processed '${red}${inputdir}${reset}' - ${blue}skipping${reset}!"
    return
  fi

  echo "Converting: '${red}${inputdir}${reset}' into '${green}${interdir}${reset}'"

  # Load the settings recorded at scan time from the directory being
  # converted (the argument), not from a variable of the caller.
  # NOTE: the file is executed as shell code, so it must be trusted.
  if [[ -f "${inputdir}/info" ]]; then
    # shellcheck source=/dev/null
    source "${inputdir}/info"
  fi

  mkdir -p -- "${interdir}"

  if [[ "${UNPAPER}" == "1" ]]; then
    pre_process "${inputdir}" "${interdir}"
    convert "${interdir}"/page_*-unpaper.pnm "${combined_tiff}"
  else
    convert "${inputdir}"/page_*.tiff "${combined_tiff}"
  fi

  echo_verbose "Rescaling output PDF..."
  convert "${combined_tiff}" -scale "${FINAL_RES}" -compress jpeg -quality 75 "${interdir}/combined.pdf"

  echo_verbose "Running OCR..."
  # textonly_pdf=1: the OCR PDF carries only the text layer; the page images
  # come from combined.pdf and are merged in by qpdf below.
  tesseract -l "${OCR_LANG}" -c textonly_pdf=1 "${combined_tiff}" "${interdir}/ocr" pdf
  qpdf "${interdir}/combined.pdf" --underlay "${interdir}/ocr.pdf" -- "${output_pdf}"

  echo "Conversion of '${output_pdf}' is done"
  rm -rf -- "${interdir:?}"
}
# Clean up all pages of a document with unpaper, running the jobs in parallel.
#
# Arguments: $1 - directory containing the scanned page_*.tiff files
#            $2 - directory that receives the page_*-unpaper.pnm results
# Returns:   1 when $1 contains no page_*.tiff file, otherwise the status of
#            the `parallel` pipeline
#
# pre_process_page prints one shell command line per page; those lines are
# piped into GNU parallel, which executes them.
function pre_process() {
  local input_dir="$1"
  local output_dir="$2"
  local -a pages=()
  local page name

  # Expand the glob directly (quoted directory, unquoted pattern) so that
  # paths with spaces survive, and skip the literal pattern that is left
  # behind when nothing matches.
  for page in "${input_dir}"/page_*.tiff; do
    [[ -e "${page}" ]] || continue
    pages+=("${page}")
  done

  if ((${#pages[@]} == 0)); then
    echo "No page_*.tiff files found in '${input_dir}'" >&2
    return 1
  fi

  echo "Pre-processing ${red}${#pages[@]}${reset} page(s) in parallel..."
  # The loop runs in the pipeline's subshell and only prints command lines;
  # the actual work is scheduled by parallel.
  for page in "${pages[@]}"; do
    name="${page##*/}"
    pre_process_page "${page}" "${output_dir}/${name%.tiff}-unpaper.pnm"
  done | parallel --bar -u
}
# Print (not run) the unpaper command line that cleans up one page.
#
# Arguments: $1 - input page file
#            $2 - output file
# Reads:     SCAN_DPI - passed to unpaper as --dpi (300 when unset or empty)
# Output:    one shell command line on stdout, meant to be executed by a shell
#
# Each value is wrapped in single quotes, and an embedded single quote is
# written as '\'' so that the printed line stays one valid command even for
# paths that contain quotes or spaces.
function pre_process_page() {
  local input="$1"
  local output="$2"
  local cmd="unpaper --overwrite -q --dpi"
  local arg quoted

  for arg in "${SCAN_DPI:-300}" "${input}" "${output}"; do
    # Replace every ' by '\'' (close quote, escaped quote, reopen quote).
    quoted=${arg//\'/\'\\\'\'}
    cmd+=" '${quoted}'"
  done

  # unpaper's stderr is discarded, as before.
  printf '%s 2>/dev/null\n' "${cmd}"
}
# Verify that the external commands needed by a subcommand are available.
#
# Arguments: $1 - subcommand name ('scan' or 'convert'; any other value has
#                 no dependencies)
# Output:    one "Missing command: '<name>'" line per missing command (stderr)
# Exits:     the script with status 1 when at least one command is missing
function test_deps() {
  local -a deps=() missing=()
  local dep

  case "${1-}" in
    scan)
      deps=(scanimage)
      ;;
    convert)
      deps=(unpaper convert tesseract qpdf parallel)
      ;;
  esac

  # Collect every missing command first so the user sees the full list in one
  # run. The missing command itself is never executed.
  for dep in ${deps[@]+"${deps[@]}"}; do
    command -v "${dep}" &>/dev/null || missing+=("${dep}")
  done

  if ((${#missing[@]} > 0)); then
    for dep in "${missing[@]}"; do
      echo "Missing command: '${dep}'" >&2
    done
    exit 1
  fi
}
# Entry point: check the dependencies of the requested subcommand, then
# dispatch to it.
#   scan      -> _scan
#   convert   -> _convert
#   h | help  -> print usage, exit 0
#   otherwise -> print usage, exit 1
function _run() {
  COMMAND="$1"
  test_deps "${COMMAND}"

  if [[ "${COMMAND}" == "scan" ]]; then
    _scan
  elif [[ "${COMMAND}" == "convert" ]]; then
    _convert
  elif [[ "${COMMAND}" == "h" || "${COMMAND}" == "help" ]]; then
    usage && exit 0
  else
    usage && exit 1
  fi
}
# Run the subcommand given as the first argument ('help' when there is none).
# Quoted so the argument reaches _run as exactly one word, without word
# splitting or glob expansion.
_run "${1:-help}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment