jovandeginste · January 5, 2021 11:20
diff --git a/scan b/scan
 #!/bin/bash -eu
 # Scan paper documents and convert them into searchable PDFs.
 #
 # Options given on the shebang line are lost when the script is started as
 # `bash scan ...`, so strict mode is also switched on explicitly here.
 set -eu

 # User-tunable settings. Each one keeps a value already present in the
 # environment and only falls back to the default shown here.
 : "${SCANNER:=brother4:bus9;dev1}" # My USB scanner
 : "${OCR_LANG:=nld}"               # Tesseract language
 : "${SCAN_DPI:=300}"               # A good resolution for OCR
 : "${FINAL_RES:=1240x}"            # A good resolution for PDF
 : "${SHOW:=0}"                     # Show the result
 : "${NICE:=10}"                    # Priority of this process - this prevents Tesseract from hogging your machine
 : "${UNPAPER:=1}"                  # Whether to run unpaper
 : "${FILENAME:=AUTO}"              # If no FILENAME is set, or set to 'AUTO': make one up based on time
 : "${VERBOSE:=0}"                  # Whether to print verbose information

 # Working directories: raw scans, intermediate files, finished PDFs.
 : "${ROOT_DIR:=$PWD}"
 : "${INPUT_DIR:=${ROOT_DIR}/input}"
 : "${INTER_DIR:=${ROOT_DIR}/inter}"
 : "${OUTPUT_DIR:=${ROOT_DIR}/output}"

 # Terminal colours. tput fails when there is no usable terminal (cron, pipes
 # with TERM unset); fall back to empty strings instead of aborting.
 red=$(tput setaf 1 2>/dev/null || true)
 green=$(tput setaf 2 2>/dev/null || true)
 blue=$(tput setaf 4 2>/dev/null || true)
 reset=$(tput sgr0 2>/dev/null || true)

 # Print the command-line help on stdout.
 # Globals: INPUT_DIR, OUTPUT_DIR (read).
 # The text is emitted with printf rather than a here-document, so it does not
 # depend on a terminator line that must start in column 0.
 function usage() {
    printf '%s\n' \
        "Usage: $0 subcommand" \
        "" \
        "Subcommands:" \
        "" \
        "- scan: scan documents into '${INPUT_DIR}' folders" \
        "- convert: convert previously scanned documents into '${OUTPUT_DIR}' folders" \
        ""
 }

 # Echo all arguments, but only when VERBOSE is exactly "1".
 # In every other case nothing is printed and the status is 0.
 function echo_verbose() {
    case "$VERBOSE" in
    1) echo "$@" ;;
    esac
 }

 # Print the current date and time as YYYY-MM-DD_HH-MM-SS, for use as a
 # directory name.
 function timed_dir() {
    local stamp
    stamp=$(date "+%F_%T")
    # Don't use ':' because of Windows
    echo "${stamp//:/-}"
 }

 # Interactive scan loop: prompts before every page and groups pages into
 # documents, one timestamped sub-directory per document.
 #   [enter] scans the next page of the current document
 #   n       closes the current document and starts a new one
 #   l       scans one more page, then closes the document
 # The loop ends when stdin reaches end-of-file (or on ctrl+c).
 # Calls: timed_dir, scan_file, finish_scan.
 function _scan() {
    local ACTION
    PAGE=1
    SCAN_SUB_DIR=$(timed_dir)

    while true; do
        printf '%s\n' \
            "" \
            "Press:" \
            " [enter] to scan," \
            " [n] for new document," \
            " [l] to scan the last page, then start new document," \
            " [ctrl+c] to abort" \
            "" \
            "Going to scan page '${red}${PAGE}${reset}' into '${green}${SCAN_SUB_DIR}${reset}'"

        # -r keeps backslashes literal; leave the loop once input is exhausted
        # instead of failing (or spinning forever when errexit is off).
        read -r ACTION || break

        case "${ACTION}" in
        n)
            finish_scan "${SCAN_SUB_DIR}"

            # Reset to 0: the increment below makes the next page number 1.
            PAGE=0
            SCAN_SUB_DIR=$(timed_dir)
            ;;
        l)
            scan_file "${SCAN_SUB_DIR}" "${PAGE}"
            finish_scan "${SCAN_SUB_DIR}"

            PAGE=0
            SCAN_SUB_DIR=$(timed_dir)
            ;;
        *)
            scan_file "${SCAN_SUB_DIR}" "${PAGE}"
            ;;
        esac

        PAGE=$((PAGE + 1))
        echo
    done
 }

 # Close a scanned document: record the settings used and mark it as complete.
 # Arguments: $1 - document sub-directory name below INPUT_DIR
 # Globals:   INPUT_DIR, SCAN_DPI, OCR_LANG, FINAL_RES, UNPAPER (read)
 # Does nothing except print the separator when the directory does not exist
 # (no page was scanned for this document).
 function finish_scan() {
    local sub_dir="$1"
    local full_dir="${INPUT_DIR}/${sub_dir}"

    if [[ -d "${full_dir}" ]]; then
        # The info file is sourced again during conversion, so the values are
        # written shell-quoted (%q); a value containing a quote stays intact.
        {
            printf 'SCAN_DPI=%q\n' "${SCAN_DPI}"
            printf 'OCR_LANG=%q\n' "${OCR_LANG}"
            printf 'FINAL_RES=%q\n' "${FINAL_RES}"
            printf 'UNPAPER=%q\n' "${UNPAPER}"
        } >>"${full_dir}/info"
        # Marker file: only directories containing 'done' are converted.
        touch "${full_dir}/done"
    fi

    echo
    echo "==================================================================================="
 }

 # Scan a single page into the given document directory.
 # Arguments: $1 - document sub-directory name below INPUT_DIR
 #            $2 - page number (decimal), used for the zero-padded file name
 # Globals:   INPUT_DIR, SCANNER, SCAN_DPI, red, green, reset (read)
 # Returns:   non-zero when scanimage fails
 function scan_file() {
    local sub_dir="$1"
    local page="$2"
    local full_dir="${INPUT_DIR}/${sub_dir}"
    local file
    printf -v file '%s/page_%03d.tiff' "${full_dir}" "${page}"

    echo "Now scanning page '${red}${page}${reset}' into '${green}${full_dir}${reset}'"
    mkdir -p "${full_dir}"
    touch "${full_dir}/info"

    # Check the scanner explicitly so a failed scan is never reported as
    # finished, even when errexit is not active.
    if ! scanimage -p -d "${SCANNER}" \
        --format=tiff \
        --resolution "${SCAN_DPI}" \
        --output-file "${file}"; then
        echo "Scanning page '${page}' failed" >&2
        return 1
    fi

    echo "Finished."
 }

 # Convert every finished document below INPUT_DIR into a PDF in OUTPUT_DIR.
 # A document counts as finished when its directory contains a 'done' file.
 # Globals: INPUT_DIR, OUTPUT_DIR (read)
 # Calls:   convert_dir
 function _convert() {
    # Kept under the name 'entry': the loop variable is visible to convert_dir
    # through dynamic scoping.
    local entry

    mkdir -p "${OUTPUT_DIR}"

    # Quoted directory + glob: paths containing spaces stay in one piece.
    for entry in "${INPUT_DIR}"/*; do
        [[ -d "${entry}" && -f "${entry}/done" ]] || continue

        convert_dir "${entry}"

        # Delete the raw scans only once the resulting PDF really exists, so a
        # failed conversion never destroys the input.
        if [[ -f "${OUTPUT_DIR}/${entry##*/}.pdf" ]]; then
            rm -rf -- "${entry}"
        else
            echo "No PDF was produced for '${entry}' - keeping the scanned pages" >&2
        fi
    done
 }

 # Turn one scanned document directory into a PDF with an OCR text layer.
 # Arguments: $1 - document directory (a path below INPUT_DIR)
 # Globals:   INPUT_DIR, INTER_DIR, OUTPUT_DIR, FINAL_RES, OCR_LANG (read);
 #            UNPAPER, FILENAME (written); the sourced info file may overwrite
 #            SCAN_DPI, OCR_LANG, FINAL_RES and UNPAPER.
 # Returns:   0 on success or when the PDF already exists, 1 when a step fails
 #            (the intermediate directory is then left in place).
 function convert_dir() {
    local inputdir="$1"
    # Quote the prefix so glob characters in INPUT_DIR are matched literally.
    local subdir=${inputdir#"${INPUT_DIR}/"}
    local interdir="${INTER_DIR}/${subdir}"
    local combined_tiff="${interdir}/combined.tiff"

    UNPAPER=1
    FILENAME="${OUTPUT_DIR}/${subdir}.pdf"

    if [[ -f "${FILENAME}" ]]; then
        echo_verbose "Already processed '${red}${inputdir}${reset}' - ${blue}skipping${reset}!"
        return 0
    fi
    echo "Converting: '${red}${inputdir}${reset}' into '${green}${interdir}${reset}'"

    # Load the settings recorded at scan time from the directory being
    # converted (the function argument, not a variable of the caller).
    # NOTE(review): sourcing executes whatever the info file contains.
    if [[ -f "${inputdir}/info" ]]; then
        source "${inputdir}/info"
    fi

    mkdir -p "${interdir}"

    if [[ "${UNPAPER}" == "1" ]]; then
        pre_process "${inputdir}" "${interdir}" || return 1
        convert "${interdir}"/page_*-unpaper.pnm "${combined_tiff}" || return 1
    else
        convert "${inputdir}"/page_*.tiff "${combined_tiff}" || return 1
    fi

    echo_verbose "Rescaling output PDF..."
    convert "${combined_tiff}" -scale "${FINAL_RES}" -compress jpeg -quality 75 "${interdir}/combined.pdf" || return 1

    echo_verbose "Running OCR..."
    tesseract -l "${OCR_LANG}" -c textonly_pdf=1 "${combined_tiff}" "${interdir}/ocr" pdf || return 1

    # Combine the rescaled image PDF with the text-only OCR PDF.
    qpdf "${interdir}/combined.pdf" --underlay "${interdir}/ocr.pdf" -- "${FILENAME}" || return 1

    echo "Conversion of '${FILENAME}' is done"

    rm -rf "${interdir}"
 }

 # Generate one clean-up command per scanned page and hand them to parallel.
 # Arguments: $1 - directory holding page_*.tiff
 #            $2 - directory that receives the page_*-unpaper.pnm files
 # Calls:     pre_process_page (prints one command line per page), parallel
 function pre_process() {
    local input="$1"
    local output="$2"
    local -a pages=()
    local page name

    # Collect pages with a real glob so paths containing spaces survive;
    # an unmatched pattern is skipped instead of being treated as a file.
    for page in "${input}"/page_*.tiff; do
        if [[ -e "${page}" ]]; then
            pages+=("${page}")
        fi
    done

    echo "Pre-processing ${red}${#pages[@]}${reset} page(s) in parallel..."

    if ((${#pages[@]} == 0)); then
        return 0
    fi

    # The loop only prints command lines; parallel is what runs them, so the
    # generator itself does not need to be backgrounded.
    for page in "${pages[@]}"; do
        name="${page##*/}"
        pre_process_page "${page}" "${output}/${name%.tiff}-unpaper.pnm"
    done | parallel --bar -u
 }
 # Print (not run) the unpaper command line for one page; the output is meant
 # to be executed by a shell.
 # Arguments: $1 - input image, $2 - output image
 # Globals:   SCAN_DPI (read) - resolution passed to unpaper, 300.0 if unset
 function pre_process_page() {
    local input="$1"
    local output="$2"

    # %q shell-quotes each value, so paths containing spaces or quotes still
    # form a valid command line.
    printf 'unpaper --overwrite -q --dpi %q %q %q 2>/dev/null\n' \
        "${SCAN_DPI:-300.0}" "${input}" "${output}"
 }

 # Verify that the external tools needed by a subcommand are installed.
 # Arguments: $1 - subcommand name; anything other than 'scan' or 'convert'
 #                 has no requirements
 # Outputs:   one line on stderr per missing tool
 # Exits the script with status 1 when at least one tool is missing.
 function test_deps() {
    local deps="" dep missing=0

    case "$1" in
    scan)
        deps="scanimage"
        ;;
    convert)
        deps="unpaper convert tesseract qpdf parallel"
        ;;
    esac

    # ${deps} is deliberately unquoted: it is a space-separated word list.
    for dep in ${deps}; do
        if ! command -v "${dep}" &>/dev/null; then
            echo "Missing command: '${dep}'" >&2
            missing=1
        fi
    done

    # Report every missing tool before giving up, not just the first one.
    if ((missing)); then
        exit 1
    fi
 }

 # Dispatch a subcommand after checking its dependencies.
 # Arguments: $1 - 'scan', 'convert', 'h' or 'help'; anything else prints the
 #                 usage text and exits with status 1
 function _run() {
    COMMAND="$1"

    test_deps "${COMMAND}"

    if [[ "${COMMAND}" == "scan" ]]; then
        _scan
    elif [[ "${COMMAND}" == "convert" ]]; then
        _convert
    elif [[ "${COMMAND}" == "h" || "${COMMAND}" == "help" ]]; then
        usage && exit 0
    else
        usage && exit 1
    fi
 }

 # Entry point: run the requested subcommand, defaulting to 'help'.
 # Quoted so an argument containing spaces or glob characters stays one word.
 _run "${1:-help}"
	#!/bin/bash -eu
	# Scan paper documents and convert them into searchable PDFs.
	#
	# Options given on the shebang line are lost when the script is started as
	# `bash scan ...`, so strict mode is also switched on explicitly here.
	set -eu

	# User-tunable settings. Each one keeps a value already present in the
	# environment and only falls back to the default shown here.
	: "${SCANNER:=brother4:bus9;dev1}" # My USB scanner
	: "${OCR_LANG:=nld}" # Tesseract language
	: "${SCAN_DPI:=300}" # A good resolution for OCR
	: "${FINAL_RES:=1240x}" # A good resolution for PDF
	: "${SHOW:=0}" # Show the result
	: "${NICE:=10}" # Priority of this process - this prevents Tesseract from hogging your machine
	: "${UNPAPER:=1}" # Whether to run unpaper
	: "${FILENAME:=AUTO}" # If no FILENAME is set, or set to 'AUTO': make one up based on time
	: "${VERBOSE:=0}" # Whether to print verbose information

	# Working directories: raw scans, intermediate files, finished PDFs.
	: "${ROOT_DIR:=$PWD}"
	: "${INPUT_DIR:=${ROOT_DIR}/input}"
	: "${INTER_DIR:=${ROOT_DIR}/inter}"
	: "${OUTPUT_DIR:=${ROOT_DIR}/output}"

	# Terminal colours. tput fails when there is no usable terminal (cron, pipes
	# with TERM unset); fall back to empty strings instead of aborting.
	red=$(tput setaf 1 2>/dev/null || true)
	green=$(tput setaf 2 2>/dev/null || true)
	blue=$(tput setaf 4 2>/dev/null || true)
	reset=$(tput sgr0 2>/dev/null || true)

	# Print the command-line help on stdout.
	# Globals: INPUT_DIR, OUTPUT_DIR (read).
	# The text is emitted with printf rather than a here-document, so it does not
	# depend on a terminator line that must start in column 0.
	function usage() {
	    printf '%s\n' \
	        "Usage: $0 subcommand" \
	        "" \
	        "Subcommands:" \
	        "" \
	        "- scan: scan documents into '${INPUT_DIR}' folders" \
	        "- convert: convert previously scanned documents into '${OUTPUT_DIR}' folders" \
	        ""
	}

	# Echo all arguments, but only when VERBOSE is exactly "1".
	# In every other case nothing is printed and the status is 0.
	function echo_verbose() {
	    case "$VERBOSE" in
	    1) echo "$@" ;;
	    esac
	}

	# Print the current date and time as YYYY-MM-DD_HH-MM-SS, for use as a
	# directory name.
	function timed_dir() {
	    # A real pipe is required here: an escaped '\|' would be handed to date
	    # as a literal argument instead of connecting date to sed.
	    date "+%F_%T" | sed 's/:/-/g' # Don't use ':' because of Windows
	}

	# Interactive scan loop: prompts before every page and groups pages into
	# documents, one timestamped sub-directory per document.
	#   [enter] scans the next page of the current document
	#   n       closes the current document and starts a new one
	#   l       scans one more page, then closes the document
	# The loop ends when stdin reaches end-of-file (or on ctrl+c).
	# Calls: timed_dir, scan_file, finish_scan.
	function _scan() {
	    local ACTION
	    PAGE=1
	    SCAN_SUB_DIR=$(timed_dir)

	    while true; do
	        printf '%s\n' \
	            "" \
	            "Press:" \
	            " [enter] to scan," \
	            " [n] for new document," \
	            " [l] to scan the last page, then start new document," \
	            " [ctrl+c] to abort" \
	            "" \
	            "Going to scan page '${red}${PAGE}${reset}' into '${green}${SCAN_SUB_DIR}${reset}'"

	        # -r keeps backslashes literal; leave the loop once input is exhausted
	        # instead of failing (or spinning forever when errexit is off).
	        read -r ACTION || break

	        case "${ACTION}" in
	        n)
	            finish_scan "${SCAN_SUB_DIR}"

	            # Reset to 0: the increment below makes the next page number 1.
	            PAGE=0
	            SCAN_SUB_DIR=$(timed_dir)
	            ;;
	        l)
	            scan_file "${SCAN_SUB_DIR}" "${PAGE}"
	            finish_scan "${SCAN_SUB_DIR}"

	            PAGE=0
	            SCAN_SUB_DIR=$(timed_dir)
	            ;;
	        *)
	            scan_file "${SCAN_SUB_DIR}" "${PAGE}"
	            ;;
	        esac

	        PAGE=$((PAGE + 1))
	        echo
	    done
	}

	# Close a scanned document: record the settings used and mark it as complete.
	# Arguments: $1 - document sub-directory name below INPUT_DIR
	# Globals:   INPUT_DIR, SCAN_DPI, OCR_LANG, FINAL_RES, UNPAPER (read)
	# Does nothing except print the separator when the directory does not exist
	# (no page was scanned for this document).
	function finish_scan() {
	    local sub_dir="$1"
	    local full_dir="${INPUT_DIR}/${sub_dir}"

	    if [[ -d "${full_dir}" ]]; then
	        # The info file is sourced again during conversion, so the values are
	        # written shell-quoted (%q); a value containing a quote stays intact.
	        {
	            printf 'SCAN_DPI=%q\n' "${SCAN_DPI}"
	            printf 'OCR_LANG=%q\n' "${OCR_LANG}"
	            printf 'FINAL_RES=%q\n' "${FINAL_RES}"
	            printf 'UNPAPER=%q\n' "${UNPAPER}"
	        } >>"${full_dir}/info"
	        # Marker file: only directories containing 'done' are converted.
	        touch "${full_dir}/done"
	    fi

	    echo
	    echo "==================================================================================="
	}

	# Scan a single page into the given document directory.
	# Arguments: $1 - document sub-directory name below INPUT_DIR
	#            $2 - page number (decimal), used for the zero-padded file name
	# Globals:   INPUT_DIR, SCANNER, SCAN_DPI, red, green, reset (read)
	# Returns:   non-zero when scanimage fails
	function scan_file() {
	    local sub_dir="$1"
	    local page="$2"
	    local full_dir="${INPUT_DIR}/${sub_dir}"
	    local file
	    printf -v file '%s/page_%03d.tiff' "${full_dir}" "${page}"

	    echo "Now scanning page '${red}${page}${reset}' into '${green}${full_dir}${reset}'"
	    mkdir -p "${full_dir}"
	    touch "${full_dir}/info"

	    # Check the scanner explicitly so a failed scan is never reported as
	    # finished, even when errexit is not active.
	    if ! scanimage -p -d "${SCANNER}" \
	        --format=tiff \
	        --resolution "${SCAN_DPI}" \
	        --output-file "${file}"; then
	        echo "Scanning page '${page}' failed" >&2
	        return 1
	    fi

	    echo "Finished."
	}

	# Convert every finished document below INPUT_DIR into a PDF in OUTPUT_DIR.
	# A document counts as finished when its directory contains a 'done' file.
	# Globals: INPUT_DIR, OUTPUT_DIR (read)
	# Calls:   convert_dir
	function _convert() {
	    # Kept under the name 'entry': the loop variable is visible to convert_dir
	    # through dynamic scoping.
	    local entry

	    mkdir -p "${OUTPUT_DIR}"

	    # Quoted directory + glob: paths containing spaces stay in one piece.
	    for entry in "${INPUT_DIR}"/*; do
	        [[ -d "${entry}" && -f "${entry}/done" ]] || continue

	        convert_dir "${entry}"

	        # Delete the raw scans only once the resulting PDF really exists, so a
	        # failed conversion never destroys the input.
	        if [[ -f "${OUTPUT_DIR}/${entry##*/}.pdf" ]]; then
	            rm -rf -- "${entry}"
	        else
	            echo "No PDF was produced for '${entry}' - keeping the scanned pages" >&2
	        fi
	    done
	}

	# Turn one scanned document directory into a PDF with an OCR text layer.
	# Arguments: $1 - document directory (a path below INPUT_DIR)
	# Globals:   INPUT_DIR, INTER_DIR, OUTPUT_DIR, FINAL_RES, OCR_LANG (read);
	#            UNPAPER, FILENAME (written); the sourced info file may overwrite
	#            SCAN_DPI, OCR_LANG, FINAL_RES and UNPAPER.
	# Returns:   0 on success or when the PDF already exists, 1 when a step fails
	#            (the intermediate directory is then left in place).
	function convert_dir() {
	    local inputdir="$1"
	    # Quote the prefix so glob characters in INPUT_DIR are matched literally.
	    local subdir=${inputdir#"${INPUT_DIR}/"}
	    local interdir="${INTER_DIR}/${subdir}"
	    local combined_tiff="${interdir}/combined.tiff"

	    UNPAPER=1
	    FILENAME="${OUTPUT_DIR}/${subdir}.pdf"

	    if [[ -f "${FILENAME}" ]]; then
	        echo_verbose "Already processed '${red}${inputdir}${reset}' - ${blue}skipping${reset}!"
	        return 0
	    fi
	    echo "Converting: '${red}${inputdir}${reset}' into '${green}${interdir}${reset}'"

	    # Load the settings recorded at scan time from the directory being
	    # converted (the function argument, not a variable of the caller).
	    # NOTE(review): sourcing executes whatever the info file contains.
	    if [[ -f "${inputdir}/info" ]]; then
	        source "${inputdir}/info"
	    fi

	    mkdir -p "${interdir}"

	    if [[ "${UNPAPER}" == "1" ]]; then
	        pre_process "${inputdir}" "${interdir}" || return 1
	        convert "${interdir}"/page_*-unpaper.pnm "${combined_tiff}" || return 1
	    else
	        convert "${inputdir}"/page_*.tiff "${combined_tiff}" || return 1
	    fi

	    echo_verbose "Rescaling output PDF..."
	    convert "${combined_tiff}" -scale "${FINAL_RES}" -compress jpeg -quality 75 "${interdir}/combined.pdf" || return 1

	    echo_verbose "Running OCR..."
	    tesseract -l "${OCR_LANG}" -c textonly_pdf=1 "${combined_tiff}" "${interdir}/ocr" pdf || return 1

	    # Combine the rescaled image PDF with the text-only OCR PDF.
	    qpdf "${interdir}/combined.pdf" --underlay "${interdir}/ocr.pdf" -- "${FILENAME}" || return 1

	    echo "Conversion of '${FILENAME}' is done"

	    rm -rf "${interdir}"
	}

	# Generate one clean-up command per scanned page and hand them to parallel.
	# Arguments: $1 - directory holding page_*.tiff
	#            $2 - directory that receives the page_*-unpaper.pnm files
	# Calls:     pre_process_page (prints one command line per page), parallel
	function pre_process() {
	    local input="$1"
	    local output="$2"
	    local -a pages=()
	    local page name

	    # Collect pages with a real glob so paths containing spaces survive;
	    # an unmatched pattern is skipped instead of being treated as a file.
	    for page in "${input}"/page_*.tiff; do
	        if [[ -e "${page}" ]]; then
	            pages+=("${page}")
	        fi
	    done

	    echo "Pre-processing ${red}${#pages[@]}${reset} page(s) in parallel..."

	    if ((${#pages[@]} == 0)); then
	        return 0
	    fi

	    # The loop only prints command lines; parallel is what runs them. The
	    # pipe must be a real '|' (an escaped '\|' is a syntax error after done).
	    for page in "${pages[@]}"; do
	        name="${page##*/}"
	        pre_process_page "${page}" "${output}/${name%.tiff}-unpaper.pnm"
	    done | parallel --bar -u
	}
	# Print (not run) the unpaper command line for one page; the output is meant
	# to be executed by a shell.
	# Arguments: $1 - input image, $2 - output image
	# Globals:   SCAN_DPI (read) - resolution passed to unpaper, 300.0 if unset
	function pre_process_page() {
	    local input="$1"
	    local output="$2"

	    # %q shell-quotes each value, so paths containing spaces or quotes still
	    # form a valid command line.
	    printf 'unpaper --overwrite -q --dpi %q %q %q 2>/dev/null\n' \
	        "${SCAN_DPI:-300.0}" "${input}" "${output}"
	}

	# Verify that the external tools needed by a subcommand are installed.
	# Arguments: $1 - subcommand name; anything other than 'scan' or 'convert'
	#                 has no requirements
	# Outputs:   one line on stderr per missing tool
	# Exits the script with status 1 when at least one tool is missing.
	function test_deps() {
	    local deps="" dep missing=0

	    case "$1" in
	    scan)
	        deps="scanimage"
	        ;;
	    convert)
	        deps="unpaper convert tesseract qpdf parallel"
	        ;;
	    esac

	    # ${deps} is deliberately unquoted: it is a space-separated word list.
	    for dep in ${deps}; do
	        if ! command -v "${dep}" &>/dev/null; then
	            echo "Missing command: '${dep}'" >&2
	            missing=1
	        fi
	    done

	    # Report every missing tool before giving up, not just the first one.
	    if ((missing)); then
	        exit 1
	    fi
	}

	# Dispatch a subcommand after checking its dependencies.
	# Arguments: $1 - 'scan', 'convert', 'h' or 'help'; anything else prints the
	#                 usage text and exits with status 1
	function _run() {
	    COMMAND="$1"

	    test_deps "${COMMAND}"

	    case "${COMMAND}" in
	    scan) _scan ;;
	    convert) _convert ;;

	    # Alternatives in a case pattern are separated by a plain '|'; an
	    # escaped '\|' is not valid pattern syntax.
	    h | help) usage && exit 0 ;;
	    *) usage && exit 1 ;;
	    esac
	}

	# Entry point: run the requested subcommand, defaulting to 'help'.
	# Quoted so an argument containing spaces or glob characters stays one word.
	_run "${1:-help}"