-
-
Save jovandeginste/c86f4c421d4e9b96dc6681d44d089860 to your computer and use it in GitHub Desktop.
Scan and/or convert
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Scan and/or convert: scan paper documents with `scanimage` and turn the
# scanned pages into searchable (OCR'd) PDF files.
#
# Every setting below can be overridden from the environment, e.g.:
#   SCAN_DPI=600 OCR_LANG=eng ./script scan

# Strict mode is enabled here rather than in the shebang so that it also
# applies when the script is started as `bash script.sh`.
set -eu

: "${SCANNER:=brother4:bus9;dev1}" # My USB scanner
: "${OCR_LANG:=nld}"               # Tesseract language
: "${SCAN_DPI:=300}"               # A good resolution for OCR
: "${FINAL_RES:=1240x}"            # A good resolution for PDF
: "${SHOW:=0}"                     # Show the result
: "${NICE:=10}"                    # Priority of this process - this prevents Tesseract from hogging your machine
: "${UNPAPER:=1}"                  # Whether to run unpaper
: "${FILENAME:=AUTO}"              # If no FILENAME is set, or set to 'AUTO': make one up based on time
: "${VERBOSE:=0}"                  # Whether to print verbose information
: "${ROOT_DIR:=$PWD}"              # Base directory for the three working directories below
: "${INPUT_DIR:=${ROOT_DIR}/input}"   # Scanned pages, one sub-directory per document
: "${INTER_DIR:=${ROOT_DIR}/inter}"   # Intermediate files created during conversion
: "${OUTPUT_DIR:=${ROOT_DIR}/output}" # Final PDF files

# Colour escape sequences for messages. tput fails when there is no usable
# terminal description (e.g. TERM unset under cron); fall back to empty
# strings instead of letting errexit abort the whole script.
red=$(tput setaf 1 2>/dev/null || true)
green=$(tput setaf 2 2>/dev/null || true)
blue=$(tput setaf 4 2>/dev/null || true)
reset=$(tput sgr0 2>/dev/null || true)
# Print a short help text listing the available subcommands to stdout.
# Reads: INPUT_DIR, OUTPUT_DIR (shown in the help text).
function usage() {
  printf '%s\n' \
    "Usage: $0 subcommand" \
    "Subcommands:" \
    "- scan: scan documents into '${INPUT_DIR}' folders" \
    "- convert: convert previously scanned documents into '${OUTPUT_DIR}' folders"
}
# Echo the given arguments, but only when VERBOSE is set to "1".
# Arguments: anything accepted by `echo`.
function echo_verbose() {
  # Nothing to do (and nothing to fail) when verbose output is off.
  [[ "$VERBOSE" == "1" ]] || return 0
  echo "$@"
}
# Print a directory name derived from the current date and time,
# formatted as YYYY-MM-DD_HH-MM-SS.
function timed_dir() {
  local stamp
  stamp=$(date "+%F_%T")
  # Don't use ':' because of Windows
  echo "${stamp//:/-}"
}
# Interactive scan loop.
#
# Repeatedly shows a small menu, reads one line from stdin and acts on it:
#   [enter] (or anything else)  scan the next page of the current document
#   n                           close the current document, start a new one
#   l                           scan one last page, then close the document
#
# Each document gets its own sub-directory named by timed_dir; page numbers
# restart at 1 for every new document.
#
# The loop ends when stdin reaches end-of-file (e.g. piped input is
# exhausted). A document that is still open at that point is NOT marked as
# finished, just as when the user aborts with ctrl+c.
#
# Calls:   timed_dir, scan_file, finish_scan
# Reads:   red, green, reset (colour escape sequences)
# Returns: 0 on end-of-file
function _scan() {
  local page=1
  local sub_dir action
  sub_dir=$(timed_dir)

  while true; do
    cat <<EOF
Press:
[enter] to scan,
[n] for new document,
[l] to scan the last page, then start new document,
[ctrl+c] to abort
Going to scan page '${red}${page}${reset}' into '${green}${sub_dir}${reset}'
EOF
    # -r keeps backslashes literal. A failing read means stdin is closed;
    # without this check the loop would keep scanning pages forever when
    # errexit is not active.
    if ! IFS= read -r action; then
      echo
      return 0
    fi

    case "${action}" in
      n)
        finish_scan "${sub_dir}"
        page=0 # incremented to 1 below
        sub_dir=$(timed_dir)
        ;;
      l)
        scan_file "${sub_dir}" "${page}"
        finish_scan "${sub_dir}"
        page=0 # incremented to 1 below
        sub_dir=$(timed_dir)
        ;;
      *)
        scan_file "${sub_dir}" "${page}"
        ;;
    esac

    page=$((page + 1))
    echo
  done
}
# Close a scanned document.
#
# If the document directory exists, append the settings used for scanning to
# its 'info' file and create the 'done' marker that _convert looks for.
# Always prints a separator line afterwards.
#
# Arguments: $1 - name of the document sub-directory inside INPUT_DIR
# Reads:     INPUT_DIR, SCAN_DPI, OCR_LANG, FINAL_RES, UNPAPER
function finish_scan() {
  local sub_dir="$1"
  local full_dir="${INPUT_DIR}/${sub_dir}"

  if [[ -d "${full_dir}" ]]; then
    # The info file is later read back with `source`, so every value is
    # written shell-quoted (%q): a value containing a quote or other special
    # character can neither corrupt the file nor be executed as code.
    printf '%s=%q\n' \
      SCAN_DPI "${SCAN_DPI}" \
      OCR_LANG "${OCR_LANG}" \
      FINAL_RES "${FINAL_RES}" \
      UNPAPER "${UNPAPER}" >>"${full_dir}/info"
    touch "${full_dir}/done"
  fi

  echo
  echo "==================================================================================="
}
# Scan a single page into a document directory.
#
# Creates the document directory (and an empty 'info' file) when needed and
# runs scanimage, writing the page to 'page_NNN.tiff' (zero-padded number).
#
# Arguments: $1 - name of the document sub-directory inside INPUT_DIR
#            $2 - page number (integer)
# Reads:     INPUT_DIR, SCANNER, SCAN_DPI, red, green, reset
# Returns:   0 on success; the exit status of the failing command otherwise
function scan_file() {
  local sub_dir="$1"
  local page="$2"
  local full_dir="${INPUT_DIR}/${sub_dir}"
  local file
  local rc=0

  printf -v file '%s/page_%03d.tiff' "${full_dir}" "${page}" || return

  echo "Now scanning page '${red}${page}${reset}' into '${green}${full_dir}${reset}'"
  mkdir -p "${full_dir}" || return
  touch "${full_dir}/info" || return

  scanimage -p -d "${SCANNER}" \
    --format=tiff \
    --resolution "${SCAN_DPI}" \
    --output-file "${file}" || rc=$?

  # Report a failed scan explicitly instead of claiming success; this also
  # works when the script runs without errexit.
  if ((rc != 0)); then
    echo "Scanning page '${page}' failed (scanimage exit status ${rc})" >&2
    return "${rc}"
  fi

  echo "Finished."
}
# Convert every finished document found in INPUT_DIR.
#
# A document is a sub-directory of INPUT_DIR containing a 'done' marker.
# Each one is handed to convert_dir; its scanned input is removed only once
# the resulting PDF ('<OUTPUT_DIR>/<document>.pdf') actually exists, so a
# failed conversion can never destroy the scanned pages.
#
# Reads:   INPUT_DIR, OUTPUT_DIR
# Calls:   convert_dir
# Returns: 0 when every finished document has a PDF, 1 otherwise
function _convert() {
  # NOTE: the loop variable must stay named 'entry' for as long as
  # convert_dir reads ${entry} through dynamic scoping.
  local entry output_pdf
  local failed=0

  mkdir -p "${OUTPUT_DIR}"

  # Quoted so that an INPUT_DIR containing spaces is not word-split.
  for entry in "${INPUT_DIR}"/*; do
    # Also skips the literal pattern that is left when nothing matches.
    if [[ ! -d "${entry}" || ! -f "${entry}/done" ]]; then
      continue
    fi

    convert_dir "${entry}"

    output_pdf="${OUTPUT_DIR}/${entry##*/}.pdf"
    if [[ -s "${output_pdf}" ]]; then
      rm -rf -- "${entry:?}"
    else
      echo "No PDF was produced for '${entry}' - keeping the scanned input" >&2
      failed=1
    fi
  done

  return "${failed}"
}
# Convert one scanned document directory into a searchable PDF.
#
# Steps: optionally clean the pages (pre_process), combine them into one
# TIFF, create a down-scaled JPEG-compressed PDF, run tesseract to produce a
# text-only PDF, and merge the two with qpdf so the text lies underneath the
# page images. Intermediate files are removed on success.
#
# Per-document settings are read from '<inputdir>/info' when present.
# UNPAPER defaults to 1 when the info file does not set it. The settings are
# kept local so they cannot leak into the next document or overwrite the
# global configuration.
#
# Arguments: $1 - document directory (normally a sub-directory of INPUT_DIR)
# Reads:     INPUT_DIR, INTER_DIR, OUTPUT_DIR, OCR_LANG, FINAL_RES, SCAN_DPI
# Calls:     echo_verbose, pre_process, convert, tesseract, qpdf
# Returns:   0 on success or when the PDF already exists; non-zero as soon as
#            one of the conversion steps fails
function convert_dir() {
  local inputdir="$1"
  # The prefix is quoted so it is matched literally, not as a glob pattern.
  local subdir="${inputdir#"${INPUT_DIR}"/}"
  local interdir="${INTER_DIR}/${subdir}"
  local output_pdf="${OUTPUT_DIR}/${subdir}.pdf"
  local combined_tiff="${interdir}/combined.tiff"
  # Local copies; the sourced info file may override them for this document.
  local SCAN_DPI="${SCAN_DPI:-}" OCR_LANG="${OCR_LANG}" FINAL_RES="${FINAL_RES}" UNPAPER=1

  if [[ -f "${output_pdf}" ]]; then
    echo_verbose "Already processed '${red}${inputdir}${reset}' - ${blue}skipping${reset}!"
    return 0
  fi

  echo "Converting: '${red}${inputdir}${reset}' into '${green}${interdir}${reset}'"

  # Read the settings from the directory passed as argument, not from a
  # variable that happens to be set by the caller.
  if [[ -f "${inputdir}/info" ]]; then
    # shellcheck source=/dev/null
    source "${inputdir}/info"
  fi

  mkdir -p "${interdir}" || return

  if [[ "${UNPAPER}" == "1" ]]; then
    pre_process "${inputdir}" "${interdir}" || return
    convert "${interdir}"/page_*-unpaper.pnm "${combined_tiff}" || return
  else
    convert "${inputdir}"/page_*.tiff "${combined_tiff}" || return
  fi

  echo_verbose "Rescaling output PDF..."
  convert "${combined_tiff}" -scale "${FINAL_RES}" -compress jpeg -quality 75 "${interdir}/combined.pdf" || return

  echo_verbose "Running OCR..."
  tesseract -l "${OCR_LANG}" -c textonly_pdf=1 "${combined_tiff}" "${interdir}/ocr" pdf || return

  qpdf "${interdir}/combined.pdf" --underlay "${interdir}/ocr.pdf" -- "${output_pdf}" || return

  echo "Conversion of '${output_pdf}' is done"
  rm -rf -- "${interdir:?}"
}
# Clean up all scanned pages of one document in parallel.
#
# For every 'page_*.tiff' in the input directory, the command line printed by
# pre_process_page is fed to `parallel`, which executes the commands; the
# result for 'page_NNN.tiff' is '<output>/page_NNN-unpaper.pnm'.
#
# Arguments: $1 - directory containing the scanned 'page_*.tiff' files
#            $2 - directory receiving the processed pages
# Reads:     red, reset
# Calls:     pre_process_page, parallel
# Returns:   1 when the input directory contains no pages; otherwise the
#            exit status of the pipeline feeding `parallel`
function pre_process() {
  local input="$1"
  local output="$2"
  local -a pages=()
  local page name

  # Expand the glob directly (quoted directory) so paths containing spaces
  # survive; an unmatched pattern is filtered out by the -e test.
  for page in "${input}"/page_*.tiff; do
    if [[ -e "${page}" ]]; then
      pages+=("${page}")
    fi
  done

  echo "Pre-processing ${red}${#pages[@]}${reset} page(s) in parallel..."

  if ((${#pages[@]} == 0)); then
    echo "No pages found in '${input}'" >&2
    return 1
  fi

  # Only command lines are printed here; the parallelism comes from
  # `parallel`, so there is no need to background anything in the loop.
  for page in "${pages[@]}"; do
    name="${page##*/}"
    pre_process_page "${page}" "${output}/${name%.tiff}-unpaper.pnm"
  done | parallel --bar -u
}
# Print (do not run) the unpaper command line that cleans one scanned page.
# The printed line is meant to be executed by a shell (pre_process pipes it
# to `parallel`).
#
# Arguments: $1 - input image
#            $2 - output image
# Reads:     SCAN_DPI - resolution the page was scanned at; 300.0 is used
#            when it is unset or empty
# Outputs:   one shell command line on stdout
function pre_process_page() {
  local input="$1"
  local output="$2"

  # %q shell-quotes each value, so paths containing spaces or quotes are
  # passed to unpaper intact. NOTE(review): for unusual characters (e.g.
  # control characters) %q emits bash-specific $'...' quoting - confirm the
  # shell used by `parallel` if such paths are expected.
  printf 'unpaper --overwrite -q --dpi %q %q %q 2>/dev/null\n' \
    "${SCAN_DPI:-300.0}" "${input}" "${output}"
}
# Verify that the external programs needed by a subcommand are installed.
#
# Every missing program is reported on stderr; if at least one is missing
# the script exits with status 1. Subcommands without dependencies (such as
# 'help') always pass.
#
# Arguments: $1 - subcommand name ('scan', 'convert', or anything else)
function test_deps() {
  local deps=""
  local missing=0
  local dep

  case "${1:-}" in
    scan)
      deps="scanimage"
      ;;
    convert)
      deps="unpaper convert tesseract qpdf parallel"
      ;;
  esac

  # Deliberate word splitting: deps is a fixed, space-separated list.
  # shellcheck disable=SC2086
  for dep in ${deps}; do
    if ! command -v "${dep}" &>/dev/null; then
      # Only report the problem; trying to execute the missing program would
      # abort with status 127 under errexit before all of them are listed.
      echo "Missing command: '${dep}'" >&2
      missing=1
    fi
  done

  if ((missing)); then
    exit 1
  fi
}
# Entry point: check the dependencies of the requested subcommand, then
# dispatch to it.
#
# Arguments: $1 - subcommand: 'scan', 'convert', 'h' or 'help'
# Exits 0 after printing the usage for 'h'/'help', and 1 after printing it
# for an unknown subcommand.
function _run() {
  COMMAND="$1"
  test_deps "${COMMAND}"

  if [[ "${COMMAND}" == "scan" ]]; then
    _scan
  elif [[ "${COMMAND}" == "convert" ]]; then
    _convert
  elif [[ "${COMMAND}" == "h" || "${COMMAND}" == "help" ]]; then
    usage && exit 0
  else
    usage && exit 1
  fi
}
# Run the requested subcommand; show the help text when none is given.
# Quoted so the argument is passed on as exactly one word (no word splitting
# or glob expansion).
_run "${1:-help}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment