jovandeginste · December 31, 2020 11:55
diff --git a/scandoc b/scandoc
 #!/bin/bash
 # Strict mode is enabled with `set` rather than on the shebang line so that
 # it also applies when the script is started as `bash scandoc`.
 set -euo pipefail

 # Usage: scandoc [expected number of pages]
 # Eg:
 #   scandoc 1    # Scan one page, then process
 #   scandoc 5    # Scan 5 pages, then process
 #   scandoc      # Scan until page 99 or other action triggers end
 #
 # Description:
 #   Scandoc interactively scans a number (1 or more) of pages, runs
 #   post-processing operations on each page, runs OCR on the high quality
 #   version and generates a new, lower quality PDF. The OCR from the high
 #   quality version and the images from the low quality version are then
 #   merged into a single PDF.
 #
 #   This final PDF should be fairly low size, very readable, and contain
 #   accurately recognized text.
 #
 # Requires:
 #   - scanimage for scanning
 #   - unpaper for rectifying, removing background
 #   - ImageMagick (convert) for merging, resampling and compressing
 #   - tesseract for OCR
 #   - qpdf to merge image and text pdf
 #
 #   Use scanimage -L to get a list of devices.
 #   e.g. device `genesys:libusb:006:003' is a Canon LiDE 210 flatbed scanner
 #   then copy/paste genesys:libusb:006:003 into SCANNER below.

 PAGES=${1:-99} # Parameter: number of pages; defaults to 99 (which is equivalent to 'unlimited')

 # Parameters based on environment variables
 # Sensible defaults are set for me
 # You can set your own defaults per shell by exporting them, or
 # eg. add them to your ~/.bashrc
 #
 # The expansions are quoted so that a value taken from the environment is
 # never word-split or glob-expanded by the ':' no-op command.

 : "${SCANNER:=brother4:bus9;dev1}" # My USB scanner
 : "${OCR_LANG:=nld}"               # Tesseract language
 : "${SCAN_DPI:=300}"               # A good resolution for OCR
 : "${FINAL_RES:=1240x}"            # A good resolution for PDF
 : "${SHOW:=0}"                     # Show the result
 : "${NICE:=10}"                    # Priority of this process - this prevents Tesseract from hogging your machine
 : "${UNPAPER:=1}"                  # Whether to run unpaper
 : "${FILENAME:=AUTO}"              # If no FILENAME is set, or set to 'AUTO': make one up based on time

 if [[ "${FILENAME}" == "AUTO" ]]; then
   # %H-%M-%S rather than %T: don't use ':' because of Windows
   FILENAME="$(date "+%F_%H-%M-%S").pdf"
 fi

 # Verify that every external tool used further down is installed. All
 # missing tools are reported together on stderr, then the script exits
 # with status 1.
 missing_tools=()
 for tool in unpaper scanimage convert tesseract qpdf; do
   if ! command -v "${tool}" &>/dev/null; then
     missing_tools+=("${tool}")
   fi
 done

 if [[ "${#missing_tools[@]}" -gt 0 ]]; then
   # printf repeats the format once per missing tool.
   printf "Missing command: '%s'\n" "${missing_tools[@]}" >&2
   exit 1
 fi

 # PAGES must be a positive decimal integer.
 re='^[0-9]+$' # Check if PAGES is a number
 if ! [[ ${PAGES} =~ $re ]]; then
   echo "error: Usage: $0 [number_of_pages]" >&2
   exit 1
 fi

 # Force base 10: a value such as "08" passes the regex above but would be
 # rejected as invalid octal by the arithmetic comparisons later on.
 PAGES=$((10#${PAGES}))

 if [[ "${PAGES}" -eq 0 ]]; then
   echo "error: number_of_pages must be at least 1" >&2
   exit 1
 fi

 # Working directory for the intermediate scans; removed on every exit path.
 TMP_DIR=$(mktemp -d) || {
   echo "error: could not create a temporary directory" >&2
   exit 1
 }
 # Single quotes defer the expansion until the trap runs and keep the path
 # quoted, so a TMPDIR containing whitespace is removed correctly.
 trap 'rm -vrf -- "${TMP_DIR}"' EXIT

 echo "Scanning to: '${FILENAME}'"

 # Scan pages one at a time until PAGES pages have been scanned or the user
 # ends the session. Each raw scan is stored in TMP_DIR as page_NNN.tiff.
 PAGE=0

 while [[ "${PAGE}" -lt "${PAGES}" ]]; do
   PAGE=$((PAGE + 1))

   echo "Please, place page ${PAGE}/${PAGES} on the scanner"
   # Printed with printf instead of a here-document so the menu does not
   # depend on a terminator line that must start in the first column.
   printf '%s\n' \
     "Press:" \
     " [enter] to scan," \
     " [q] to stop scanning and start parsing," \
     " [l] to scan one more page and then start parsing," \
     " [m] to scan and increase the total number of pages (${PAGES} -> $((PAGES + 1)))" \
     " [ctrl+c] to abort"

   # -r keeps any backslash in the answer literal.
   read -r ACTION
   [[ "${ACTION}" == "q" ]] && break
   echo "Start scanimage..."

   printf -v page_file '%s/page_%03d.tiff' "${TMP_DIR}" "${PAGE}"
   scanimage -p -d "${SCANNER}" \
     --format=tiff \
     --resolution "${SCAN_DPI}" \
     --output-file "${page_file}"

   if [[ "${UNPAPER}" == "1" ]]; then
     # Runs in the background so the next page can be scanned meanwhile.
     # unpaper is given the resolution the page was actually scanned at.
     unpaper -q --dpi "${SCAN_DPI}" "${page_file}" "${page_file%.tiff}-unpaper.pnm" &
   fi

   # Any answer other than q, l or m just scans the page and continues.
   case "${ACTION}" in
     l) break ;;
     m) PAGES=$((PAGES + 1)) ;;
   esac
 done

 # Let the background unpaper jobs started during scanning finish first.
 echo "Waiting for unpaper to finish..."
 wait

 # Announce the priority change at the moment it actually happens.
 echo "Changing priority to '${NICE}' (higher is lower)"
 # Lowering the priority is a courtesy to the rest of the machine; if it
 # fails (for example because NICE is not a valid value) the pages that were
 # already scanned must not be thrown away, so only warn.
 renice -n "${NICE}" "$$" || echo "warning: could not change priority to '${NICE}'" >&2

 echo "Merging pages..."

 # Expand the page globs into arrays. With nullglob a pattern without any
 # match yields an empty array instead of the literal pattern, so an empty
 # scan session can be reported clearly rather than failing inside convert.
 shopt -s nullglob
 raw_pages=("${TMP_DIR}"/page_*.tiff)
 if [[ "${UNPAPER}" == "1" ]]; then
   merge_inputs=("${TMP_DIR}"/page_*-unpaper.pnm)
 else
   merge_inputs=("${TMP_DIR}"/page_*.tiff)
 fi
 shopt -u nullglob

 if [[ "${#raw_pages[@]}" -eq 0 ]]; then
   echo "error: no pages were scanned, nothing to merge" >&2
   exit 1
 fi

 # The unpaper jobs ran in the background and their exit status was never
 # checked; a missing output file is the visible sign that one of them
 # failed, and merging anyway would silently drop that page.
 if [[ "${#merge_inputs[@]}" -ne "${#raw_pages[@]}" ]]; then
   echo "error: unpaper produced ${#merge_inputs[@]} of ${#raw_pages[@]} pages" >&2
   exit 1
 fi

 # Full-resolution multi-page image; this is the input for OCR.
 convert "${merge_inputs[@]}" "${TMP_DIR}/combined.tiff"

 # Downscaled, JPEG-compressed PDF that provides the visible pages.
 convert "${TMP_DIR}/combined.tiff" -scale "${FINAL_RES}" -compress jpeg -quality 75 "${TMP_DIR}/combined.pdf"

 # OCR runs on the full-resolution combined.tiff; the result is read back
 # below as <ocr_base>.pdf.
 printf '%s\n' "Running OCR..."
 ocr_base="${TMP_DIR}/ocr"
 tesseract \
   -l "${OCR_LANG}" \
   -c textonly_pdf=1 \
   "${TMP_DIR}/combined.tiff" "${ocr_base}" pdf

 # Combine the downscaled image PDF with the OCR PDF (as underlay) into the
 # final document.
 qpdf "${TMP_DIR}/combined.pdf" \
   --underlay "${ocr_base}.pdf" -- \
   "${FILENAME}"

 printf "Scan of '%s' is done\n" "${FILENAME}"

 # Remove the intermediate files before the viewer is opened. The path is
 # quoted, and ':?' aborts instead of running rm with an empty argument.
 rm -rf -- "${TMP_DIR:?}"

 # Optionally open the finished PDF in the desktop's default viewer.
 if [[ "${SHOW}" == "1" ]]; then
   xdg-open "${FILENAME}"
 fi
	#!/bin/bash
	# Strict mode is enabled with `set` rather than on the shebang line so that
	# it also applies when the script is started as `bash scandoc`.
	set -euo pipefail

	# Usage: scandoc [expected number of pages]
	# Eg:
	# scandoc 1 # Scan one page, then process
	# scandoc 5 # Scan 5 pages, then process
	# scandoc # Scan until page 99 or other action triggers end
	#
	# Description:
	# Scandoc interactively scans a number (1 or more) of pages, runs
	# post-processing operations on each page, runs OCR on the high quality
	# version and generates a new, lower quality PDF. The OCR from the high
	# quality version and the images from the low quality version are then
	# merged into a single PDF.
	#
	# This final PDF should be fairly low size, very readable, and contain
	# accurately recognized text.
	#
	# Requires:
	# - scanimage for scanning
	# - unpaper for rectifying, removing background
	# - ImageMagick (convert) for merging, resampling and compressing
	# - tesseract for OCR
	# - qpdf to merge image and text pdf
	#
	# Use scanimage -L to get a list of devices.
	# e.g. device `genesys:libusb:006:003' is a Canon LiDE 210 flatbed scanner
	# then copy/paste genesys:libusb:006:003 into SCANNER below.

	PAGES=${1:-99} # Parameter: number of pages; defaults to 99 (which is equivalent to 'unlimited')

	# Parameters based on environment variables
	# Sensible defaults are set for me
	# You can set your own defaults per shell by exporting them, or
	# eg. add them to your ~/.bashrc
	#
	# The expansions are quoted so that a value taken from the environment is
	# never word-split or glob-expanded by the ':' no-op command.

	: "${SCANNER:=brother4:bus9;dev1}" # My USB scanner
	: "${OCR_LANG:=nld}"               # Tesseract language
	: "${SCAN_DPI:=300}"               # A good resolution for OCR
	: "${FINAL_RES:=1240x}"            # A good resolution for PDF
	: "${SHOW:=0}"                     # Show the result
	: "${NICE:=10}"                    # Priority of this process - this prevents Tesseract from hogging your machine
	: "${UNPAPER:=1}"                  # Whether to run unpaper
	: "${FILENAME:=AUTO}"              # If no FILENAME is set, or set to 'AUTO': make one up based on time

	if [[ "${FILENAME}" == "AUTO" ]]; then
	  # %H-%M-%S rather than %T: don't use ':' because of Windows
	  FILENAME="$(date "+%F_%H-%M-%S").pdf"
	fi

	# Verify that every external tool used further down is installed. All
	# missing tools are reported together on stderr, then the script exits
	# with status 1.
	missing_tools=()
	for tool in unpaper scanimage convert tesseract qpdf; do
	  if ! command -v "${tool}" &>/dev/null; then
	    missing_tools+=("${tool}")
	  fi
	done

	if [[ "${#missing_tools[@]}" -gt 0 ]]; then
	  # printf repeats the format once per missing tool.
	  printf "Missing command: '%s'\n" "${missing_tools[@]}" >&2
	  exit 1
	fi

	# PAGES must be a positive decimal integer.
	re='^[0-9]+$' # Check if PAGES is a number
	if ! [[ ${PAGES} =~ $re ]]; then
	  echo "error: Usage: $0 [number_of_pages]" >&2
	  exit 1
	fi

	# Force base 10: a value such as "08" passes the regex above but would be
	# rejected as invalid octal by the arithmetic comparisons later on.
	PAGES=$((10#${PAGES}))

	if [[ "${PAGES}" -eq 0 ]]; then
	  echo "error: number_of_pages must be at least 1" >&2
	  exit 1
	fi

	# Working directory for the intermediate scans; removed on every exit path.
	TMP_DIR=$(mktemp -d) || {
	  echo "error: could not create a temporary directory" >&2
	  exit 1
	}
	# Single quotes defer the expansion until the trap runs and keep the path
	# quoted, so a TMPDIR containing whitespace is removed correctly.
	trap 'rm -vrf -- "${TMP_DIR}"' EXIT

	echo "Scanning to: '${FILENAME}'"

	# Scan pages one at a time until PAGES pages have been scanned or the user
	# ends the session. Each raw scan is stored in TMP_DIR as page_NNN.tiff.
	PAGE=0

	while [[ "${PAGE}" -lt "${PAGES}" ]]; do
	  PAGE=$((PAGE + 1))

	  echo "Please, place page ${PAGE}/${PAGES} on the scanner"
	  # Printed with printf instead of a here-document so the menu does not
	  # depend on a terminator line that must start in the first column.
	  printf '%s\n' \
	    "Press:" \
	    " [enter] to scan," \
	    " [q] to stop scanning and start parsing," \
	    " [l] to scan one more page and then start parsing," \
	    " [m] to scan and increase the total number of pages (${PAGES} -> $((PAGES + 1)))" \
	    " [ctrl+c] to abort"

	  # -r keeps any backslash in the answer literal.
	  read -r ACTION
	  [[ "${ACTION}" == "q" ]] && break
	  echo "Start scanimage..."

	  printf -v page_file '%s/page_%03d.tiff' "${TMP_DIR}" "${PAGE}"
	  scanimage -p -d "${SCANNER}" \
	    --format=tiff \
	    --resolution "${SCAN_DPI}" \
	    --output-file "${page_file}"

	  if [[ "${UNPAPER}" == "1" ]]; then
	    # Runs in the background so the next page can be scanned meanwhile.
	    # unpaper is given the resolution the page was actually scanned at.
	    unpaper -q --dpi "${SCAN_DPI}" "${page_file}" "${page_file%.tiff}-unpaper.pnm" &
	  fi

	  # Any answer other than q, l or m just scans the page and continues.
	  case "${ACTION}" in
	    l) break ;;
	    m) PAGES=$((PAGES + 1)) ;;
	  esac
	done

	# Let the background unpaper jobs started during scanning finish first.
	echo "Waiting for unpaper to finish..."
	wait

	# Announce the priority change at the moment it actually happens.
	echo "Changing priority to '${NICE}' (higher is lower)"
	# Lowering the priority is a courtesy to the rest of the machine; if it
	# fails (for example because NICE is not a valid value) the pages that were
	# already scanned must not be thrown away, so only warn.
	renice -n "${NICE}" "$$" || echo "warning: could not change priority to '${NICE}'" >&2

	echo "Merging pages..."

	# Expand the page globs into arrays. With nullglob a pattern without any
	# match yields an empty array instead of the literal pattern, so an empty
	# scan session can be reported clearly rather than failing inside convert.
	shopt -s nullglob
	raw_pages=("${TMP_DIR}"/page_*.tiff)
	if [[ "${UNPAPER}" == "1" ]]; then
	  merge_inputs=("${TMP_DIR}"/page_*-unpaper.pnm)
	else
	  merge_inputs=("${TMP_DIR}"/page_*.tiff)
	fi
	shopt -u nullglob

	if [[ "${#raw_pages[@]}" -eq 0 ]]; then
	  echo "error: no pages were scanned, nothing to merge" >&2
	  exit 1
	fi

	# The unpaper jobs ran in the background and their exit status was never
	# checked; a missing output file is the visible sign that one of them
	# failed, and merging anyway would silently drop that page.
	if [[ "${#merge_inputs[@]}" -ne "${#raw_pages[@]}" ]]; then
	  echo "error: unpaper produced ${#merge_inputs[@]} of ${#raw_pages[@]} pages" >&2
	  exit 1
	fi

	# Full-resolution multi-page image; this is the input for OCR.
	convert "${merge_inputs[@]}" "${TMP_DIR}/combined.tiff"

	# Downscaled, JPEG-compressed PDF that provides the visible pages.
	convert "${TMP_DIR}/combined.tiff" -scale "${FINAL_RES}" -compress jpeg -quality 75 "${TMP_DIR}/combined.pdf"

	# OCR runs on the full-resolution combined.tiff; the result is read back
	# below as <ocr_base>.pdf.
	printf '%s\n' "Running OCR..."
	ocr_base="${TMP_DIR}/ocr"
	tesseract \
	  -l "${OCR_LANG}" \
	  -c textonly_pdf=1 \
	  "${TMP_DIR}/combined.tiff" "${ocr_base}" pdf

	# Combine the downscaled image PDF with the OCR PDF (as underlay) into the
	# final document.
	qpdf "${TMP_DIR}/combined.pdf" \
	  --underlay "${ocr_base}.pdf" -- \
	  "${FILENAME}"

	printf "Scan of '%s' is done\n" "${FILENAME}"

	# Remove the intermediate files before the viewer is opened. The path is
	# quoted, and ':?' aborts instead of running rm with an empty argument.
	rm -rf -- "${TMP_DIR:?}"

	# Optionally open the finished PDF in the desktop's default viewer.
	if [[ "${SHOW}" == "1" ]]; then
	  xdg-open "${FILENAME}"
	fi