Last active
August 10, 2024 18:03
-
-
Save jikamens/9cc4973a595c82d47c941892d97d07d0 to your computer and use it in GitHub Desktop.
Demonstration of how to automatically scan and file bills
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Scan a bill from the document feeder, OCR its first page to work out which
# provider and statement date it belongs to, and file it as a PDF under
# $HOME/closed/finances/statements.

# Enable errexit with "set" rather than on the shebang line so it still
# applies when the script is started as "bash <script>".
set -e

WHOAMI=$(basename -- "$0")
# Cache file holding the scanner device name found by "scanimage -L".
DEV_FILE=$HOME/.scan-bill.dev
RETRY=false
MULTIPLE=false
DUPLEX_SOURCE='Automatic Document Feeder(centrally aligned,Duplex)'
SIMPLEX_SOURCE='Automatic Document Feeder(centrally aligned)'
SOURCE="$DUPLEX_SOURCE"
# Set to true by do_one once at least one bill has reached the OCR stage.
STARTED=false
# Optional settings; empty means "not specified on the command line".
MMDDYY=""
PAGES_PER_BILL=""
USAGE="Usage: $WHOAMI [--retry] [--date M/D/Y] [--multiple] [--simplex]
[--pages-per-bill #]"

# Print an error message to stderr and abort the script.
_arg_error() {
  echo "$1" 1>&2
  exit 1
}

# Parse command-line options into the global settings above.
# Arguments: the script's command-line arguments.
# Exits 0 after printing usage for -h/--help; exits 1 on an unrecognized
# argument, a missing option value, or an invalid option value.
parse_args() {
  # Loop on the argument count, not on "$1" being non-empty, so that an empty
  # argument is reported instead of silently ending option parsing.
  while (($# > 0)); do
    case "$1" in
      -h | --help)
        echo "$USAGE"
        exit
        ;;
      # Try to process the files in /tmp/scan-bill again. Useful if something
      # failed and then you fixed the script to account for the failure and
      # don't want to rescan the document.
      --retry)
        RETRY=true
        shift
        ;;
      # Specify the bill date for bill being processed. If processing
      # multiple bills (see --pages-per-bill below), only specifies the bill
      # date for the first one.
      --date)
        (($# >= 2)) || _arg_error "--date requires an argument"
        # Normalize whatever date(1) understands to MM/DD/YY.
        MMDDYY="$(date +%D --date "$2")" || exit 1
        shift 2
        ;;
      # Allow multiple bills for the same date for the same service provider.
      # Adds "-2", "-3", etc. suffixes to the file names of bills after the
      # first one.
      --multiple)
        MULTIPLE=true
        shift
        ;;
      # Tells the scanner to only scan the fronts of pages.
      --simplex)
        SOURCE="$SIMPLEX_SOURCE"
        shift
        ;;
      # Indicates that multiple bills are being scanned, and that each bill
      # has the specified number of pages. Note that this is *sides of a
      # page*, so e.g. if you're scanning duplex bills that have only one
      # sheet then the correct number to specify here is 2.
      # When you specify this, then you can scan a stack of bills -- all of
      # which are the same in terms of simplex/duplex and length -- and the
      # script processes them all sequentially. If one of them fails the
      # script aborts with its files in place, and then you can fix the
      # problem and rerun the script with --retry and it'll pick up where it
      # left off.
      --pages-per-bill)
        (($# >= 2)) || _arg_error "--pages-per-bill requires an argument"
        if [[ ! "$2" =~ ^[0-9]+$ ]] || ((10#$2 < 1)); then
          _arg_error "--pages-per-bill must be a positive integer: $2"
        fi
        # 10# strips leading zeroes so the value is never read as octal.
        PAGES_PER_BILL=$((10#$2))
        shift 2
        ;;
      *)
        _arg_error "Unrecognized argument: $1"
        ;;
    esac
  done
}

parse_args "$@"

# Fixed (not mktemp) locations: --retry and an aborted --pages-per-bill batch
# both rely on finding the previous run's files here.
TD_SCANNING=/tmp/$WHOAMI-images
TD_WORKING=/tmp/$WHOAMI
# Entry point: file one bill, then keep filing further bills for as long as
# do_one succeeds and a page count per bill was given (PAGES_PER_BILL is
# non-empty). Without a page count exactly one bill is processed.
main() {
  while true; do
    do_one || break
    [ -n "$PAGES_PER_BILL" ] || break
  done
}
# Put the page images for the next bill into $TD_WORKING.
#
# Works inside $TD_SCANNING (created if needed; it becomes the current
# directory). If no image* files are waiting there, a new scan is started,
# unless a bill has already been processed in this run (STARTED is true), in
# which case there is simply nothing left to do.
#
# Globals:  TD_SCANNING, TD_WORKING, DEV_FILE, SOURCE, STARTED,
#           PAGES_PER_BILL (all read); $DEV_FILE is written when the scanner
#           device has to be looked up.
# Returns:  0 when images were moved into $TD_WORKING;
#           1 when no images are left and STARTED is true, or when scanadf
#           fails (any partial images are removed first).
# Exits 1 when the directories are unusable, no scanner is found, the page
# count is not a number, or the images cannot be moved.
do_scanadf() {
  local dev
  local -a images

  mkdir -p "$TD_SCANNING" || exit 1
  cd "$TD_SCANNING" || exit 1

  # An unmatched glob stays literal (or vanishes under nullglob); either way
  # the -e test below fails, which is how "no images waiting" is detected.
  images=(image*)
  if [[ ! -e "${images[0]}" ]]; then
    if $STARTED; then
      return 1
    fi
    if [[ -f "$DEV_FILE" ]]; then
      dev=$(cat "$DEV_FILE")
    else
      # Take the first device that "scanimage -L" reports.
      dev=$(scanimage -L |
        sed -E -n -e "s/^device \`(.*)' is a .*/\\1/p" |
        head -n 1)
      if [[ -z "$dev" ]]; then
        echo "Could not find scanner" 1>&2
        exit 1
      fi
      printf '%s\n' "$dev" > "$DEV_FILE"
    fi
    # NOTE(review): -y 279.4 is presumably the page length in mm (11in);
    # confirm against the scanner backend's options.
    if ! scanadf --device "$dev" --mode 'Black & White' \
      --resolution 300 --source "$SOURCE" -y 279.4 >| \
      scanadf.log 2>&1; then
      rm -f image*
      return 1
    fi
    images=(image*)
    if [[ ! -e "${images[0]}" ]]; then
      echo "Scan produced no images in $TD_SCANNING" 1>&2
      exit 1
    fi
  fi

  if [[ -n "$PAGES_PER_BILL" ]]; then
    if [[ ! "$PAGES_PER_BILL" =~ ^[0-9]+$ ]]; then
      echo "Invalid pages per bill: $PAGES_PER_BILL" 1>&2
      exit 1
    fi
    # Hand over only the first PAGES_PER_BILL images (glob order); the rest
    # stay behind for the following bills. 10# avoids octal interpretation.
    mv -- "${images[@]:0:$((10#$PAGES_PER_BILL))}" "$TD_WORKING/." || exit 1
  else
    mv -- "${images[@]}" "$TD_WORKING/." || exit 1
  fi
}
# Delete a scanned page image if OCR finds (almost) no text on it.
#
# Runs tesseract on the image in the current directory, using maybe-empty.txt
# and maybe-empty.log as scratch files, and removes the image when the
# recognized text is shorter than 92 bytes.
#
# Arguments: $1 - image file to inspect.
# Returns:   0, including when the image does not exist (only a note is
#            printed in that case).
# Exits 1 when tesseract fails or its output cannot be read.
check_empty() {
  local image_file=$1
  local chars

  if [[ ! -f "$image_file" ]]; then
    echo "check_empty: $image_file does not exist" 1>&2
    return 0
  fi

  rm -f maybe-empty.txt
  tesseract --psm 6 "$image_file" maybe-empty >| maybe-empty.log 2>&1 || exit 1
  # Abort explicitly if the OCR output is missing: an empty count would
  # compare as 0 below and the page would be deleted as "empty". errexit
  # cannot be relied on here because callers may run in a conditional context.
  chars=$(wc -c < maybe-empty.txt) || exit 1
  # NOTE(review): 92 is presumably an empirical bound on the OCR noise a
  # blank page produces -- confirm before changing it.
  if ((chars < 92)); then
    echo "$image_file is empty, removing" 1>&2
    rm -f -- "$image_file"
  else
    echo "$image_file has $chars characters in it, preserving" 1>&2
  fi
  rm -f maybe-empty.txt
}
# Print the statement date found in page1.txt (current directory) as
# MM/DD/YY.
#
# Two layouts are tried, both located with approximate matching (tre-agrep,
# up to 2 errors) to tolerate OCR mistakes:
#   1. "Statement Date: MM/DD/YY" or "MM/DD/YYYY"
#   2. "Statement Date: Mon D, 20YY", converted with date(1)
# Only the first matching line is used, so the result is always a single
# date even if the phrase appears more than once on the page.
#
# Outputs: the date on stdout.
# Exits 1 with a message on stderr when no date is found. Callers are
# expected to invoke this in a command substitution and check its status.
eastern_bank_statement_date() {
  local mmddyy english_date

  mmddyy=$(tre-agrep -2 'Statement *Date:? *[0-9][0-9]/[0-9][0-9]/([0-9][0-9])?[0-9][0-9]' page1.txt |
    sed -E -n -e 's,.*([0-9][0-9]/[0-9][0-9]/)([0-9][0-9])?([0-9][0-9]).*,\1\3,p' |
    head -n 1)
  if [[ -n "$mmddyy" ]]; then
    echo "$mmddyy"
    return
  fi

  english_date=$(tre-agrep -2 'Statement *Date:? *[A-Z][a-z][a-z][- ]*[1-9][0-9]*, *20[0-9][0-9]' page1.txt |
    sed -n -e 's/.*\([A-Z][a-z][a-z]\)[- ]*\([1-9][0-9]*,\)[- ]*\(20[0-9][0-9]\).*/\1 \2 \3/p' |
    head -n 1)
  if [[ -n "$english_date" ]]; then
    date --date "$english_date" +%m/%d/%y
    return
  fi

  echo "Failed to detect statement date" 1>&2
  exit 1
}
# Process a single bill: obtain its page images, OCR the first page, work out
# the provider and statement date, build bill.pdf and copy it to
# $HOME/closed/finances/statements/<provider>/<year>/<YYYYMMDD>[-N].pdf.
#
# Globals:  RETRY (read; reset to false after a retry), STARTED (set to true
#           once images are in place), MMDDYY (read as a date override, then
#           cleared), MULTIPLE, TD_WORKING, DEV_FILE, HOME (read).
# Returns:  0 after a bill has been filed;
#           1 when a previous bill was already processed in this run and no
#           images are left, i.e. the batch is finished.
# Exits 1 on any failure (scan, OCR, date detection, unknown bill type,
# existing target without --multiple, conversion or copy errors). The working
# directory is left in place so the run can be repeated with --retry.
do_one() {
  local SUBDIR DIR FILE_BASENAME
  local month day year suffix target
  local -a images
  local -r date_re='^([0-9]{1,2})/([0-9]{1,2})/([0-9]{2})'

  if $RETRY; then
    # Reuse whatever is already in $TD_WORKING; only applies to the first bill.
    RETRY=false
  else
    rm -rf -- "${TD_WORKING:?}"
    mkdir "$TD_WORKING" || exit 1
    if ! do_scanadf; then
      if $STARTED; then
        # Once a bill has been processed, do_scanadf only fails because no
        # images are left. That is the normal end of a batch, so report it to
        # the caller instead of aborting the script with a failure status.
        return 1
      fi
      # The cached device name may be stale: forget it and try once more.
      rm -f -- "$DEV_FILE"
      do_scanadf || exit 1
    fi
  fi
  cd "$TD_WORKING" || exit 1
  STARTED=true

  images=(image*)
  if [[ ! -e "${images[0]}" ]]; then
    echo "No page images found in $TD_WORKING" 1>&2
    exit 1
  fi
  # OCR only the first page; it is what identifies the bill.
  tesseract --psm 6 "${images[0]}" page1 >| tesseract.log 2>&1 || exit 1

  # Identify the provider from the OCR text. The *-account-number strings
  # look like placeholders -- NOTE(review): presumably to be replaced with
  # the real account numbers. Date helpers are checked explicitly because
  # errexit is not in effect when this function runs in a loop condition.
  if tre-agrep -q -s -2 home-loan-account-number page1.txt; then
    if [[ -z "$MMDDYY" ]]; then
      MMDDYY=$(eastern_bank_statement_date) || exit 1
    fi
    SUBDIR=eastern_bank/home_loan
    # Drop this page if OCR finds it blank.
    check_empty image-0004
  elif tre-agrep -q -s -2 heloc-account-number page1.txt; then
    if [[ -z "$MMDDYY" ]]; then
      MMDDYY=$(eastern_bank_statement_date) || exit 1
    fi
    SUBDIR=eastern_bank/heloc
    check_empty image-0004
  elif tre-agrep -q -s -2 heat-loan-account-number page1.txt; then
    if [[ -z "$MMDDYY" ]]; then
      MMDDYY=$(eastern_bank_statement_date) || exit 1
    fi
    SUBDIR=eastern_bank/heat_loan
    check_empty image-0002
    check_empty image-0004
  elif grep -E -q -s -i 'JEWISH COMMUNITY DAY SCHOOL|Afterschool Invoice|jcdsboston|JCDS|57 Stanley' page1.txt; then
    if [[ -z "$MMDDYY" ]]; then
      # Use the latest date that appears on the page.
      MMDDYY=$(perl -e 'use Date::Parse; use POSIX "strftime"; $t = 0; while (<>) { while (m,(\d\d?/\d\d?/\d\d\s*\d\s*\d),g) { ($s = $1) =~ s/\s+//g; $t2 = str2time($s); if ($t2 > $t) { $t = $t2; } } } print(strftime("%D", localtime($t))) if ($t);' < page1.txt)
      if [[ -z "$MMDDYY" ]]; then
        echo "Failed to detect statement date" 1>&2
        exit 1
      fi
    fi
    SUBDIR=jcds
  elif grep -q -s -i 'boston water' page1.txt; then
    echo "Detected Boston Water statement"
    if [[ -z "$MMDDYY" ]]; then
      MMDDYY=$(sed -E -n -e 's;.*([0-9][0-9]/[0-9][0-9]/[0-9][0-9]) *previous balance.*;\1;pi' page1.txt)
      if [[ -z "$MMDDYY" ]]; then
        echo "Failed to detect statement date" 1>&2
        exit 1
      fi
    fi
    SUBDIR=boston_water
  fi

  if [[ -n "$MMDDYY" && -n "$SUBDIR" ]]; then
    if [[ ! "$MMDDYY" =~ $date_re ]]; then
      echo "Unrecognized statement date: $MMDDYY" 1>&2
      exit 1
    fi
    # 10# forces base 10 so that "08" and "09" are not rejected as invalid
    # octal numbers while zero-padding month and day.
    printf -v month '%02d' "$((10#${BASH_REMATCH[1]}))"
    printf -v day '%02d' "$((10#${BASH_REMATCH[2]}))"
    year=20${BASH_REMATCH[3]}
    DIR=$HOME/closed/finances/statements/$SUBDIR/$year
    FILE_BASENAME=$year$month$day
  fi

  # Build the PDF even when the bill type is unknown, so that bill.pdf is
  # available in the working directory after the failure below.
  convert image* bill.pdf >| convert.log 2>&1 || exit 1

  if [[ -n "$DIR" && -n "$FILE_BASENAME" ]]; then
    suffix=""
    while true; do
      target="$DIR/$FILE_BASENAME$suffix.pdf"
      if [[ -f "$target" ]]; then
        if ! $MULTIPLE; then
          echo "$target already exists, aborting." 1>&2
          exit 1
        fi
        # Name taken: try -2, then -3, and so on.
        if [[ -z "$suffix" ]]; then
          suffix=-2
        else
          suffix=-$((${suffix#*-} + 1))
        fi
        continue
      fi
      mkdir -p "$DIR" || exit 1
      cp -i bill.pdf "$target" || exit 1
      echo Saved as "$target"
      break
    done
  else
    echo Failed to determine bill type 1>&2
    exit 1
  fi

  # If specified on command, only applies to first bill.
  MMDDYY=""
}
# Run the script. Command-line options were already consumed by the parsing
# code near the top of the file, so main takes no arguments.
main
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment