jikamens · August 10, 2024 18:03
diff --git a/scan-bill.sh b/scan-bill.sh
 #!/bin/bash -e
 #
 # Scan a bill with scanadf, OCR its first page with tesseract to identify the
 # provider and statement date, and file the result as a PDF under
 # $HOME/closed/finances/statements.

 # Turn on errexit explicitly too: the "-e" on the interpreter line above is
 # not applied when the script is started as "bash scan-bill.sh".
 set -e

 # Name the script was invoked as; used in the usage text and temp paths.
 WHOAMI=$(basename -- "$0")
 # Caches the scanner device name so later runs can skip "scanimage -L".
 DEV_FILE=$HOME/.scan-bill.dev
 # Option defaults; the option-parsing loop below may override them.
 RETRY=false
 MULTIPLE=false
 # Possible values for scanadf's --source option.
 DUPLEX_SOURCE='Automatic Document Feeder(centrally aligned,Duplex)'
 SIMPLEX_SOURCE='Automatic Document Feeder(centrally aligned)'
 SOURCE="$DUPLEX_SOURCE"
 # Set to true by do_one once it has a bill's pages to work on.
 STARTED=false
 USAGE="Usage: $WHOAMI [--retry] [--date M/D/Y] [--multiple] [--simplex]
        [--pages-per-bill #]"

 # Parse the command line. The loop runs on the argument count so that an
 # empty-string argument is reported as unrecognized instead of silently
 # ending option parsing.
 while [ $# -gt 0 ]; do
    case "$1" in
        -h|--help) echo "$USAGE"; exit ;;
        # Try to process the files in /tmp/scan-bill again. Useful if something
        # failed and then you fixed the script to account for the failure and
        # don't want to rescan the document.
        --retry) RETRY=true; shift ;;
        # Specify the bill date for bill being processed. If processing
        # multiple bills (see --pages-per-bill below), only specifies the bill
        # date for the first one.
        --date)
            if [ $# -lt 2 ]; then
                echo "$1 requires an argument" 1>&2
                exit 1
            fi
            # Normalize whatever date(1) accepts to MM/DD/YY.
            MMDDYY="$(date +%D --date "$2")" || exit 1
            shift 2 ;;
        # Allow multiple bills for the same date for the same service provider.
        # Adds "-2", "-3", etc. suffixes to the file names of bills after the
        # first one.
        --multiple) MULTIPLE=true; shift ;;
        # Tells the scanner to only scan the fronts of pages.
        --simplex) SOURCE="$SIMPLEX_SOURCE"; shift ;;
        # Indicates that multiple bills are being scanned, and that each bill
        # has the specified number of pages. Note that this is *sides of a
        # page*, so e.g. if you're scanning duplex bills that have only one
        # sheet then the correct number to specify here is 2.
        # When you specify this, then you can scan a stack of bills -- all of
        # which are the same in terms of simplex/duplex and length -- and the
        # script processes them all sequentially. If one of them fails the
        # script aborts with its files in place, and then you can fix the
        # problem and rerun the script with --retry and it'll pick up where it
        # left off.
        --pages-per-bill)
            # The value is later handed to the page-selection code as a
            # count, so reject anything but a positive integer up front
            # (this also catches a missing value).
            case "$2" in
                ''|*[!0-9]*|0)
                    echo "$1 requires a positive integer argument" 1>&2
                    exit 1 ;;
            esac
            PAGES_PER_BILL="$2"
            shift 2 ;;
        *) echo "Unrecognized argument: $1" 1>&2; exit 1 ;;
    esac
 done

 # Directory where one bill's pages are processed, and (with an "-images"
 # suffix) the directory scanadf writes freshly scanned pages into.
 TD_WORKING="/tmp/${WHOAMI}"
 TD_SCANNING="${TD_WORKING}-images"

 # Process bills one after another. Stops as soon as do_one fails, or after
 # the first bill when no --pages-per-bill value is in effect.
 main() {
    while true; do
        do_one || break
        [ -n "$PAGES_PER_BILL" ] || break
    done
 }

 # Make page images available and move the next bill's pages to $TD_WORKING.
 #
 # If $TD_SCANNING already holds image* files they are used as-is; otherwise
 # the document feeder is scanned with scanadf. The scanner device name is
 # read from $DEV_FILE when that file exists, else discovered with
 # "scanimage -L" and saved there.
 #
 # Globals read: TD_SCANNING, TD_WORKING, DEV_FILE, SOURCE, STARTED,
 #               PAGES_PER_BILL
 # Returns: 0 once pages have been moved; 1 if no images are left and a bill
 #          was already processed ($STARTED), or if scanadf failed.
 # Exits the script (status 1) if the directories cannot be used, no scanner
 # is found, or the pages cannot be moved.
 # Side effect: leaves the shell's working directory at $TD_SCANNING.
 do_scanadf() {
    local dev
    local -a pages

    mkdir -p "$TD_SCANNING" || exit 1
    cd "$TD_SCANNING" || exit 1

    # An unmatched glob stays literal, so test that the first entry exists.
    pages=(image*)
    if [ ! -e "${pages[0]}" ]; then
        if $STARTED; then
            return 1
        fi
        if [ -f "$DEV_FILE" ]; then
            dev=$(cat "$DEV_FILE")
        else
            dev=$(scanimage -L |
                      sed -E -n -e "s/^device \`(.*)' is a .*/\\1/p" |
                  head -1)
            # Quoted: device names may contain spaces.
            if [ -z "$dev" ]; then
                echo "Could not find scanner" 1>&2
                exit 1
            fi
            echo "$dev" > "$DEV_FILE"
        fi

        if ! scanadf --device "$dev" --mode 'Black & White' \
             --resolution 300 --source "$SOURCE" -y 279.4 >| \
                scanadf.log 2>&1; then
            # Discard a partial scan so the next attempt starts clean.
            rm -f image*
            return 1
        fi
        pages=(image*)
    fi
    if [ -n "$PAGES_PER_BILL" ]; then
        # Only the first PAGES_PER_BILL images belong to the next bill.
        mv -- "${pages[@]:0:$PAGES_PER_BILL}" "$TD_WORKING/." || exit 1
    else
        mv -- "${pages[@]}" "$TD_WORKING/." || exit 1
    fi
 }

 # Delete a scanned page image when OCR finds almost no text on it.
 #
 # Arguments: $1 - image file name, relative to the current directory.
 # Runs tesseract on the image and counts the bytes of recognized text; a
 # page yielding fewer than 92 bytes is treated as empty and its image is
 # removed (NOTE(review): the 92-byte threshold is not explained here).
 # A missing image is reported on stderr and otherwise ignored.
 # Exits the script (status 1) if tesseract fails.
 check_empty() {
    local image_file chars

    image_file=$1; shift
    if [ ! -f "$image_file" ]; then
        echo "check_empty: $image_file does not exist" 1>&2
        return
    fi
    # Remove stale OCR output so the count below reflects this image only.
    rm -f maybe-empty.txt
    tesseract --psm 6 "$image_file" maybe-empty >| maybe-empty.log 2>&1 || exit 1
    chars=$(wc -c < maybe-empty.txt)
    if ((chars < 92)); then
        echo "$image_file is empty, removing" 1>&2
        rm -f -- "$image_file"
    else
        echo "$image_file has $chars characters in it, preserving" 1>&2
    fi
    rm -f maybe-empty.txt
 }

 # Print the statement date found in page1.txt as MM/DD/YY.
 #
 # Looks (with up to 2 errors allowed by tre-agrep) for "Statement Date"
 # followed by a numeric MM/DD/YY or MM/DD/YYYY date, and failing that for an
 # English date such as "Jan 5, 2024", which is converted with date(1).
 # Only the first matching line is used, so several matches cannot produce a
 # multi-line result.
 # Outputs: the date on stdout.
 # Exits with status 1 and a message on stderr when no date is found; callers
 # run this inside $(...), so that exit ends only the substitution subshell.
 # Uses local variables, so a direct call does not overwrite the caller's
 # MMDDYY.
 eastern_bank_statement_date() {
    local numeric_date english_date

    numeric_date="$(tre-agrep -2 'Statement *Date:? *[0-9][0-9]/[0-9][0-9]/([0-9][0-9])?[0-9][0-9]' page1.txt |
        sed -E -n -e 's,.*([0-9][0-9]/[0-9][0-9]/)([0-9][0-9])?([0-9][0-9]).*,\1\3,p' |
        head -n 1)"
    if [ -n "$numeric_date" ]; then
        echo "$numeric_date"
        return
    fi
    english_date="$(tre-agrep -2 'Statement *Date:? *[A-Z][a-z][a-z][- ]*[1-9][0-9]*, *20[0-9][0-9]' page1.txt |
        sed -n -e 's/.*\([A-Z][a-z][a-z]\)[- ]*\([1-9][0-9]*,\)[- ]*\(20[0-9][0-9]\).*/\1 \2 \3/p' |
        head -n 1)"
    if [ -n "$english_date" ]; then
        date --date "$english_date" +%m/%d/%y
        return
    fi
    echo "Failed to detect statement date" 1>&2
    exit 1
 }
    
 # Scan (or, with --retry, reuse) one bill, identify it, and file it as a PDF.
 #
 # Steps: fetch the bill's page images into $TD_WORKING (skipped once when
 # RETRY is true), OCR the first image into page1.txt, pick the provider
 # subdirectory and statement date from that text, convert all images to
 # bill.pdf, and copy it to
 # $HOME/closed/finances/statements/<provider>/<year>/<YYYYMMDD>[-N].pdf.
 #
 # Globals read:    RETRY, STARTED, MULTIPLE, TD_WORKING, DEV_FILE, HOME,
 #                  MMDDYY (a date given with --date overrides detection)
 # Globals written: RETRY (cleared), STARTED (set), MMDDYY (cleared at end)
 # Returns 0 after the bill has been saved; exits the script with status 1 on
 # any failure.
 #
 # main calls this from a loop condition, where bash ignores errexit, so every
 # step that must abort the script is checked explicitly.
 do_one() {
    local SUBDIR DIR FILE_BASENAME MONTH DAY YEAR SUFFIX TF
    local -a pages

    if $RETRY; then
        # Reuse the files already in $TD_WORKING, for this bill only.
        RETRY=false
    else
        rm -rf "${TD_WORKING:?}"
        mkdir "$TD_WORKING" || exit 1
        if ! do_scanadf; then
            if $STARTED; then
                exit 1
            fi
            # Drop the cached scanner device name and try once more.
            rm -f "$DEV_FILE"
            do_scanadf || exit 1
        fi
    fi
    cd "$TD_WORKING" || exit 1

    STARTED=true

    # OCR only the first page; it is what identifies the bill.
    pages=(image*)
    tesseract --psm 6 "${pages[0]}" page1 >| tesseract.log 2>&1 || exit 1

    if tre-agrep -q -s -2 home-loan-account-number page1.txt; then
        if [ ! "$MMDDYY" ]; then
            MMDDYY=$(eastern_bank_statement_date) || exit 1
        fi
        SUBDIR=eastern_bank/home_loan
        check_empty image-0004
    elif tre-agrep -q -s -2 heloc-account-number page1.txt; then
        if [ ! "$MMDDYY" ]; then
            MMDDYY=$(eastern_bank_statement_date) || exit 1
        fi
        SUBDIR=eastern_bank/heloc
        check_empty image-0004
    elif tre-agrep -q -s -2 heat-loan-account-number page1.txt; then
        if [ ! "$MMDDYY" ]; then
            MMDDYY=$(eastern_bank_statement_date) || exit 1
        fi
        SUBDIR=eastern_bank/heat_loan
        check_empty image-0002
        check_empty image-0004
    elif grep -E -q -s -i 'JEWISH COMMUNITY DAY SCHOOL|Afterschool Invoice|jcdsboston|JCDS|57 Stanley' page1.txt
    then
        if [ ! "$MMDDYY" ]; then
            # Use the latest date that appears anywhere on the page.
            MMDDYY=$(perl -e 'use Date::Parse; use POSIX "strftime"; $t = 0; while (<>) { while (m,(\d\d?/\d\d?/\d\d\s*\d\s*\d),g) { ($s = $1) =~ s/\s+//g; $t2 = str2time($s); if ($t2 > $t) { $t = $t2; } } } print(strftime("%D", localtime($t))) if ($t);' < page1.txt)
            if [ -z "$MMDDYY" ]; then
                echo "Failed to detect statement date" 1>&2
                exit 1
            fi
        fi
        SUBDIR=jcds
    elif grep -q -s -i 'boston water' page1.txt; then
        echo "Detected Boston Water statement"
        if [ ! "$MMDDYY" ]; then
            MMDDYY=$(sed -E -n -e 's;.*([0-9][0-9]/[0-9][0-9]/[0-9][0-9]) *previous balance.*;\1;pi' page1.txt)
            if [ -z "$MMDDYY" ]; then
                echo "Failed to detect statement date" 1>&2
                exit 1
            fi
        fi
        SUBDIR=boston_water
    fi

    if [ -n "$MMDDYY" ] && [ -n "$SUBDIR" ]; then
        # The $(expr 0 + ...) trick is to remove leading zeroes from month and day
        # numbers so printf won't treat them like octal numbers.
        MONTH=$(printf '%02d' "$(expr 0 + "$(expr "$MMDDYY" : '\(..\?\)/..\?/..')")")
        DAY=$(printf '%02d' "$(expr 0 + "$(expr "$MMDDYY" : '..\?/\(..\?\)/..')")")
        YEAR=20$(expr "$MMDDYY" : '..\?/..\?/\(..\)')
        DIR=$HOME/closed/finances/statements/$SUBDIR/$YEAR
        FILE_BASENAME=$YEAR$MONTH$DAY
    fi

    convert image* bill.pdf >| convert.log 2>&1 || exit 1

    if [ -z "$DIR" ] || [ -z "$FILE_BASENAME" ]; then
        echo Failed to determine bill type 1>&2
        exit 1
    fi

    # Find a free target name: <date>.pdf, then <date>-2.pdf, <date>-3.pdf, ...
    # (the suffixed names are only tried when --multiple was given).
    SUFFIX=""
    while true; do
        TF="$DIR/$FILE_BASENAME$SUFFIX.pdf"
        [ -f "$TF" ] || break
        if ! $MULTIPLE; then
            echo "$TF already exists, aborting." 1>&2
            exit 1
        fi
        if [ ! "$SUFFIX" ]; then
            SUFFIX=-2
        else
            SUFFIX=-$((${SUFFIX#*-}+1))
        fi
    done

    mkdir -p "$DIR" || exit 1
    cp -i bill.pdf "$TF" || exit 1
    echo Saved as "$TF"

    # If specified on command, only applies to first bill.
    MMDDYY=""
 }

 # Entry point: process one bill, or bill after bill when --pages-per-bill
 # was given.
 main
	#!/bin/bash -e
	#
	# Scan a bill with scanadf, OCR its first page with tesseract to identify the
	# provider and statement date, and file the result as a PDF under
	# $HOME/closed/finances/statements.

	# Turn on errexit explicitly too: the "-e" on the interpreter line above is
	# not applied when the script is started as "bash scan-bill.sh".
	set -e

	# Name the script was invoked as; used in the usage text and temp paths.
	WHOAMI=$(basename -- "$0")
	# Caches the scanner device name so later runs can skip "scanimage -L".
	DEV_FILE=$HOME/.scan-bill.dev
	# Option defaults; the option-parsing loop below may override them.
	RETRY=false
	MULTIPLE=false
	# Possible values for scanadf's --source option.
	DUPLEX_SOURCE='Automatic Document Feeder(centrally aligned,Duplex)'
	SIMPLEX_SOURCE='Automatic Document Feeder(centrally aligned)'
	SOURCE="$DUPLEX_SOURCE"
	# Set to true by do_one once it has a bill's pages to work on.
	STARTED=false
	USAGE="Usage: $WHOAMI [--retry] [--date M/D/Y] [--multiple] [--simplex]
	[--pages-per-bill #]"

	# Parse the command line. The loop runs on the argument count so that an
	# empty-string argument is reported as unrecognized instead of silently
	# ending option parsing.
	while [ $# -gt 0 ]; do
	    case "$1" in
	        -h|--help) echo "$USAGE"; exit ;;
	        # Try to process the files in /tmp/scan-bill again. Useful if something
	        # failed and then you fixed the script to account for the failure and
	        # don't want to rescan the document.
	        --retry) RETRY=true; shift ;;
	        # Specify the bill date for bill being processed. If processing
	        # multiple bills (see --pages-per-bill below), only specifies the bill
	        # date for the first one.
	        --date)
	            if [ $# -lt 2 ]; then
	                echo "$1 requires an argument" 1>&2
	                exit 1
	            fi
	            # Normalize whatever date(1) accepts to MM/DD/YY.
	            MMDDYY="$(date +%D --date "$2")" || exit 1
	            shift 2 ;;
	        # Allow multiple bills for the same date for the same service provider.
	        # Adds "-2", "-3", etc. suffixes to the file names of bills after the
	        # first one.
	        --multiple) MULTIPLE=true; shift ;;
	        # Tells the scanner to only scan the fronts of pages.
	        --simplex) SOURCE="$SIMPLEX_SOURCE"; shift ;;
	        # Indicates that multiple bills are being scanned, and that each bill
	        # has the specified number of pages. Note that this is *sides of a
	        # page*, so e.g. if you're scanning duplex bills that have only one
	        # sheet then the correct number to specify here is 2.
	        # When you specify this, then you can scan a stack of bills -- all of
	        # which are the same in terms of simplex/duplex and length -- and the
	        # script processes them all sequentially. If one of them fails the
	        # script aborts with its files in place, and then you can fix the
	        # problem and rerun the script with --retry and it'll pick up where it
	        # left off.
	        --pages-per-bill)
	            # The value is later handed to the page-selection code as a
	            # count, so reject anything but a positive integer up front
	            # (this also catches a missing value).
	            case "$2" in
	                ''|*[!0-9]*|0)
	                    echo "$1 requires a positive integer argument" 1>&2
	                    exit 1 ;;
	            esac
	            PAGES_PER_BILL="$2"
	            shift 2 ;;
	        *) echo "Unrecognized argument: $1" 1>&2; exit 1 ;;
	    esac
	done

	# Directory where one bill's pages are processed, and (with an "-images"
	# suffix) the directory scanadf writes freshly scanned pages into.
	TD_WORKING="/tmp/${WHOAMI}"
	TD_SCANNING="${TD_WORKING}-images"

	# Process bills one after another. Stops as soon as do_one fails, or after
	# the first bill when no --pages-per-bill value is in effect.
	main() {
	    while true; do
	        do_one || break
	        [ -n "$PAGES_PER_BILL" ] || break
	    done
	}

	# Make page images available and move the next bill's pages to $TD_WORKING.
	#
	# If $TD_SCANNING already holds image* files they are used as-is; otherwise
	# the document feeder is scanned with scanadf. The scanner device name is
	# read from $DEV_FILE when that file exists, else discovered with
	# "scanimage -L" and saved there.
	#
	# Globals read: TD_SCANNING, TD_WORKING, DEV_FILE, SOURCE, STARTED,
	#               PAGES_PER_BILL
	# Returns: 0 once pages have been moved; 1 if no images are left and a bill
	#          was already processed ($STARTED), or if scanadf failed.
	# Exits the script (status 1) if the directories cannot be used, no scanner
	# is found, or the pages cannot be moved.
	# Side effect: leaves the shell's working directory at $TD_SCANNING.
	do_scanadf() {
	    local dev
	    local -a pages

	    mkdir -p "$TD_SCANNING" || exit 1
	    cd "$TD_SCANNING" || exit 1

	    # An unmatched glob stays literal, so test that the first entry exists.
	    pages=(image*)
	    if [ ! -e "${pages[0]}" ]; then
	        if $STARTED; then
	            return 1
	        fi
	        if [ -f "$DEV_FILE" ]; then
	            dev=$(cat "$DEV_FILE")
	        else
	            dev=$(scanimage -L |
	                      sed -E -n -e "s/^device \`(.*)' is a .*/\\1/p" |
	                  head -1)
	            # Quoted: device names may contain spaces.
	            if [ -z "$dev" ]; then
	                echo "Could not find scanner" 1>&2
	                exit 1
	            fi
	            echo "$dev" > "$DEV_FILE"
	        fi

	        if ! scanadf --device "$dev" --mode 'Black & White' \
	             --resolution 300 --source "$SOURCE" -y 279.4 >| \
	                scanadf.log 2>&1; then
	            # Discard a partial scan so the next attempt starts clean.
	            rm -f image*
	            return 1
	        fi
	        pages=(image*)
	    fi
	    if [ -n "$PAGES_PER_BILL" ]; then
	        # Only the first PAGES_PER_BILL images belong to the next bill.
	        mv -- "${pages[@]:0:$PAGES_PER_BILL}" "$TD_WORKING/." || exit 1
	    else
	        mv -- "${pages[@]}" "$TD_WORKING/." || exit 1
	    fi
	}

	# Delete a scanned page image when OCR finds almost no text on it.
	#
	# Arguments: $1 - image file name, relative to the current directory.
	# Runs tesseract on the image and counts the bytes of recognized text; a
	# page yielding fewer than 92 bytes is treated as empty and its image is
	# removed (NOTE(review): the 92-byte threshold is not explained here).
	# A missing image is reported on stderr and otherwise ignored.
	# Exits the script (status 1) if tesseract fails.
	check_empty() {
	    local image_file chars

	    image_file=$1; shift
	    if [ ! -f "$image_file" ]; then
	        echo "check_empty: $image_file does not exist" 1>&2
	        return
	    fi
	    # Remove stale OCR output so the count below reflects this image only.
	    rm -f maybe-empty.txt
	    tesseract --psm 6 "$image_file" maybe-empty >| maybe-empty.log 2>&1 || exit 1
	    chars=$(wc -c < maybe-empty.txt)
	    if ((chars < 92)); then
	        echo "$image_file is empty, removing" 1>&2
	        rm -f -- "$image_file"
	    else
	        echo "$image_file has $chars characters in it, preserving" 1>&2
	    fi
	    rm -f maybe-empty.txt
	}

	# Print the statement date found in page1.txt as MM/DD/YY.
	#
	# Looks (with up to 2 errors allowed by tre-agrep) for "Statement Date"
	# followed by a numeric MM/DD/YY or MM/DD/YYYY date, and failing that for an
	# English date such as "Jan 5, 2024", which is converted with date(1).
	# Only the first matching line is used, so several matches cannot produce a
	# multi-line result.
	# Outputs: the date on stdout.
	# Exits with status 1 and a message on stderr when no date is found; callers
	# run this inside $(...), so that exit ends only the substitution subshell.
	# Uses local variables, so a direct call does not overwrite the caller's
	# MMDDYY.
	eastern_bank_statement_date() {
	    local numeric_date english_date

	    numeric_date="$(tre-agrep -2 'Statement *Date:? *[0-9][0-9]/[0-9][0-9]/([0-9][0-9])?[0-9][0-9]' page1.txt |
	        sed -E -n -e 's,.*([0-9][0-9]/[0-9][0-9]/)([0-9][0-9])?([0-9][0-9]).*,\1\3,p' |
	        head -n 1)"
	    if [ -n "$numeric_date" ]; then
	        echo "$numeric_date"
	        return
	    fi
	    english_date="$(tre-agrep -2 'Statement *Date:? *[A-Z][a-z][a-z][- ]*[1-9][0-9]*, *20[0-9][0-9]' page1.txt |
	        sed -n -e 's/.*\([A-Z][a-z][a-z]\)[- ]*\([1-9][0-9]*,\)[- ]*\(20[0-9][0-9]\).*/\1 \2 \3/p' |
	        head -n 1)"
	    if [ -n "$english_date" ]; then
	        date --date "$english_date" +%m/%d/%y
	        return
	    fi
	    echo "Failed to detect statement date" 1>&2
	    exit 1
	}

	# Scan (or, with --retry, reuse) one bill, identify it, and file it as a PDF.
	#
	# Steps: fetch the bill's page images into $TD_WORKING (skipped once when
	# RETRY is true), OCR the first image into page1.txt, pick the provider
	# subdirectory and statement date from that text, convert all images to
	# bill.pdf, and copy it to
	# $HOME/closed/finances/statements/<provider>/<year>/<YYYYMMDD>[-N].pdf.
	#
	# Globals read:    RETRY, STARTED, MULTIPLE, TD_WORKING, DEV_FILE, HOME,
	#                  MMDDYY (a date given with --date overrides detection)
	# Globals written: RETRY (cleared), STARTED (set), MMDDYY (cleared at end)
	# Returns 0 after the bill has been saved; exits the script with status 1 on
	# any failure.
	#
	# main calls this from a loop condition, where bash ignores errexit, so every
	# step that must abort the script is checked explicitly.
	do_one() {
	    local SUBDIR DIR FILE_BASENAME MONTH DAY YEAR SUFFIX TF
	    local -a pages

	    if $RETRY; then
	        # Reuse the files already in $TD_WORKING, for this bill only.
	        RETRY=false
	    else
	        rm -rf "${TD_WORKING:?}"
	        mkdir "$TD_WORKING" || exit 1
	        if ! do_scanadf; then
	            if $STARTED; then
	                exit 1
	            fi
	            # Drop the cached scanner device name and try once more.
	            rm -f "$DEV_FILE"
	            do_scanadf || exit 1
	        fi
	    fi
	    cd "$TD_WORKING" || exit 1

	    STARTED=true

	    # OCR only the first page; it is what identifies the bill.
	    pages=(image*)
	    tesseract --psm 6 "${pages[0]}" page1 >| tesseract.log 2>&1 || exit 1

	    if tre-agrep -q -s -2 home-loan-account-number page1.txt; then
	        if [ ! "$MMDDYY" ]; then
	            MMDDYY=$(eastern_bank_statement_date) || exit 1
	        fi
	        SUBDIR=eastern_bank/home_loan
	        check_empty image-0004
	    elif tre-agrep -q -s -2 heloc-account-number page1.txt; then
	        if [ ! "$MMDDYY" ]; then
	            MMDDYY=$(eastern_bank_statement_date) || exit 1
	        fi
	        SUBDIR=eastern_bank/heloc
	        check_empty image-0004
	    elif tre-agrep -q -s -2 heat-loan-account-number page1.txt; then
	        if [ ! "$MMDDYY" ]; then
	            MMDDYY=$(eastern_bank_statement_date) || exit 1
	        fi
	        SUBDIR=eastern_bank/heat_loan
	        check_empty image-0002
	        check_empty image-0004
	    elif grep -E -q -s -i 'JEWISH COMMUNITY DAY SCHOOL|Afterschool Invoice|jcdsboston|JCDS|57 Stanley' page1.txt
	    then
	        if [ ! "$MMDDYY" ]; then
	            # Use the latest date that appears anywhere on the page.
	            MMDDYY=$(perl -e 'use Date::Parse; use POSIX "strftime"; $t = 0; while (<>) { while (m,(\d\d?/\d\d?/\d\d\s*\d\s*\d),g) { ($s = $1) =~ s/\s+//g; $t2 = str2time($s); if ($t2 > $t) { $t = $t2; } } } print(strftime("%D", localtime($t))) if ($t);' < page1.txt)
	            if [ -z "$MMDDYY" ]; then
	                echo "Failed to detect statement date" 1>&2
	                exit 1
	            fi
	        fi
	        SUBDIR=jcds
	    elif grep -q -s -i 'boston water' page1.txt; then
	        echo "Detected Boston Water statement"
	        if [ ! "$MMDDYY" ]; then
	            MMDDYY=$(sed -E -n -e 's;.*([0-9][0-9]/[0-9][0-9]/[0-9][0-9]) *previous balance.*;\1;pi' page1.txt)
	            if [ -z "$MMDDYY" ]; then
	                echo "Failed to detect statement date" 1>&2
	                exit 1
	            fi
	        fi
	        SUBDIR=boston_water
	    fi

	    if [ -n "$MMDDYY" ] && [ -n "$SUBDIR" ]; then
	        # The $(expr 0 + ...) trick is to remove leading zeroes from month and day
	        # numbers so printf won't treat them like octal numbers.
	        MONTH=$(printf '%02d' "$(expr 0 + "$(expr "$MMDDYY" : '\(..\?\)/..\?/..')")")
	        DAY=$(printf '%02d' "$(expr 0 + "$(expr "$MMDDYY" : '..\?/\(..\?\)/..')")")
	        YEAR=20$(expr "$MMDDYY" : '..\?/..\?/\(..\)')
	        DIR=$HOME/closed/finances/statements/$SUBDIR/$YEAR
	        FILE_BASENAME=$YEAR$MONTH$DAY
	    fi

	    convert image* bill.pdf >| convert.log 2>&1 || exit 1

	    if [ -z "$DIR" ] || [ -z "$FILE_BASENAME" ]; then
	        echo Failed to determine bill type 1>&2
	        exit 1
	    fi

	    # Find a free target name: <date>.pdf, then <date>-2.pdf, <date>-3.pdf, ...
	    # (the suffixed names are only tried when --multiple was given).
	    SUFFIX=""
	    while true; do
	        TF="$DIR/$FILE_BASENAME$SUFFIX.pdf"
	        [ -f "$TF" ] || break
	        if ! $MULTIPLE; then
	            echo "$TF already exists, aborting." 1>&2
	            exit 1
	        fi
	        if [ ! "$SUFFIX" ]; then
	            SUFFIX=-2
	        else
	            SUFFIX=-$((${SUFFIX#*-}+1))
	        fi
	    done

	    mkdir -p "$DIR" || exit 1
	    cp -i bill.pdf "$TF" || exit 1
	    echo Saved as "$TF"

	    # If specified on command, only applies to first bill.
	    MMDDYY=""
	}

	# Entry point: process one bill, or bill after bill when --pages-per-bill
	# was given.
	main