Created
May 20, 2024 14:38
-
-
Save thomaswitt/c16516270859cc631fefbe71cef62a54 to your computer and use it in GitHub Desktop.
Take invoice scans from a Canon RS40 that land in a Hazel-watched folder, send them to AWS Textract, and rename them by invoice date and company name.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Rename a scanned invoice PDF to "<YYYY-MM-DD>-<Vendor>.pdf" using AWS Textract.
#
# Argument:
#   $1  Path of the PDF to process.
#
# Steps: page 1 of the PDF is base64-encoded and sent to
# `aws textract analyze-expense`; the highest-confidence INVOICE_RECEIPT_DATE
# and VENDOR_NAME summary fields are extracted with jq; the date is normalised
# to ISO format with Ruby (Chronic, falling back to Date.parse); the PDF is
# then moved to its new name in the same directory.
#
# Side files written next to the PDF: "$1-summary.json" and
# "<basename>-error.log". Both are trashed on success.
#
# Exit status: 0 on success, 1 on any failure (a spoken message is emitted
# via `say`).
#
# External tools used: brew, qpdf, base64, aws, jq, ruby (+ chronic gem),
# gsed, say, trash. Requires bash (uses PIPESTATUS).

# Load Homebrew environment
eval "$(/opt/homebrew/bin/brew shellenv)"

# Variables
dir_path=$(dirname "$1")
base_filename=$(basename "$1")
log_file="${dir_path}/${base_filename}-error.log"
summary_file="$1-summary.json"

# Print the text of the highest-confidence summary field whose type is $1,
# or nothing when no such field exists in the summary file.
top_summary_value() {
  jq -rs --arg field "$1" \
    'map(select(.Type.Text == $field)) | sort_by(.Type.Confidence) | reverse | first | .ValueDetection.Text // empty' \
    "$summary_file"
}

# Base64 encode the PDF file (only page 1 is extracted and sent)
base64_string=$(qpdf "$1" --pages . 1 -- - | base64)

# Call AWS Textract and handle errors
aws textract analyze-expense --profile textract --document "Bytes=$base64_string" 2> "$log_file" \
  | jq -r '.ExpenseDocuments[0].SummaryFields[]' > "$summary_file"
# PIPESTATUS must be read immediately: any later command overwrites it.
textract_status=${PIPESTATUS[0]}
if [ "$textract_status" -ne 0 ]; then
  say -v Zoe "AWS Textract failed. Check log file."
  exit 1
fi

# Bail out early if the JSON file is empty
if [ ! -s "$summary_file" ]; then
  echo "Error: JSON file is empty. Textract analysis might have failed." >&2
  say -v Zoe "Empty JSON. Check log file."
  trash "$summary_file"
  exit 1
fi

# Extract the date string (a trailing comma is stripped)
date_str=$(top_summary_value "INVOICE_RECEIPT_DATE" | sed -e 's/,$//')
if [ -z "$date_str" ]; then
  # Record the reason so the spoken "Check log file" hint is actually useful.
  echo "No INVOICE_RECEIPT_DATE value found in Textract summary." >> "$log_file"
  say -v Zoe "Date string empty. Check log file."
  exit 1
fi

# Convert the date to ISO format using Chronic, falling back to Date.parse.
# The raw string is handed to Ruby through the environment instead of being
# spliced into the Ruby source: shell escaping (printf %q) is not valid Ruby
# string escaping and would leave stray backslashes in the value.
formatted_date=$(INVOICE_DATE_STR="$date_str" ruby -r date -r chronic -e '
  date_str = ENV.fetch("INVOICE_DATE_STR", "")
  date = Chronic.parse(date_str)
  # NOTE(review): a result of "tomorrow" is treated as a Chronic misparse and
  # triggers the Date.parse fallback -- presumably a known Chronic quirk; confirm.
  if date && date.to_date != Date.today + 1
    parsed_date = date
  else
    begin
      parsed_date = Date.parse(date_str)
    rescue ArgumentError
      parsed_date = nil
    end
  end
  puts(parsed_date&.strftime("%Y-%m-%d") || "")
' 2>> "$log_file")

# Check if formatted_date was successfully set
if [ -z "$formatted_date" ]; then
  echo "Could not parse invoice date: ${date_str}" >> "$log_file"
  say -v Zoe "Date parse failed. Check log file."
  exit 1
fi

# Extract the vendor name and make it filename-safe: newlines and slashes are
# turned into whitespace first (sed works line by line and would otherwise
# leave embedded newlines in place), whitespace runs collapse to a single "-",
# and leading/trailing dashes are trimmed.
vendor=$(top_summary_value "VENDOR_NAME" | tr '\n/' '  ' | gsed -E 's/\s+/-/g; s/^-+|-+$//g')
[ -n "$vendor" ] && vendor="-${vendor}"

# Rename the file; fall back to a PID-suffixed name if the target exists
final_filename="${dir_path}/${formatted_date}${vendor}.pdf"
if [ -e "$final_filename" ]; then
  final_filename="${dir_path}/${formatted_date}${vendor}-$$.pdf"
fi
if ! mv "$1" "$final_filename" 2>> "$log_file"; then
  # Do not clean up or announce success when the rename did not happen.
  say -v Zoe "Rename failed. Check log file."
  exit 1
fi

# Remove the JSON summary file and log file if everything was successful
trash "$summary_file" "$log_file"
say -v Zoe "Parsing Receipt ${vendor} from ${formatted_date} successful"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment