Created
September 13, 2023 18:43
-
-
Save jtmoon79/3689ec39fbd804f4683c226d5ee74a67 to your computer and use it in GitHub Desktop.
Rename PDF files downloaded from paperlessemployee.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# rename pdf files from paperlessemployee.com
#
# Arguments:
#   $1 - path to the downloaded pay statement; must be readable and end
#        with '.pdf'
#   $2 - optional agency name, used as a prefix of the new file name
#
# Exits non-zero on incorrect usage or when the file checks fail.

set -euo pipefail

if (( ${#} < 1 || ${#} > 2 )); then
    # incorrect invocation is an error: print the usage on stderr and exit
    # with a failing status so that calling scripts can detect the misuse
    echo "usage:
$(basename "${0}") SelectedPayStatements.pdf [Agency Name]
about:
rename pay statements PDF files from www.paperlessemployee.com from default name 'SelectedPayStatements.pdf'
to 'My Agency Pay Statement YYYY-MM-DD to YYYY-MM-DD (X USD).pdf'
e.g.
'Super Talent Pay Statement 2023-01-21 to 2023-01-28 (total 1,234.00 USD) (final 1,000.00 USD).pdf'
Presumes the pay statement is only one pay statement and not multiple pay statements
within one PDF file.
" >&2
    exit 1
fi

pdf_file=${1-}
agency_name=${2-}

# the file must be readable because its text is extracted later on
if [[ ! -r "${pdf_file}" ]]; then
    echo "ERROR Cannot read file '${pdf_file}'" >&2
    exit 1
fi

# compare the last four characters of the name against the expected extension
if [[ "${pdf_file:(-4)}" != '.pdf' ]]; then
    echo "ERROR must pass a file ending with .pdf, given '${pdf_file}'" >&2
    exit 1
fi
function remove_leading_0 () {
    # Print $1 with all leading '0' characters removed (no trailing newline).
    # A string made only of '0' characters prints as the empty string.
    #
    # printf is used instead of 'echo -n' so that an argument that looks
    # like an echo option (e.g. '-n' or '-e') is passed through as data.
    printf '%s' "${1}" | sed -E -e 's/^0+//'
}
function is_positive_number () {
    # Is the passed string a decimal integer greater than 0?
    #
    # Arguments: $1 - text to check (e.g. '2023', '07')
    # Returns:   0 when $1 consists only of digits and its value is > 0,
    #            1 otherwise (empty, non-numeric, or zero)
    #
    # The text is validated before any arithmetic so that arbitrary text is
    # never evaluated as an arithmetic expression.
    local digits=${1-}
    [[ "${digits}" =~ ^[0-9]+$ ]] || return 1
    # the '10#' prefix forces base 10; without it a leading '0' would mean
    # octal and values such as '08' would be rejected as invalid
    (( 10#${digits} > 0 ))
}
function trim_whitespace () {
    # Delete whitespace at the start and at the end of each line of the
    # piped text (stdin -> stdout). Whitespace inside a line is kept.
    #
    # Two anchored substitutions are used so that a line with whitespace on
    # only one side is also trimmed and interior whitespace is never touched.
    sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
}
function remove_blank_lines () {
    # Filter stdin to stdout, dropping every line that is empty or holds
    # nothing but whitespace.
    #
    # Auto-printing is off (-n); only lines that do NOT match the
    # "blank line" pattern are printed.
    sed -n -E -e '/^[[:space:]]*$/!p'
}
function print_line () {
    # Emit only line number $1 of the text read from stdin.
    # Nothing is printed when the input has fewer lines than that.
    local -r line_address="${1}"
    sed -n -e "${line_address}p"
}
# pdftotext is required for the text extraction; stop early with a hint when
# it is missing. 'command -v' is a shell builtin, so the check does not
# depend on the external 'which' program being installed.
if ! command -v pdftotext &>/dev/null; then
    echo "ERROR cannot find 'pdftotext' in the PATH" >&2
    echo " pdftotext is part of apt package 'poppler-utils'" >&2
    exit 1
fi

# extract the text of the PDF to stdout ('-') and drop the blank lines so
# that later line-offset lookups see only lines with content
text=$(pdftotext "${pdf_file}" -)
# printf passes the text through unchanged, whatever its first characters are
text=$(printf '%s' "${text}" | remove_blank_lines)

# the optional agency name is a prefix of the new name; the separating
# space is only added when an agency name was passed
agency_name_append=' '
if [[ -z "${agency_name}" ]]; then
    agency_name_append=''
fi
new_name="${agency_name}${agency_name_append}Pay Statement"
# find the unique string before the date text
# Example text around the week dates:
#
# Pay Period Begin:
# 07/30/2023
# Pay Period End:
# 08/05/2023
#
if ! text_dates=$(printf '%s\n' "${text}" | grep -A3 -Ee '^Pay Period Begin:'); then
    echo "ERROR failed to find unique string 'Pay Period Begin' in '${pdf_file}'" >&2
    exit 1
fi
# printf prints the extracted text verbatim (no backslash interpretation)
printf 'text_dates:\n%s\n' "${text_dates}"
echo

# line 2 of the match is the begin date, line 4 is the end date
text_date_start=$(printf '%s' "${text_dates}" | print_line 2 | trim_whitespace)
text_date_end=$(printf '%s' "${text_dates}" | print_line 4 | trim_whitespace)
echo "text_date_start: '${text_date_start}'"
echo "text_date_end: '${text_date_end}'"

#
# date text 07/30/2023
# index     0123456789
#
# re-arrange from 'MM/DD/YYYY' to 'YYYY-MM-DD'
#
# The date parts are passed quoted to is_positive_number: a date string that
# is too short yields empty parts, and a quoted empty argument is reported
# with the error messages below instead of tripping 'set -u' inside the
# function.
#
ds_y=${text_date_start:6:4} # Date Start Year
ds_m=${text_date_start:0:2} # Date Start Month
ds_d=${text_date_start:3:2} # Date Start Day
is_positive_number "${ds_y}" || { echo "ERROR: bad year '${ds_y}' in Date Start string '${text_date_start}'" >&2; exit 1; }
is_positive_number "${ds_m}" || { echo "ERROR: bad month '${ds_m}' in Date Start string '${text_date_start}'" >&2; exit 1; }
is_positive_number "${ds_d}" || { echo "ERROR: bad day '${ds_d}' in Date Start string '${text_date_start}'" >&2; exit 1; }

de_y=${text_date_end:6:4} # Date End Year
de_m=${text_date_end:0:2} # Date End Month
de_d=${text_date_end:3:2} # Date End Day
is_positive_number "${de_y}" || { echo "ERROR: bad year '${de_y}' in Date End string '${text_date_end}'" >&2; exit 1; }
is_positive_number "${de_m}" || { echo "ERROR: bad month '${de_m}' in Date End string '${text_date_end}'" >&2; exit 1; }
is_positive_number "${de_d}" || { echo "ERROR: bad day '${de_d}' in Date End string '${text_date_end}'" >&2; exit 1; }

new_name+=" ${ds_y}-${ds_m}-${ds_d} to ${de_y}-${de_m}-${de_d}"
function is_positive_amount () {
    # Is the passed money text (e.g. '1,234.00') an amount greater than zero?
    #
    # Arguments: $1 - amount text; digits with optional ',' separators and
    #                 an optional '.' fraction
    # Returns:   0 when the text has that shape and its value is > 0,
    #            1 otherwise (empty text, other text, or a zero amount)
    local amount=${1-}
    local -r amount_re='^[0-9][0-9,]*([.][0-9]+)?$'
    [[ "${amount}" =~ ${amount_re} ]] || return 1
    # Drop the separators and compare the remaining digits as one base-10
    # integer. Evaluating the raw text arithmetically would treat ',' as the
    # comma operator and a leading '0' as octal, e.g. '1,000' would be 0.
    local digits=${amount//[.,]/}
    (( 10#${digits} > 0 ))
}

# find the unique strings that describe pay
# Example of such text:
#
# TOTAL
# TAXES
# TOTAL
# DEDUCTIONS
# NET PAY
# Current:
# 1,234.00
# 123.45
# 12.34
# 1,234.12
# 1,234.12
# 1,234.12
# YTD:
# 9,999.99
# 999.99
# 99.99
# 9,900.00
# 9,000.00
# 9,000.00
# 0.00
# 111.11
# 222.22
# 3,333.33
# 0.00
# 1,444.44
# 333.33
# 3,210.00
# LEAVE BALANCE
#
if text_pay_raw=$(printf '%s\n' "${text}" | grep -A12 -Ee '^NET PAY'); then
    # pay total current period: the third line of the match, i.e. the first
    # amount listed after 'Current:'
    if text_pay=$(printf '%s' "${text_pay_raw}" | print_line 3 | trim_whitespace); then
        if is_positive_amount "${text_pay}"; then
            echo "text_pay (total): '${text_pay}'"
            new_name+=" (total ${text_pay} USD)"
        else
            echo "Failed to convert text_pay total" >&2
        fi
    else
        echo "Failed to extract text_pay total from text_pay_raw" >&2
    fi
    # pay final current period (after taxes amount): the line directly
    # before 'YTD:'
    if text_pay=$(printf '%s' "${text_pay_raw}" | grep -B1 -Ee '^YTD:' | print_line 1 | trim_whitespace); then
        if is_positive_amount "${text_pay}"; then
            echo "text_pay (final): '${text_pay}'"
            new_name+=" (final ${text_pay} USD)"
        else
            echo "Failed to convert text_pay final" >&2
        fi
    else
        echo "Failed to extract text_pay final from text_pay_raw" >&2
    fi
else
    echo "Failed to find NET PAY" >&2
fi
# Locate the hours worked in the current period.
# The relevant part of the text looks like this:
#
# Current Hours
# Current Amt
# YTD Hours
# YTD Amt
# 40.00
# 1,234.00
# 123.00
# 99,999.00
# 40.00
# 123.00
# 99,999.00
# EARNINGS
#
# The match starts at 'Current Hours'; the fifth line of the match is the
# first value below the four column headings.
if text_hours_raw=$(grep -A5 -Ee '^Current Hours' <<< "${text}"); then
    text_hours=$(print_line 5 <<< "${text_hours_raw}" | trim_whitespace)
    if [[ -z "${text_hours}" ]]; then
        echo "Failed to extract text_hours from text_hours_raw" >&2
    else
        echo "text_hours : '${text_hours}'"
        new_name+=" (${text_hours} hours)"
    fi
fi
# complete the new file name and rename the file within its own directory
new_name+=".pdf"
echo "new_name: '${new_name}'"
echo
pdf_path=$(dirname -- "${pdf_file}")
new_path="${pdf_path}/${new_name}"

# never silently replace a file that already has the new name
if [[ -e "${new_path}" ]]; then
    echo "ERROR refusing to overwrite existing file '${new_path}'" >&2
    exit 1
fi

# trace the final command so the exact rename is visible to the user
set -x
# '--' ends option parsing so a file name starting with '-' is not taken
# as an option of mv
mv -v -- "${pdf_file}" "${new_path}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment