Last active
December 10, 2023 00:53
-
-
Save bencharb/13249225206dd2b2979942d98efe87cd to your computer and use it in GitHub Desktop.
parse bank of america pdf statements
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Transform Bank of America PDF text into tables. | |
Example: | |
python parse_boa_statements.py test/joint_dump.txt test/pers_dump.txt --output test/combined.csv | |
Example of original PDF text (copy/pasted from PDF): | |
MISTER NAME | Account # 999 99999 9999 | November 24, 2015 to December 24, 2015 | |
Your checking account | |
Page 3 of 4 | |
Deposits and other additions | |
Date Description Amount | |
11/30/15 Online Banking transfer from CHK 9999 Confirmation# 0986430911 200.00 | |
12/07/15 MISTEREMPLOYER, DES:PAYROLL ID:00099999000X INDN:MISTER, NAME | |
ID:000299229294 PPD | |
918.57 | |
12/22/15 MISTEREMPLOYER, DES:PAYROLL ID: 00099999000X INDN:MISTER, NAME | |
ID:000299229294 PPD | |
918.57 | |
Total deposits and other additions $2,037.14 | |
Withdrawals and other subtractions | |
Date Description Amount | |
11/30/15 BKOFAMERICA ATM 11/28 #0000000232 WITHDRWL THE MALL ACITY ST -120.00 | |
12/07/15 Online Banking transfer to CHK 9999 Confirmation# 09810423895 -500.00 | |
Total withdrawals and other subtractions -$3,120.00 | |
To help you BALANCE YOUR CHECKING ACCOUNT, visit bankofamerica.com/statementbalance or the Statements and Documents tab | |
in Online Banking for a printable version of the How to Balance Your Account Worksheet. | |
Checks | |
Date Check # Amount Date Check # Amount | |
10/14/14 833 -183.00 10/24/14 835 -61.00 | |
10/09/14 833 -9.95 | |
Total checks -$253.95 | |
Total # of checks 3 | |
Note your Ending Balance already reflects the subtraction of Service Fees. | |
To help you BALANCE YOUR CHECKING ACCOUNT, visit bankofamerica.com/statementbalance or the Statements and Documents tab | |
in Online Banking for a printable version of the How to Balance Your Account Worksheet. | |
Example output: | |
Date,Description,Amount,Account | |
11/30/15 ,Online Banking transfer from CHK 9999 Confirmation# 0986430911,200.00,testaccount | |
11/30/15 ,BKOFAMERICA ATM 11/28 #0000000232 WITHDRWL THE MALL ACITY ST,-120.00,testaccount | |
10/14/14 ,833,-183.0,testaccount | |
10/24/14 ,835,-61.0,testaccount' | |
''' | |
import re | |
import csv | |
import shutil | |
# Meh, cant test now, hard to assert equal. | |
# def test_transform(): | |
# test_data = '''MISTER NAME | Account # 999 99999 9999 | November 24, 2015 to December 24, 2015 | |
# Your checking account | |
# Page 3 of 4 | |
# Deposits and other additions | |
# Date Description Amount | |
# 11/30/15 Online Banking transfer from CHK 9999 Confirmation# 0986430911 200.00 | |
# 12/07/15 MISTEREMPLOYER, DES:PAYROLL ID:00099999000X INDN:MISTER, NAME | |
# ID:000299229294 PPD | |
# 918.57 | |
# 12/22/15 MISTEREMPLOYER, DES:PAYROLL ID: 00099999000X INDN:MISTER, NAME | |
# ID:000299229294 PPD | |
# 918.57 | |
# Total deposits and other additions $2,037.14 | |
# Withdrawals and other subtractions | |
# Date Description Amount | |
# 11/30/15 BKOFAMERICA ATM 11/28 #0000000232 WITHDRWL THE MALL ACITY ST -120.00 | |
# 12/07/15 Online Banking transfer to CHK 9999 Confirmation# 09810423895 -500.00 | |
# Total withdrawals and other subtractions -$3,120.00 | |
# To help you BALANCE YOUR CHECKING ACCOUNT, visit bankofamerica.com/statementbalance or the Statements and Documents tab | |
# in Online Banking for a printable version of the How to Balance Your Account Worksheet. | |
# Checks | |
# Date Check # Amount Date Check # Amount | |
# 10/14/14 833 -183.00 10/24/14 835 -61.00 | |
# 10/09/14 833 -9.95 | |
# Total checks -$253.95 | |
# Total # of checks 3 | |
# Note your Ending Balance already reflects the subtraction of Service Fees. | |
# To help you BALANCE YOUR CHECKING ACCOUNT, visit bankofamerica.com/statementbalance or the Statements and Documents tab | |
# in Online Banking for a printable version of the How to Balance Your Account Worksheet.''' | |
# expected = '''Date,Description,Amount,Account | |
# 11/30/15 ,Online Banking transfer from CHK 9999 Confirmation# 0986430911,200.00,testaccount | |
# 11/30/15 ,BKOFAMERICA ATM 11/28 #0000000232 WITHDRWL THE MALL ALEXANDRIA VA,-120.00,testaccount | |
# 10/14/14 ,833,-183.0,testaccount | |
# 10/24/14 ,835,-61.0,testaccount''' | |
# # import tempfile | |
# # tmp_path = os.path.join(tempfile.mkdtemp(),'testaccount') | |
# # tmp_path_output = tmp_path+'_final_output.csv' | |
# # tmp_path_output_expected = tmp_path+'_final_output_expected.csv' | |
# # with open(tmp_path, 'w') as fout: | |
# # fout.write(test_data) | |
# # columns = ('Date', 'Description', 'Amount', 'Account',) | |
# # rowcounts = main([tmp_path], tmp_path_output, columns=columns) | |
# # with open(tmp_path_output_expected, 'w') as fout: | |
# # writer = csv.DictWriter(fout, columns) | |
# # writer.writeheader() | |
# # writer.writerows(expected.split('\n')) | |
# # with open(tmp_path_output, 'r') as fin: | |
# # orig_reader = csv.reader(fin) | |
# # orig_data = [l for l in orig_reader] | |
# # with open(tmp_path_output_expected, 'r') as fin: | |
# # expected_reader = csv.reader(fin) | |
# # expected_data = [l for l in expected_reader] | |
# # print 'expected_data', expected_data | |
# # print 'orig_data', orig_data | |
# # assert expected_data == orig_data | |
# # # try: | |
# # # except AssertionError: | |
# # # print 'tmp_path_output_expected', tmp_path_output_expected | |
# # # print 'tmp_path_output', tmp_path_output | |
# # # raise | |
def is_true(val): | |
return unicode(val).lower() in ['true', 'yes'] | |
def validate_line(l): | |
return unicode(l[0]).isnumeric() and '/' in l.split()[0] | |
def validate_header_line(l): | |
ok_len = len(l) < len('Date Check # Amount Date Check # Amount') + 10 | |
valid = ok_len and l.startswith('Date') | |
return valid | |
def validate_data(data): | |
if isinstance(data, basestring): | |
working_data = data.split('\n') | |
else: | |
working_data = data | |
invalids = [] | |
for line in working_data: | |
if not validate_line(line): | |
if not validate_header_line(line): | |
invalids.append(line) | |
if invalids: | |
raise Exception('Invalid lines %s' % '\n'.join(invalids)) | |
return True | |
def line_to_dict(line): | |
parts = re.findall(r'(\d+/\d+/\d+)\s(.*)\s(.*\d+)$', line) | |
if parts: | |
date, des, amt = parts[0] | |
return {'Date':date, 'Description':des, 'Amount':amt} | |
boa_rx = re.compile('\d\d\/\d\d\/\d\d(?:.(?!\nTotal)(?!\d\d\/\d\d\/\d\d)(?!\n\d\d\/\d\d\/\d\d)|\n)+\.\d\d', re.IGNORECASE) | |
def parse_boa_dump(stringdata): | |
records = boa_rx.findall(stringdata) | |
records = [r.replace('\n', ' ') for r in records] | |
return records | |
class TransformedStatement(object): | |
def __init__(self, dump_path, account='unknown'): | |
super(TransformedStatement, self).__init__() | |
self.dump_path = dump_path | |
self.account = account | |
def write(self, out_path, columns, mode='a'): | |
fullpath = os.path.abspath(out_path) | |
with open(out_path, mode) as fout: | |
writer = csv.DictWriter(fout, columns) | |
with open(self.dump_path, 'r') as fin: | |
data = [] | |
def add_account(items, account_name): | |
for d in items: | |
d['Account'] = account_name | |
# remove line endings | |
file_data = fin.read() | |
parsed_lines = parse_boa_dump(file_data) | |
joined_dict = map(line_to_dict, parsed_lines) | |
add_account(joined_dict, self.account) | |
writer.writerows(joined_dict) | |
return len(joined_dict) | |
def transform_dump(transformed_statements, | |
columns=None, | |
out_path='transformed_statements.csv', | |
override_existing_file=False): | |
out_path = os.path.abspath(out_path) | |
exists = os.path.exists(out_path) | |
if not override_existing_file and exists: | |
raise Exception('File exists: %s' % out_path) | |
if isinstance(transformed_statements, TransformedStatement): | |
transformed_statements = [transformed_statements] | |
else: | |
for ts in transformed_statements: | |
if not isinstance(ts, TransformedStatement): | |
raise TypeError('%s not of type TransformedStatement' % ts) | |
out_path_original = out_path | |
out_path_temp = out_path+'.temp' | |
with open(out_path_temp, 'w') as fout: | |
writer = csv.DictWriter(fout, columns) | |
writer.writeheader() | |
rowcounts = [] | |
for stmt in transformed_statements: | |
rowcounts.append(stmt.write(out_path_temp, columns, mode='a')) | |
shutil.move(out_path_temp, out_path) | |
return rowcounts | |
def main(files, output_file, columns=None, override_existing_file=True): | |
basenames = [os.path.basename(os.path.splitext(f)[0]) for f in files] | |
source_paths = zip(files,basenames) | |
stmts = [TransformedStatement(path, account=act) for path, act in source_paths] | |
rowcounts = transform_dump(stmts, columns=columns, out_path=output_file, override_existing_file=override_existing_file) | |
return rowcounts | |
if __name__ == '__main__': | |
import sys | |
import argparse | |
import os | |
description='Parse copy/pasted text from BofA statements. \nPrerequisites: \n(1) Download BofA pdfs. \n(2) For each PDF, keep only pages with tables on them; trash all other pages without tables. \n(3) For each PDF, copy/paste PDF text content from each statement into one text file. Just Cmd+a/Ctl+a > copy and paste into a file for the account name, such as "checking6443.txt"' | |
description+='\nExample: \npython parse_boa_statements.py test/joint_dump.txt test/pers_dump.txt --output test/combined.csv\n\n' | |
import textwrap | |
description = '\n'.join(['\n'.join(textwrap.wrap(line, 100)) for line in description.split('\n')]) | |
parser = argparse.ArgumentParser(description) | |
parser.add_argument('file', nargs='*', help='Separate multiple files with a comma. Paths to the files you created for each account. Each file should contain content for only one account. Generate the content for each file by copy/pasting the text from the PDFs into one file. Detailed description elsewhere in this help file.') | |
parser.add_argument('--output', help='Output path', default='combined_statements.csv') | |
parser.add_argument('--override_output', help='Override existing output file if it exists', action='store_true', default=True) | |
if len(sys.argv)==1: | |
parser.print_help() | |
sys.exit(1) | |
args = parser.parse_args() | |
# files = args.files.split(',') | |
files = args.file #.split(',') | |
out_path = args.output | |
override_existing_file = is_true(args.override_output) | |
columns = ('Date', 'Description', 'Amount', 'Account',) | |
rowcounts = main(files, out_path, columns=columns, override_existing_file=override_existing_file) | |
if not rowcounts: | |
message = 'No rows transformed to file %s' % out_path | |
else: | |
message = '%d rows from %d files transformed to file %s' % (sum(rowcounts),len(rowcounts), out_path,) | |
print message |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment