Created
September 8, 2018 13:05
-
-
Save mzhukovs/c432f9749053debea1b7ae95c601f4fa to your computer and use it in GitHub Desktop.
Python Script to Scrape Transaction Records from United Overseas Bank CASA Statement PDFs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parses UOB CASA eStatements transaction data | |
#! first download tika-server from http://www.apache.org/dyn/closer.cgi/tika/tika-server-1.18.jar and then go to cmd: | |
#* cd C:\Users\YourUserName\Documents\Python Scripts\_util # or wherever you installed it | |
#* java -jar tika-server-1.18.jar --port 1111 | |
#region Imports | |
import time | |
import datetime | |
import pandas as pd | |
import regex as re | |
import glob | |
import tika | |
from tika import parser | |
#endregion | |
#region Settings | |
tika.TikaClientOnly = True | |
tikaServerPath = "http://localhost:2929/" | |
rootDir = "eStatements" | |
subDir = "Uniplus" | |
fileNameStartsWith = "eStatement" | |
amountsPlaceholder = "numstart" | |
#endregion | |
#region UDFs | |
def getDescription(stringy): | |
lines = stringy[0:stringy.find(amountsPlaceholder)].split('\n') | |
note = lines[0] | |
if (note[0:4] == 'Cash'): | |
note = 'Cash' | |
elif (note.find('Cheque Deposit') > -1): | |
note = 'Cheque Deposit' | |
elif (note.find('Interest Credit') > -1): | |
note = 'Interest Credit' | |
if (len(lines) == 1): | |
return (note, '') | |
# remove the note now that we have it captured | |
del lines[0] | |
# only applies if you have USD transactions; append original rate to note description | |
if (lines[0][0:3] == 'USD'): | |
note += ' ' + lines[0] | |
del lines[0] | |
if (bool(re.match(transactionRowPattern, lines[0]))): | |
del lines[0] | |
# get rid of [sometimes trailing] transaction number that sometimes appears on last line (optional) | |
#if (lines[-1].isdigit()): | |
# del lines[-1] | |
recipient = ''.join(lines) | |
return (note, recipient) | |
def convertToFloat(stringy): | |
try: | |
floatie = float(stringy[stringy.find(amountsPlaceholder)+8:].split(' ')[-1].replace(',', '')) | |
except ValueError: | |
return None | |
else: | |
return floatie | |
#endregion | |
# pull in PDF files | |
pdf_files = glob.glob(f"{rootDir}/{subDir}/{fileNameStartsWith}*.pdf") | |
# declare patterns used for regex splitting/matching | |
transactionRowPDelimiters = " \n\n|SGD \n\n" # all transaction rows will be split by 2 spaces followed by 2 newlines EXCEPT the FIRST ones on a page will only have 1 space and be preceded by SGD | |
transactionRowPattern = re.compile("\\d{2}\\s\\w{3}\\s") #e.g. 13 Mar | |
statementYearPattern = re.compile("\nPeriod: .*\n{2}") #Period: 01 Mar 2015 to 31 Mar 2015 | |
# initialize empty list to house our final results | |
transactionList = [] | |
for pdf_file in pdf_files: | |
raw = parser.from_file(pdf_file, tikaServerPath) | |
text = raw['content'] #.encode("utf-8") | |
year = re.findall(statementYearPattern, text)[0].replace('\n\n', '')[-4:] #! note that match only checks as of the BEGINNING of the string, so need to use search or findall | |
transactions = pd.DataFrame(re.split(transactionRowPDelimiters, text)) | |
transactions.columns = ['rawRow'] | |
transactions['Year'] = year | |
# filter on actual transaction rows | |
transactions = transactions[transactions.rawRow.apply(lambda x: bool(re.match(transactionRowPattern, x)))] | |
#region Parse Transaction Row Data into Fields | |
transactions['MonthDay'] = transactions.rawRow.apply(lambda x: x[0:6]) | |
transactions['Date'] = transactions.MonthDay + ' ' + transactions.Year | |
transactions['rawRow'] = transactions.rawRow.apply(lambda x: x[7:].replace('\n\n', amountsPlaceholder)) | |
transactions['Description'] = transactions.rawRow.apply(lambda x: getDescription(x)) | |
transactions['Note'] = transactions.Description.apply(lambda x: x[0]) | |
transactions['Payee'] = transactions.Description.apply(lambda x: x[1]) | |
transactions['Balance'] = transactions.rawRow.apply(lambda x: convertToFloat(x)) | |
transactions.reset_index(drop=True, inplace=True) | |
statementBegBalance = transactions.iloc[0].Balance # first row is starting balance | |
transactions.drop(transactions.index[0], inplace=True) # drop it after capturing, note NOT re-setting the index so now it starts at 1 | |
# calculate transaction amount | |
transactions['Amount'] = 0.00 | |
transactions['Amount'][1] = transactions['Balance'][1] - statementBegBalance | |
for i in range(2, len(transactions) + 1): | |
transactions.Amount[i] = transactions.Balance[i] - transactions.Balance[i-1] | |
# drop stuff we don't need in the output | |
del transactions['rawRow'] | |
del transactions['Year'] | |
del transactions['MonthDay'] | |
del transactions['Description'] | |
# append to master list | |
transactionList.append(transactions) | |
#endregion | |
#region Output to CSV | |
transactions = pd.concat(transactionList) | |
transactions['Date'] = pd.to_datetime(transactions.Date) | |
transactions = transactions.reset_index(drop=True) | |
transactions['Indexx'] = transactions.index | |
transactions.sort_values(by=['Date', 'Indexx'], inplace=True) | |
del transactions['Indexx'] | |
ts = time.time() | |
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d-%H.%M.%S') | |
transactions.to_csv('output' + st + '.csv', index=False) | |
print('Success!') | |
#endregion |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment