Last active
October 23, 2024 15:26
-
-
Save gccollect/cda80827e2dd1181dc02850d1c899a84 to your computer and use it in GitHub Desktop.
Convert Plutus pdf statement to csv file with running balance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
import pandas as pd | |
import pdfplumber | |
re_transaction = re.compile( | |
r'(Transaction|Card Transaction|Service Fee|Card Transfer)[\n\s](\d{4}/\d{2}/\d{2}, \d{2}:\d{2})\s+(-?\s?.\d*.?\d{0,2})\n?(.*)') | |
re_deposit = re.compile(r'(Card Deposit|Deposit)[\n\s](\d{4}/\d{2}/\d{2}, \d{2}:\d{2})\s+(.\d*.?\d{0,2})()') | |
re_page = re.compile(r'\n\d{1,2}/\d{1,2}/\d{4} \d+ / \d+') | |
def clean_row(row): | |
return [cell for cell in row if cell is not None and cell != ''] | |
def parse(pdf_file_loc, csv_file_loc): | |
with pdfplumber.open(pdf_file_loc) as pdf: | |
content = '\n'.join([page.extract_text_simple() for page in pdf.pages]).replace('\xa0', ' ') | |
columns = ['Type', 'Date', 'Amount', 'Transaction'] | |
content = re_page.sub('', content) | |
deposits = re_deposit.findall(content) | |
transactions = re_transaction.findall(content) | |
df = pd.DataFrame(deposits + transactions, columns=columns) | |
df['Amount'] = df['Amount'].replace(r'^(-?)\s?.(\d*\.?\d{0,2})$', value=r"\1\2", regex=True).astype(float) | |
df['Date'] = pd.to_datetime(df['Date'], format='%Y/%d/%m, %H:%M') | |
df = df.set_index('Date').sort_index() | |
df['Balance'] = df['Amount'].cumsum() | |
print(df) | |
df.to_csv(csv_file_loc, float_format="%.2f") | |
if __name__ == '__main__': | |
args = sys.argv[1:] | |
parse(*args) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment