Created
November 15, 2024 23:14
-
-
Save garyo/8674239bc61df315b65befb3d7c6f552 to your computer and use it in GitHub Desktop.
Convert PDF statements from Vanguard into csv files suitable for uploading balance info to Monarch Money
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# requires-python = ">=3.11" | |
# dependencies = [ | |
# "pandas", | |
# "pdfplumber", | |
# ] | |
# /// | |
import pdfplumber | |
import pandas as pd | |
import os | |
import re | |
from datetime import datetime | |
import sys | |
# Use the last 4 digits of the Vanguard account on the left side of the mapping, | |
# and fill in the exact Monarch account names on the right side. | |
AccountNames = { | |
"0419": "Vanguard (...bxvb)", | |
"1111": "Vanguard (...1a8df", | |
"3222": "Vanguard (...3faf)", | |
"4333": "Vanguard (...4asf)", | |
"4444": "Vanguard (...4asd)", | |
"9555": "Vanguard (...9dfas)" | |
} | |
def extract_statement_data(pdf_path): | |
""" | |
Extract account value, account number, and date from a Vanguard statement PDF. | |
Returns tuple of (account_value, account_number, statement_date) | |
""" | |
value = 0 | |
account_number = "" | |
statement_date = 0 | |
with pdfplumber.open(pdf_path) as pdf: | |
# Get account value from page 2 which has the statement overview | |
page = pdf.pages[1] | |
text = page.extract_text() | |
print(f"=== Opened {pdf_path}") | |
# Find total value (appears at top of page) | |
for line in text.split('\n'): | |
if '$' in line: | |
value = float(line.split('$')[1].replace(',', '')) | |
break | |
# Get account number and date from page 3 | |
page = pdf.pages[2] | |
text = page.extract_text() | |
# Extract account number using regex | |
account_matches = re.findall(r'([\dX]{4}\d{4})', text) | |
if not account_matches: | |
raise ValueError("Could not find account number in statement") | |
account_number = account_matches[0] | |
# Extract date | |
pattern = r"Total account value as of (\b\w+\b \d{1,2}, \d{4})" | |
for line in text.split('\n'): | |
match = re.search(pattern, line) | |
if match: | |
date_str = match.group(1) | |
statement_date = datetime.strptime(date_str, '%B %d, %Y') | |
break | |
print(f" Processing: Account Number: {account_number}: Value: ${value:,.2f}, Statement Date: {statement_date}") | |
return value, account_number, statement_date | |
def process_statement_folder(folder_path, output_folder): | |
""" | |
Process all PDFs in a folder and create separate CSV files for each account. | |
""" | |
# Create output folder if it doesn't exist | |
os.makedirs(output_folder, exist_ok=True) | |
# Dictionary to store data for each account | |
account_data = {} | |
for filename in os.listdir(folder_path): | |
if filename.endswith('.pdf'): | |
pdf_path = os.path.join(folder_path, filename) | |
try: | |
value, account_number, date = extract_statement_data(pdf_path) | |
account_number = AccountNames[account_number[-4:]] | |
# Initialize list for this account if we haven't seen it before | |
if account_number not in account_data: | |
account_data[account_number] = [] | |
# Add data for this statement | |
account_data[account_number].append({ | |
'Date': date.strftime('%Y-%m-%d'), # Format date as YYYY-MM-DD | |
'Balance': value, | |
'Account': account_number | |
}) | |
except Exception as e: | |
print(f"Error processing {filename}: {str(e)}") | |
# Create separate CSV for each account | |
for account_number, data in account_data.items(): | |
df = pd.DataFrame(data) | |
df.sort_values('Date', inplace=True) | |
output_path = os.path.join(output_folder, f'account-{account_number}.csv') | |
df.to_csv(output_path, index=False) | |
print(f"Created CSV for account {account_number}") | |
# Print summary statistics for this account | |
print(f" Statements processed: {len(data)}") | |
print(f" Date range: {df['Date'].min()} to {df['Date'].max()}") | |
print(f" Latest value: ${df.iloc[-1]['Balance']:,.2f}") | |
if __name__ == "__main__": | |
folder_path = "statements" # Folder containing PDF statements | |
output_folder = "account_values" # Folder for output CSVs | |
process_statement_folder(folder_path, output_folder) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment