Skip to content

Instantly share code, notes, and snippets.

@garyo
Created November 15, 2024 23:14
Show Gist options
  • Save garyo/8674239bc61df315b65befb3d7c6f552 to your computer and use it in GitHub Desktop.
Save garyo/8674239bc61df315b65befb3d7c6f552 to your computer and use it in GitHub Desktop.
Convert PDF statements from Vanguard into csv files suitable for uploading balance info to Monarch Money
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "pandas",
# "pdfplumber",
# ]
# ///
import pdfplumber
import pandas as pd
import os
import re
from datetime import datetime
import sys
# Use the last 4 digits of the Vanguard account on the left side of the mapping,
# and fill in the exact Monarch account names on the right side.
AccountNames = {
"0419": "Vanguard (...bxvb)",
"1111": "Vanguard (...1a8df",
"3222": "Vanguard (...3faf)",
"4333": "Vanguard (...4asf)",
"4444": "Vanguard (...4asd)",
"9555": "Vanguard (...9dfas)"
}
def extract_statement_data(pdf_path):
"""
Extract account value, account number, and date from a Vanguard statement PDF.
Returns tuple of (account_value, account_number, statement_date)
"""
value = 0
account_number = ""
statement_date = 0
with pdfplumber.open(pdf_path) as pdf:
# Get account value from page 2 which has the statement overview
page = pdf.pages[1]
text = page.extract_text()
print(f"=== Opened {pdf_path}")
# Find total value (appears at top of page)
for line in text.split('\n'):
if '$' in line:
value = float(line.split('$')[1].replace(',', ''))
break
# Get account number and date from page 3
page = pdf.pages[2]
text = page.extract_text()
# Extract account number using regex
account_matches = re.findall(r'([\dX]{4}\d{4})', text)
if not account_matches:
raise ValueError("Could not find account number in statement")
account_number = account_matches[0]
# Extract date
pattern = r"Total account value as of (\b\w+\b \d{1,2}, \d{4})"
for line in text.split('\n'):
match = re.search(pattern, line)
if match:
date_str = match.group(1)
statement_date = datetime.strptime(date_str, '%B %d, %Y')
break
print(f" Processing: Account Number: {account_number}: Value: ${value:,.2f}, Statement Date: {statement_date}")
return value, account_number, statement_date
def process_statement_folder(folder_path, output_folder):
"""
Process all PDFs in a folder and create separate CSV files for each account.
"""
# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)
# Dictionary to store data for each account
account_data = {}
for filename in os.listdir(folder_path):
if filename.endswith('.pdf'):
pdf_path = os.path.join(folder_path, filename)
try:
value, account_number, date = extract_statement_data(pdf_path)
account_number = AccountNames[account_number[-4:]]
# Initialize list for this account if we haven't seen it before
if account_number not in account_data:
account_data[account_number] = []
# Add data for this statement
account_data[account_number].append({
'Date': date.strftime('%Y-%m-%d'), # Format date as YYYY-MM-DD
'Balance': value,
'Account': account_number
})
except Exception as e:
print(f"Error processing {filename}: {str(e)}")
# Create separate CSV for each account
for account_number, data in account_data.items():
df = pd.DataFrame(data)
df.sort_values('Date', inplace=True)
output_path = os.path.join(output_folder, f'account-{account_number}.csv')
df.to_csv(output_path, index=False)
print(f"Created CSV for account {account_number}")
# Print summary statistics for this account
print(f" Statements processed: {len(data)}")
print(f" Date range: {df['Date'].min()} to {df['Date'].max()}")
print(f" Latest value: ${df.iloc[-1]['Balance']:,.2f}")
if __name__ == "__main__":
folder_path = "statements" # Folder containing PDF statements
output_folder = "account_values" # Folder for output CSVs
process_statement_folder(folder_path, output_folder)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment