garyo · November 15, 2024 23:14
diff --git a/vanguard-to-monarch-balance.py b/vanguard-to-monarch-balance.py
 # /// script
 # requires-python = ">=3.11"
 # dependencies = [
 #     "pandas",
 #     "pdfplumber",
 # ]
 # ///
 import pdfplumber
 import pandas as pd
 import os
 import re
 from datetime import datetime
 import sys

 # Use the last 4 digits of the Vanguard account on the left side of the mapping,
 # and fill in the exact Monarch account names on the right side.
 AccountNames = {
    "0419": "Vanguard (...bxvb)",
    "1111": "Vanguard (...1a8df",
    "3222": "Vanguard (...3faf)",
    "4333": "Vanguard (...4asf)",
    "4444": "Vanguard (...4asd)",
    "9555": "Vanguard (...9dfas)"
 }

 def extract_statement_data(pdf_path):
    """
    Extract account value, account number, and date from a Vanguard statement PDF.
    Returns tuple of (account_value, account_number, statement_date)
    """
    value = 0
    account_number = ""
    statement_date = 0
    with pdfplumber.open(pdf_path) as pdf:
        # Get account value from page 2 which has the statement overview
        page = pdf.pages[1]
        text = page.extract_text()
        
        print(f"=== Opened {pdf_path}")

        # Find total value (appears at top of page)
        for line in text.split('\n'):
            if '$' in line:
                value = float(line.split('$')[1].replace(',', ''))
                break
        
        # Get account number and date from page 3
        page = pdf.pages[2]
        text = page.extract_text()
        
        # Extract account number using regex
        account_matches = re.findall(r'([\dX]{4}\d{4})', text)
        if not account_matches:
            raise ValueError("Could not find account number in statement")
        account_number = account_matches[0]
        
        # Extract date
        pattern = r"Total account value as of (\b\w+\b \d{1,2}, \d{4})"
        for line in text.split('\n'):
            match = re.search(pattern, line)
            if match:
                date_str = match.group(1)
                statement_date = datetime.strptime(date_str, '%B %d, %Y')
                break
                
    print(f" Processing: Account Number: {account_number}: Value: ${value:,.2f}, Statement Date: {statement_date}")
    return value, account_number, statement_date

 def process_statement_folder(folder_path, output_folder):
    """
    Process all PDFs in a folder and create separate CSV files for each account.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    # Dictionary to store data for each account
    account_data = {}
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            try:
                value, account_number, date = extract_statement_data(pdf_path)
                
                account_number = AccountNames[account_number[-4:]]
                # Initialize list for this account if we haven't seen it before
                if account_number not in account_data:
                    account_data[account_number] = []
                    
                # Add data for this statement
                account_data[account_number].append({
                    'Date': date.strftime('%Y-%m-%d'),  # Format date as YYYY-MM-DD
                    'Balance': value,
                    'Account': account_number
                })
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
    
    # Create separate CSV for each account
    for account_number, data in account_data.items():
        df = pd.DataFrame(data)
        df.sort_values('Date', inplace=True)
        
        output_path = os.path.join(output_folder, f'account-{account_number}.csv')
        df.to_csv(output_path, index=False)
        print(f"Created CSV for account {account_number}")
        
        # Print summary statistics for this account
        print(f"  Statements processed: {len(data)}")
        print(f"  Date range: {df['Date'].min()} to {df['Date'].max()}")
        print(f"  Latest value: ${df.iloc[-1]['Balance']:,.2f}")

 if __name__ == "__main__":
    folder_path = "statements"  # Folder containing PDF statements
    output_folder = "account_values"  # Folder for output CSVs
    process_statement_folder(folder_path, output_folder)
	# /// script
	# requires-python = ">=3.11"
	# dependencies = [
	# "pandas",
	# "pdfplumber",
	# ]
	# ///
	import pdfplumber
	import pandas as pd
	import os
	import re
	from datetime import datetime
	import sys

	# Use the last 4 digits of the Vanguard account on the left side of the mapping,
	# and fill in the exact Monarch account names on the right side.
	AccountNames = {
	"0419": "Vanguard (...bxvb)",
	"1111": "Vanguard (...1a8df",
	"3222": "Vanguard (...3faf)",
	"4333": "Vanguard (...4asf)",
	"4444": "Vanguard (...4asd)",
	"9555": "Vanguard (...9dfas)"
	}

	def extract_statement_data(pdf_path):
	"""
	Extract account value, account number, and date from a Vanguard statement PDF.
	Returns tuple of (account_value, account_number, statement_date)
	"""
	value = 0
	account_number = ""
	statement_date = 0
	with pdfplumber.open(pdf_path) as pdf:
	# Get account value from page 2 which has the statement overview
	page = pdf.pages[1]
	text = page.extract_text()

	print(f"=== Opened {pdf_path}")

	# Find total value (appears at top of page)
	for line in text.split('\n'):
	if '$' in line:
	value = float(line.split('$')[1].replace(',', ''))
	break

	# Get account number and date from page 3
	page = pdf.pages[2]
	text = page.extract_text()

	# Extract account number using regex
	account_matches = re.findall(r'([\dX]{4}\d{4})', text)
	if not account_matches:
	raise ValueError("Could not find account number in statement")
	account_number = account_matches[0]

	# Extract date
	pattern = r"Total account value as of (\b\w+\b \d{1,2}, \d{4})"
	for line in text.split('\n'):
	match = re.search(pattern, line)
	if match:
	date_str = match.group(1)
	statement_date = datetime.strptime(date_str, '%B %d, %Y')
	break

	print(f" Processing: Account Number: {account_number}: Value: ${value:,.2f}, Statement Date: {statement_date}")
	return value, account_number, statement_date

	def process_statement_folder(folder_path, output_folder):
	"""
	Process all PDFs in a folder and create separate CSV files for each account.
	"""
	# Create output folder if it doesn't exist
	os.makedirs(output_folder, exist_ok=True)

	# Dictionary to store data for each account
	account_data = {}

	for filename in os.listdir(folder_path):
	if filename.endswith('.pdf'):
	pdf_path = os.path.join(folder_path, filename)
	try:
	value, account_number, date = extract_statement_data(pdf_path)

	account_number = AccountNames[account_number[-4:]]
	# Initialize list for this account if we haven't seen it before
	if account_number not in account_data:
	account_data[account_number] = []

	# Add data for this statement
	account_data[account_number].append({
	'Date': date.strftime('%Y-%m-%d'), # Format date as YYYY-MM-DD
	'Balance': value,
	'Account': account_number
	})
	except Exception as e:
	print(f"Error processing {filename}: {str(e)}")

	# Create separate CSV for each account
	for account_number, data in account_data.items():
	df = pd.DataFrame(data)
	df.sort_values('Date', inplace=True)

	output_path = os.path.join(output_folder, f'account-{account_number}.csv')
	df.to_csv(output_path, index=False)
	print(f"Created CSV for account {account_number}")

	# Print summary statistics for this account
	print(f" Statements processed: {len(data)}")
	print(f" Date range: {df['Date'].min()} to {df['Date'].max()}")
	print(f" Latest value: ${df.iloc[-1]['Balance']:,.2f}")

	if __name__ == "__main__":
	folder_path = "statements" # Folder containing PDF statements
	output_folder = "account_values" # Folder for output CSVs
	process_statement_folder(folder_path, output_folder)