Skip to content

Instantly share code, notes, and snippets.

@JaosnHsieh
Created August 24, 2025 21:16
Show Gist options
  • Save JaosnHsieh/936271a27d734f868ef9d60d243b43c3 to your computer and use it in GitHub Desktop.
Save JaosnHsieh/936271a27d734f868ef9d60d243b43c3 to your computer and use it in GitHub Desktop.
python3 script to generate interactive chart html report for rogers credit card bank statement
import pdfplumber
import csv
import re
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# ---- Config ----
PDF_FILE = "1.pdf"
CSV_FILE = "bank_statement.csv"
OUTPUT_HTML = "report.html"
LARGE_THRESHOLD = 300.0
# Keyword-based categorization rules
CATEGORY_RULES = {
"food": ["TIM HORTONS", "MCDONALD", "STARBUCKS", "BURGER", "RESTAURANT", "SHANGHAI360"],
"groceries": ["COSTCO", "SUPERMARKET", "FOODMART", "METRO", "NOFRILLS", "WAL-MART"],
"transport": ["PRESTO", "UBER", "LYFT", "PARKING", "IMARK", "GAS", "ESSO"],
"shopping": ["AMAZON", "HOME DEPOT", "IKEA", "SHOPPERS"],
"utilities": ["BELL", "ROGERS", "VIRGIN", "FIDOMOBILE", "ECONOMICAL INSURANCE"],
"health": ["PHARMACY", "HOSPITAL"],
"other": []
}
def extract_transactions(pdf_path, csv_path):
transactions = []
in_transactions = False
# Regex to capture transactions
pattern = re.compile(r"^([A-Za-z]{3}\d{1,2})\s+([A-Za-z]{3}\d{1,2})\s+(.+?)(-?\d[\d,]*\.\d{2})$")
with pdfplumber.open(pdf_path) as pdf:
for page in pdf.pages:
text = page.extract_text()
if not text:
continue
lines = text.splitlines()
for line in lines:
# Detect start of transactions
if "TransactionDetails" in line:
in_transactions = True
continue
# Detect possible end of section
if "InterestRateChart" in line or "AccountNumber" in line:
in_transactions = False
if in_transactions:
line = line.strip()
match = pattern.match(line)
if match:
trans_date = match.group(1)
post_date = match.group(2)
desc = match.group(3).strip()
amount = match.group(4).replace(",", "")
transactions.append([trans_date, post_date, desc, amount])
# Write to CSV
with open(csv_path, "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(["Trans Date", "Post Date", "Description", "Amount ($)"])
writer.writerows(transactions)
def categorize(description: str) -> str:
desc_upper = description.upper()
for category, keywords in CATEGORY_RULES.items():
for kw in keywords:
if kw in desc_upper:
return category
return "other"
def generate_report(csv_path, output_html):
df = pd.read_csv(csv_path)
# Clean data
df["Amount ($)"] = df["Amount ($)"].astype(str).str.replace(",", "").astype(float)
# Categorization
df["Category"] = df["Description"].apply(categorize)
# Separate out payments (negative amounts = credit card payments)
payments_df = df[df["Amount ($)"] < 0].copy()
spend_df = df[df["Amount ($)"] > 0].copy()
# Large transactions (only consider spending)
spend_df["Is Large"] = spend_df["Amount ($)"].abs() > LARGE_THRESHOLD
# Totals per category (only spending)
category_totals = spend_df.groupby("Category")["Amount ($)"].sum().reset_index()
# Total money spent
total_spent = spend_df["Amount ($)"].sum()
# ---- Visualization with Plotly (interactive) ----
figures = []
# Pie chart (absolute values)
category_totals["AbsAmount"] = category_totals["Amount ($)"].abs()
pie = px.pie(
category_totals,
names="Category",
values="AbsAmount",
title="Spending by Category"
)
pie.update_traces(textinfo="percent+label", textfont=dict(size=14), sort=False)
pie.update_layout(
title_font=dict(size=20),
legend=dict(font=dict(size=12)),
width=500,
height=400
)
figures.append(pie)
# Bar chart
bar = px.bar(
category_totals,
x="Category",
y="Amount ($)",
title="Spending Amount per Category",
text="Amount ($)"
)
bar.update_traces(texttemplate="%{text:.2s}", textposition="outside")
bar.update_layout(
title_font=dict(size=20),
xaxis=dict(tickfont=dict(size=10)),
yaxis=dict(tickfont=dict(size=10)),
width=600,
height=400
)
figures.append(bar)
# Timeline (only spending)
spend_df["Post Date"] = pd.to_datetime(spend_df["Post Date"] + ", 2025", errors="coerce", format="%b%d, %Y")
timeline = px.line(
spend_df,
x="Post Date",
y="Amount ($)",
color="Category",
title="Timeline of Spending",
markers=True
)
timeline.update_layout(
title_font=dict(size=20),
xaxis=dict(tickfont=dict(size=10)),
yaxis=dict(tickfont=dict(size=10)),
width=700,
height=400
)
figures.append(timeline)
# Large transactions table (only spending)
large_txn = spend_df[spend_df["Is Large"]]
if not large_txn.empty:
table = go.Figure(data=[go.Table(
header=dict(values=list(large_txn.columns), fill_color="paleturquoise", align="left", font=dict(size=14)),
cells=dict(values=[large_txn[c] for c in large_txn.columns], fill_color="lavender", align="left", font=dict(size=12))
)])
table.update_layout(
title="Large Transactions (>{})".format(LARGE_THRESHOLD),
title_font=dict(size=20),
width=800,
height=300
)
figures.append(table)
# Payments table (separate section for negative transactions)
if not payments_df.empty:
pay_table = go.Figure(data=[go.Table(
header=dict(values=list(payments_df.columns), fill_color="lightgrey", align="left", font=dict(size=14)),
cells=dict(values=[payments_df[c] for c in payments_df.columns], fill_color="white", align="left", font=dict(size=12))
)])
pay_table.update_layout(
title="Payments (Card Balance)",
title_font=dict(size=20),
width=800,
height=300
)
figures.append(pay_table)
# Total money spent summary FIRST with simple HTML
first_date = spend_df["Post Date"].min()
last_date = spend_df["Post Date"].max()
date_range_text = f" ({first_date.date()} → {last_date.date()})" if pd.notna(first_date) and pd.notna(last_date) else ""
summary_text = f"<h1>Total Money Spent: ${total_spent:,.2f}{date_range_text}</h1>"
# Ensure summary is written at the very top of HTML
summary_html = summary_text + "<br><br>"
# Write interactive HTML with JS interactivity
with open(output_html, "w") as f:
# Write summary first
f.write(summary_html)
# Add pie chart
f.write(figures[0].to_html(full_html=False, include_plotlyjs="cdn", div_id="piechart"))
f.write('<div id="category-table"></div>')
f.write("<br><br>\n")
# Add remaining figures (skip the pie we already wrote, summary already added separately)
for fig in figures[1:]:
f.write(fig.to_html(full_html=False, include_plotlyjs=False))
f.write("<br><br>\n")
# Embed transaction data for interactivity
f.write("<script>\n")
f.write("const transactionData = {};\n")
for category in spend_df["Category"].unique():
cat_df = spend_df[spend_df["Category"] == category].sort_values("Post Date")
f.write(f"transactionData['{category}'] = {cat_df.to_json(orient='records', date_format='iso')};\n")
f.write("""
// Function to render table into #category-table
function renderTable(category) {
const container = document.getElementById('category-table');
if (!transactionData[category] || transactionData[category].length === 0) {
container.innerHTML = "<p>No transactions found.</p>";
return;
}
let html = "<h3>Transactions for " + category + "</h3>";
html += "<table border='1' cellspacing='0' cellpadding='4'><tr>";
for (const key of Object.keys(transactionData[category][0])) {
html += "<th>" + key + "</th>";
}
html += "</tr>";
for (const row of transactionData[category]) {
html += "<tr>";
for (const key of Object.keys(row)) {
html += "<td>" + row[key] + "</td>";
}
html += "</tr>";
}
html += "</table>";
container.innerHTML = html;
}
// Attach click event to pie chart
var pie = document.getElementById('piechart');
pie.on('plotly_click', function(data){
var category = data.points[0].label;
renderTable(category);
});
""")
f.write("</script>")
def main():
extract_transactions(PDF_FILE, CSV_FILE)
print("Extraction complete! Saved to bank_statement.csv")
generate_report(CSV_FILE, OUTPUT_HTML)
print(f"Analysis complete with Plotly interactive charts! Report saved to {OUTPUT_HTML}")
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment