Skip to content

Instantly share code, notes, and snippets.

@claudioc
Last active January 7, 2024 10:58
Show Gist options
  • Save claudioc/cde3009fb130f45ed9a37ad6201701a6 to your computer and use it in GitHub Desktop.
Save claudioc/cde3009fb130f45ed9a37ad6201701a6 to your computer and use it in GitHub Desktop.
Extract information from TradeRepublic statements PDF
#!/usr/bin/env python3
from pypdf import PdfReader
import re
import pprint
# Regex fragments for one purchase row of a Trade Republic statement, as the
# row appears in the text extracted from the PDF.
# Literal row label; in the extracted text it follows the date with no
# separator in between.
re_label = "Ausführung Handel Direktkauf Kauf"
# Date written as DD.MM.YYYY.
re_date = r"\d{2}\.\d{2}\.\d{4}"
# ISIN: two letters, nine alphanumerics, one trailing digit.
re_isin = r"[A-Z]{2}[A-Z0-9]{9}[0-9]"
# Amount with optional sign, dot-grouped thousands and a two-digit decimal
# part after a comma (e.g. "-1.234,56").
re_amount = r"-?\d{1,3}(?:\.\d{3})*(?:,\d{2})"

# Groups: 1 = date, 2 = ISIN, 3 = amount at the end of the line (optional,
# because the amount may be placed on the following line instead).
line_pattern = re.compile(
    rf"^({re_date}){re_label}\s({re_isin})\s.*?\s?({re_amount})?$"
)
# An amount sitting at the very end of a line; used for the follow-up line.
amount_pattern = re.compile(rf"({re_amount})$")


def extract_records(lines):
    """Collect the purchase rows found in *lines*, grouped by ISIN.

    Args:
        lines: Sequence of text lines (no trailing newlines).

    Returns:
        A dict mapping each ISIN to a list of ``{"date": ..., "amount": ...}``
        dicts in the order the rows were encountered. ``date`` and ``amount``
        are the raw strings from the text; ``amount`` is the integer ``0``
        when no amount is found on the row or on the line after it.
    """
    records = {}
    for index, line in enumerate(lines):
        row = line_pattern.search(line)
        if not row:
            continue
        date, isin, amount = row.groups()
        if amount is None:
            # The amount was not on the row itself; look at the next line,
            # but only if there is one (a row on the last line has none).
            amount = 0
            if index + 1 < len(lines):
                trailing = amount_pattern.search(lines[index + 1])
                if trailing:
                    amount = trailing.group(1)
        records.setdefault(isin, []).append({"date": date, "amount": amount})
    return records


def read_lines(path):
    """Return the text of every page of the PDF at *path* as a list of lines."""
    reader = PdfReader(path)
    full_text = "\n".join(page.extract_text() for page in reader.pages)
    return full_text.split("\n")


def main():
    """Print the records of the statement given on the command line.

    Falls back to "stmt2.pdf" when no path argument is supplied.
    """
    import sys

    path = sys.argv[1] if len(sys.argv) > 1 else "stmt2.pdf"
    pprint.pprint(extract_records(read_lines(path)))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment