Last active
January 7, 2024 10:58
-
-
Save claudioc/cde3009fb130f45ed9a37ad6201701a6 to your computer and use it in GitHub Desktop.
Extract information from TradeRepublic statements PDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import pprint
import re
import sys

from pypdf import PdfReader
# Regex fragments describing one purchase line of the extracted statement text.
# The label is matched literally, immediately after the date (no separator).
re_label = "Ausführung Handel Direktkauf Kauf"
# Date written as dd.mm.yyyy.
re_date = r"\d{2}\.\d{2}\.\d{4}"
# ISIN-shaped token: two letters, nine alphanumerics, one trailing digit.
re_isin = r"[A-Z]{2}[A-Z0-9]{9}[0-9]"
# Amount with "." as thousands separator and "," before two decimals,
# optionally negative (e.g. "-1.234,56").
re_amount = r"-?\d{1,3}(?:\.\d{3})*(?:,\d{2})"

# Groups: 1 = date, 2 = ISIN, 3 = amount at end of line (may be absent).
line_pattern = re.compile(
    rf"^({re_date}){re_label}\s({re_isin})\s.*?\s?({re_amount})?$"
)
# Used on the following line when the purchase line itself has no amount.
amount_pattern = re.compile(rf"({re_amount})$")

# Statement read when no path is given on the command line.
DEFAULT_STATEMENT = "stmt2.pdf"


def extract_lines(pdf_path):
    """Return the text of every page of *pdf_path* as a list of lines.

    Each page's extracted text is terminated with a newline before the
    pages are concatenated, so the returned list always ends with an
    empty string.
    """
    reader = PdfReader(pdf_path)
    full_text = "".join(page.extract_text() + "\n" for page in reader.pages)
    return full_text.split("\n")


def parse_records(lines):
    """Collect purchase records from statement *lines*, grouped by ISIN.

    Args:
        lines: Sequence of text lines, one statement row per element.

    Returns:
        A dict mapping each ISIN to a list of
        ``{"date": <str>, "amount": <str or 0>}`` dicts in input order.
        ``amount`` is the matched amount text; it is the integer ``0``
        when neither the purchase line nor the line after it ends with
        an amount.
    """
    records = {}
    for index, line in enumerate(lines):
        match = line_pattern.search(line)
        if not match:
            continue

        date, isin, amount = match.groups()
        if not amount:
            # Amount is on the next line (if there is one); fall back to 0.
            amount = 0
            if index + 1 < len(lines):
                next_match = amount_pattern.search(lines[index + 1])
                if next_match:
                    amount = next_match.group(1)

        records.setdefault(isin, []).append({"date": date, "amount": amount})
    return records


def main(argv=None):
    """Parse a statement PDF and pretty-print its purchase records.

    Args:
        argv: Command-line arguments without the program name; defaults
            to ``sys.argv[1:]``. The first argument, if present, is the
            PDF path; otherwise ``DEFAULT_STATEMENT`` is used.
    """
    args = sys.argv[1:] if argv is None else argv
    pdf_path = args[0] if args else DEFAULT_STATEMENT
    pprint.pprint(parse_records(extract_lines(pdf_path)))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment