claudioc · January 7, 2024 10:58
diff --git a/tr_extract.py b/tr_extract.py
 #!/usr/bin/env python3
 # Scan the text of stmt2.pdf for purchase-execution rows and print, for each
 # ISIN found, the list of {"date", "amount"} entries collected for it.
 from pypdf import PdfReader
 import re
 import pprint

 # Building blocks of the row pattern.
 re_label = "Ausführung Handel Direktkauf Kauf"  # literal text right after the date
 re_date = r"\d{2}\.\d{2}\.\d{4}"  # NN.NN.NNNN, dot-separated digits
 re_isin = r"[A-Z]{2}[A-Z0-9]{9}[0-9]"  # 2 letters, 9 alphanumerics, 1 digit
 re_amount = r"-?\d{1,3}(?:\.\d{3})*(?:,\d{2})"  # e.g. -1.234,56

 # A row is: the date immediately followed by the label, one whitespace, the
 # ISIN, then arbitrary text that may or may not end with an amount.
 line_pattern = re.compile(
     rf"^({re_date}){re_label}\s({re_isin})\s.*?\s?({re_amount})?$"
 )

 # Applied to the following line when the row itself carries no amount.
 amount_pattern = re.compile(rf"({re_amount})$")

 reader = PdfReader("stmt2.pdf")

 # Each page's text gets a terminating newline before the pieces are glued.
 full_text = "".join(page.extract_text() + "\n" for page in reader.pages)
 lines = full_text.split("\n")

 records = {}
 for position, text in enumerate(lines):
     row = line_pattern.search(text)
     if row is None:
         continue

     booked_on, isin_code, inline_amount = row.groups()
     if inline_amount:
         value = inline_amount
     else:
         # No amount on the row: check the end of the next line, and keep 0
         # when nothing is found there either.
         trailing = amount_pattern.search(lines[position + 1])
         value = trailing.group(1) if trailing else 0

     records.setdefault(isin_code, []).append({"date": booked_on, "amount": value})

 pprint.pprint(records)
	#!/usr/bin/env python3
	# Extract purchase-execution rows from the text of stmt2.pdf and print the
	# matches grouped by ISIN as {"date": ..., "amount": ...} entries.
	from pypdf import PdfReader
	import re
	import pprint

	# Fragments combined into the row pattern below.
	re_label = "Ausführung Handel Direktkauf Kauf"
	re_date = r"\d{2}\.\d{2}\.\d{4}"
	re_isin = r"[A-Z]{2}[A-Z0-9]{9}[0-9]"
	# Optional minus sign, dot-grouped thousands, comma and two decimals.
	re_amount = r"-?\d{1,3}(?:\.\d{3})*(?:,\d{2})"

	# Groups: 1 = date, 2 = ISIN, 3 = amount at the end of the row (optional).
	line_pattern = re.compile(
	    rf"^({re_date}){re_label}\s({re_isin})\s.*?\s?({re_amount})?$"
	)

	# Matches an amount at the very end of a line.
	amount_pattern = re.compile(rf"({re_amount})$")

	reader = PdfReader("stmt2.pdf")

	# Concatenate the text of all pages, one trailing newline per page.
	full_text = ""
	for page in reader.pages:
	    full_text += page.extract_text() + "\n"

	lines = full_text.split("\n")

	# Maps each ISIN to the list of entries found for it, in document order.
	records = {}
	for index, line in enumerate(lines):
	    match = line_pattern.search(line)
	    if match:
	        date = match.group(1)
	        isin = match.group(2)
	        # Stays 0 when no amount is found on this row or the next line.
	        amount = 0
	        if match.group(3):
	            amount = match.group(3)
	        else:  # Amount is on the next line
	            # A matching row is never the last element of `lines`, because
	            # the text always ends with "\n" and so the list ends with "".
	            match = amount_pattern.search(lines[index + 1])
	            if match:
	                amount = match.group(1)
	        if isin not in records:
	            records[isin] = []

	        records[isin].append({"date": date, "amount": amount})

	pprint.pprint(records)
No results found