Last active
September 28, 2018 14:50
-
-
Save pschichtel/6e4af6fba42e5737ac95ae7d02e7e06d to your computer and use it in GitHub Desktop.
Takes the output of poppler's pdftohtml tool when feeding in an ING-DiBa bank statement and produces csv that can be imported into e.g. Excel. Works mostly correct. Additional kinds might need to be added.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import os | |
import re | |
import html | |
closure = 'Abschluss' | |
kinds = { | |
'Lastschrift': 'Lastschrift', | |
'Gehalt/Rente': 'Gehalt/Rente', | |
'Ueberweisung': 'Überweisung', | |
'Dauerauftrag/Terminueberw.': 'Dauerauftrag / Terminueberweisung', | |
'Gutschrift': 'Gutschrift', | |
'Abschluss': closure, | |
'Abbuchung': 'Abbuchung', | |
'Gutschrift/Dauerauftrag': 'Gutschrift / Dauerauftrag', | |
'Retoure': 'Retoure' | |
} | |
def isClosure(kind): | |
return kind == closure or (kind in kinds and kinds[kind] == closure) | |
def parseDate(date): | |
dates = re.findall('^(\d\d)\.(\d\d)\.(\d\d\d\d)$', date) | |
if len(dates) != 1: | |
return None | |
else: | |
(d, m, y) = dates[0] | |
#return y + "-" + m + "-" + d | |
return "{}.{}.{}".format(d, m, y) | |
def preprocessPart(part): | |
nbspStripped = part.replace(' ', ' ') | |
htmlStripped = re.sub('<[^>]+>', '', nbspStripped) | |
return htmlStripped.strip() | |
def chunkEntry(entry): | |
processed = [preprocessPart(s) for part in entry.split('</b>') for s in part.split("<br/>")] | |
return [s for s in processed if len(s) > 0] | |
def parseEntry(entry): | |
parsed = dict() | |
extractInitiation(entry, parsed) | |
extractValuta(entry, parsed) | |
extractKind(entry, parsed) | |
extractPartner(entry, parsed) | |
extractAmount(entry, parsed) | |
extractApplication(entry, parsed) | |
extractReference(entry, parsed) | |
extractMandate(entry, parsed) | |
if 'valuta' in parsed and not 'initiation' in parsed: | |
parsed['initiation'] = parsed['valuta'] | |
if 'initiation' in parsed and not 'valuta' in parsed: | |
parsed['valuta'] = parsed['initiation'] | |
#parsed['raw'] = entry | |
return parsed | |
def extractValuta(raw, parsed): | |
date = parseDate(raw[-1]) | |
if date != None: | |
parsed['valuta'] = date | |
def extractInitiation(raw, parsed): | |
if len(raw) > 3: | |
date = parseDate(raw[3]) | |
if date != None: | |
parsed['initiation'] = date | |
def extractApplication(raw, parsed): | |
if len(raw) > 4: | |
parsed['application'] = raw[4] | |
def extractReference(raw, parsed): | |
for p in raw: | |
refs = re.findall('^Referenz:\s*(.+)$', p) | |
if len(refs) == 1: | |
parsed['reference'] = refs[0] | |
return | |
def extractMandate(raw, parsed): | |
for p in raw: | |
refs = re.findall('^Mandat:\s*(.+)$', p) | |
if len(refs) == 1: | |
parsed['mandate'] = refs[0] | |
return | |
def extractKind(raw, parsed): | |
kind = raw[0] | |
if kind in kinds: | |
parsed['kind'] = kinds[kind] | |
#else: | |
# print("UNKNOWN KIND:", kind) | |
def extractPartner(raw, parsed): | |
if 'kind' in parsed and not isClosure(parsed['kind']): | |
parsed['partner'] = raw[1] | |
def extractAmount(raw, parsed): | |
index = 2 | |
if 'kind' in parsed and isClosure(parsed['kind']): | |
index = 1 | |
preprocessed = raw[index].replace('.', '').replace(',', '.') | |
amounts = re.findall('^(-?\d+\.\d\d)', preprocessed) | |
if len(amounts) == 1: | |
try: | |
parsed['amount'] = float(amounts[0]) | |
except: | |
pass | |
def dejunk(content): | |
noFooters = re.sub('<hr/>\s*<a[^>]+>[\s\S]+?(<img[^>]+><br/>\s*)+.+?<br/>\s*', '', content) | |
noTrailer = re.sub('<b>Abschlussbetrag[\S\s]+', '', noFooters) | |
noRandomCode = re.sub('^.+?_T<br/>\s*', '', noTrailer, flags=re.MULTILINE) | |
return noRandomCode | |
content = "" | |
with open(sys.argv[1], 'r') as content_file: | |
content = content_file.read() | |
dejunked = dejunk(content) | |
#print(dejunked) | |
tableEntryRegex = '<b>(?:[\s\S](?!<b>))+' | |
tableEntries = re.findall(tableEntryRegex, dejunked) | |
chunkedEntries = [chunkEntry(e) for e in tableEntries] | |
plausibleEntries = [e for e in chunkedEntries if len(e) > 2] | |
parsedEntries = [parseEntry(e) for e in plausibleEntries] | |
transactions = [e for e in parsedEntries if 'kind' in e and 'amount' in e and 'initiation' in e] | |
for entry in transactions: | |
#print("###########") | |
print('"{}","{}","{}","{}","{}",,EUR,{},EUR,"{}","{}"'.format(entry['initiation'], entry.get('valuta', ''), entry.get('partner', ''), entry.get('kind', ''), entry.get('application', ''), entry.get('amount', ''), entry.get('reference', ''), entry.get('mandate', ''))) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
here="$(dirname "$(readlink -f "$0")")" | |
target="${here}/transactions.csv" | |
rm "$target" | |
tmp="$(mktemp -d)" | |
cd "$tmp" | |
for f in "$here"/Girokonto_*_Kontoauszug*.pdf | |
do | |
base="$(basename "$f")" | |
tmpf="$tmp/$base" | |
cp "$f" "$tmpf" | |
pdftohtml "$tmpf" | |
python3 "${here}/html2csv.py" "${tmpf%.*}s.html" >> "$target" | |
done | |
rm -R "$tmp" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This could potentially be used to convert the PDF to html without using external tools, but the output is even less structured compared to poppler: