Skip to content

Instantly share code, notes, and snippets.

@pschichtel
Last active September 28, 2018 14:50
Show Gist options
  • Save pschichtel/6e4af6fba42e5737ac95ae7d02e7e06d to your computer and use it in GitHub Desktop.
Save pschichtel/6e4af6fba42e5737ac95ae7d02e7e06d to your computer and use it in GitHub Desktop.
Takes the output of poppler's pdftohtml tool when feeding in an ING-DiBa bank statement and produces csv that can be imported into e.g. Excel. Works mostly correct. Additional kinds might need to be added.
#!/usr/bin/env python3
import sys
import os
import re
import html
closure = 'Abschluss'
kinds = {
'Lastschrift': 'Lastschrift',
'Gehalt/Rente': 'Gehalt/Rente',
'Ueberweisung': 'Überweisung',
'Dauerauftrag/Terminueberw.': 'Dauerauftrag / Terminueberweisung',
'Gutschrift': 'Gutschrift',
'Abschluss': closure,
'Abbuchung': 'Abbuchung',
'Gutschrift/Dauerauftrag': 'Gutschrift / Dauerauftrag',
'Retoure': 'Retoure'
}
def isClosure(kind):
return kind == closure or (kind in kinds and kinds[kind] == closure)
def parseDate(date):
dates = re.findall('^(\d\d)\.(\d\d)\.(\d\d\d\d)$', date)
if len(dates) != 1:
return None
else:
(d, m, y) = dates[0]
#return y + "-" + m + "-" + d
return "{}.{}.{}".format(d, m, y)
def preprocessPart(part):
nbspStripped = part.replace(' ', ' ')
htmlStripped = re.sub('<[^>]+>', '', nbspStripped)
return htmlStripped.strip()
def chunkEntry(entry):
processed = [preprocessPart(s) for part in entry.split('</b>') for s in part.split("<br/>")]
return [s for s in processed if len(s) > 0]
def parseEntry(entry):
parsed = dict()
extractInitiation(entry, parsed)
extractValuta(entry, parsed)
extractKind(entry, parsed)
extractPartner(entry, parsed)
extractAmount(entry, parsed)
extractApplication(entry, parsed)
extractReference(entry, parsed)
extractMandate(entry, parsed)
if 'valuta' in parsed and not 'initiation' in parsed:
parsed['initiation'] = parsed['valuta']
if 'initiation' in parsed and not 'valuta' in parsed:
parsed['valuta'] = parsed['initiation']
#parsed['raw'] = entry
return parsed
def extractValuta(raw, parsed):
date = parseDate(raw[-1])
if date != None:
parsed['valuta'] = date
def extractInitiation(raw, parsed):
if len(raw) > 3:
date = parseDate(raw[3])
if date != None:
parsed['initiation'] = date
def extractApplication(raw, parsed):
if len(raw) > 4:
parsed['application'] = raw[4]
def extractReference(raw, parsed):
for p in raw:
refs = re.findall('^Referenz:\s*(.+)$', p)
if len(refs) == 1:
parsed['reference'] = refs[0]
return
def extractMandate(raw, parsed):
for p in raw:
refs = re.findall('^Mandat:\s*(.+)$', p)
if len(refs) == 1:
parsed['mandate'] = refs[0]
return
def extractKind(raw, parsed):
kind = raw[0]
if kind in kinds:
parsed['kind'] = kinds[kind]
#else:
# print("UNKNOWN KIND:", kind)
def extractPartner(raw, parsed):
if 'kind' in parsed and not isClosure(parsed['kind']):
parsed['partner'] = raw[1]
def extractAmount(raw, parsed):
index = 2
if 'kind' in parsed and isClosure(parsed['kind']):
index = 1
preprocessed = raw[index].replace('.', '').replace(',', '.')
amounts = re.findall('^(-?\d+\.\d\d)', preprocessed)
if len(amounts) == 1:
try:
parsed['amount'] = float(amounts[0])
except:
pass
def dejunk(content):
noFooters = re.sub('<hr/>\s*<a[^>]+>[\s\S]+?(<img[^>]+><br/>\s*)+.+?<br/>\s*', '', content)
noTrailer = re.sub('<b>Abschlussbetrag[\S\s]+', '', noFooters)
noRandomCode = re.sub('^.+?_T<br/>\s*', '', noTrailer, flags=re.MULTILINE)
return noRandomCode
content = ""
with open(sys.argv[1], 'r') as content_file:
content = content_file.read()
dejunked = dejunk(content)
#print(dejunked)
tableEntryRegex = '<b>(?:[\s\S](?!<b>))+'
tableEntries = re.findall(tableEntryRegex, dejunked)
chunkedEntries = [chunkEntry(e) for e in tableEntries]
plausibleEntries = [e for e in chunkedEntries if len(e) > 2]
parsedEntries = [parseEntry(e) for e in plausibleEntries]
transactions = [e for e in parsedEntries if 'kind' in e and 'amount' in e and 'initiation' in e]
for entry in transactions:
#print("###########")
print('"{}","{}","{}","{}","{}",,EUR,{},EUR,"{}","{}"'.format(entry['initiation'], entry.get('valuta', ''), entry.get('partner', ''), entry.get('kind', ''), entry.get('application', ''), entry.get('amount', ''), entry.get('reference', ''), entry.get('mandate', '')))
#!/usr/bin/env bash
here="$(dirname "$(readlink -f "$0")")"
target="${here}/transactions.csv"
rm "$target"
tmp="$(mktemp -d)"
cd "$tmp"
for f in "$here"/Girokonto_*_Kontoauszug*.pdf
do
base="$(basename "$f")"
tmpf="$tmp/$base"
cp "$f" "$tmpf"
pdftohtml "$tmpf"
python3 "${here}/html2csv.py" "${tmpf%.*}s.html" >> "$target"
done
rm -R "$tmp"
@pschichtel
Copy link
Author

pschichtel commented Sep 28, 2018

This could potentially be used to convert the PDF to html without using external tools, but the output is even less structured compared to poppler:

#!/usr/bin/env python3

import sys, os, os.path
import pdfminer
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter
from pdfminer.layout import LAParams

def convert(input, output, codec='utf-8', maxpages=0, pagenos=None):
    # perform conversion and
    # send the results over the network.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = HTMLConverter(rsrcmgr, output, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(input, pagenos, maxpages=maxpages):
        interpreter.process_page(page)
    device.close()
    return

with open("auszug.pdf", 'rb') as input:
  with open("auszug.html", "wb") as output:
    convert(input, output)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment