Last active
November 23, 2016 06:27
-
-
Save eliask/4dea2b18e68cd6e09db19e61e3e79f16 to your computer and use it in GitHub Desktop.
Scrape transaction data from Nordea web bank PDF account statements
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# | |
# Hacky but functional script for extracting data from (Finnish) Nordea web bank account statement PDF files. | |
# | |
# 1. Presumes you have some account statements to clean up (for hledger, for instance). | |
# 2. The normal {Transactions,Tapahtumat}_<IBAN>.txt CSV files are by far superior to work with, | |
# but they are no longer available after just a year or two (better archive them while you can!). | |
# 3. I wanted to have more history available, so I devised this hacky method to extract it | |
# sort of cleanly from the PDF account statements in Finnish. | |
# | |
# Usage: Get the account statements, pdftotext them, pass the .txt-ified versions as arguments to this script. And profit. | |
import sys,re,decimal,csv,glob,os,decimal | |
from itertools import chain | |
import ujson | |
import datetime as dt | |
def get_nordea_csv_rows(): | |
seen=set() | |
for path in glob.glob('Transactions*.txt'): | |
#print(path) | |
A=list(csv.reader(open(path, 'rt'), delimiter='\t', quotechar='\x12')) # Nordea fails at CSV. Ignore quotes. | |
header = A[2] | |
for row in A[4::2]: | |
key = tuple(zip(header, row)) | |
if key in seen: | |
print('WARN: seen already: ', key, file=sys.stderr) | |
continue | |
seen.add(key) | |
d = dict(key) | |
assert d['Beneficiary/Remitter'] or d['Card number'], d | |
if True or b'digital' in d['Beneficiary/Remitter'].encode('utf-8').lower(): | |
amount = decimal.Decimal(d['Amount'].replace('.','').replace(',','.')) | |
msg = d['Message'].strip() | |
yield amount, d['Payment date'], msg, path | |
Forbidden=''' | |
*1011 | |
-8 | |
-E | |
00530 HELSINKI | |
0200 3000 (pvm/mpm) | |
1 | |
1011 Helsinki-Hakaniemi | |
101135-493712 | |
2 | |
3 | |
4 | |
Nordea Asiakaspalvelu | |
ma-su 24h/vrk | |
29.11.2014 | |
9/10 | |
==> append : to filename to view the PDF source | |
=E | |
=K | |
=M | |
=V | |
Check-in -asiakas | |
Erittely | |
Helsinki, Y-tunnus 1680235-8 | |
Kausi | |
Kirjaus- Arvop | |
KÄÄNNÄ | |
Käyttötili | |
Loppusaldo | |
Maksullinen kuitti | |
Maksup Viesti | |
Maksuton kuitti | |
Määrä | |
Nordea Pankki Suomi Oyj, Aleksanterinkatu 36, 00020 NORDEA, kotipaikka | |
Nordea Pankki Suomi Oyj, Aleksanterinkatu 36, 00020 NORDEA, kotipaikka Helsinki, Y-tunnus 1680235-8 | |
Otot yhteensä | |
Panot yhteensä | |
Puh. 0200 3000 (pvm/mpm) | |
Puh. 0200 3000 (pvm/mpm) | |
Pyydämme Teitä tarkistamaan tilillenne kirjatut erät ja säilyttämään ti | |
Pyydämme Teitä tarkistamaan tilillenne kirjatut erät ja säilyttämään tiliotteen kuittina maksuista. | |
Päivämäärä | |
Saaja ja tilinumero/Maksaja | |
Saldo | |
Siltasaarenkatu 12 | |
Sivu | |
TILIOTE | |
Verotuskuitti | |
[G0Lqsw7a-1_1.png] | |
EtuTili uudistuu. Nyt voit nostaa varoja ja tehdä tilisiirtoja ilman | |
nostorajoituksia, liittää EtuTilin hoitotiliksi myös rahasto-salkkuun j | |
a | |
arvo-osuustilille. Lisätietoja saat nordea.fi, konttoristamme tai | |
soittamalla Nordea Asiakaspalveluun 0200 3000 (pvm/mpm) ma-pe klo 8-20. | |
Pyydämme Teitä tarkistamaan tilillenne kirjatut erät ja säilyttämään | |
tiliotteen kuittina maksuista. | |
Hintamuutoksia 1.11.2010 alkaen. | |
Lisätietoja konttoreistamme, www.nordea.fi ja | |
Asiakaspalvelusta 0200 3000 (pvm/mpm) ma-pe 8-20. | |
__________________________________________________________________ | |
liotteen kuittina maksuista. | |
päivä | |
Puh. | |
K | |
. | |
9 | |
V | |
i | |
* | |
'''.strip().lower().splitlines() | |
re_asd = re.compile('\d\d[+-]$|\d\d[+-][^0-9A-Za-z]') | |
re_foo = re.compile('\d\d\.\d\d\.\d\d$') | |
re_foo1 = re.compile('^\d\d\.\d\d\s+[^0-9]') | |
re_foo2 = re.compile('^\d\d\.\d\d \d\d\.\d\d') | |
re_term = re.compile('^\d\d/\d\d$') | |
def get_legacy_pdf_stuff(paths): | |
for path in paths: | |
if not os.path.exists(path): continue | |
A=open(path,'rb').read().decode('latin1').splitlines() | |
A = [x.strip() for x in A | |
if x.lower().strip() not in Forbidden | |
and not x.strip().startswith('[') | |
and not x.strip().startswith('==>') | |
and x | |
] | |
values = [(i,x) for i,x in enumerate(A) if re_asd.search(x)] | |
if not values: | |
print('ASD', A) | |
continue | |
A = A[1+values[0][0]:] | |
i=0 | |
acc=[] | |
has_date2=False | |
has_date1=False | |
while A: | |
_has_date2 = re_foo2.search(A[0]) | |
if acc and _has_date2: | |
yield [path] + acc | |
acc=[] | |
elif not A[1:]: | |
acc.append(A[0]) | |
yield [path] + acc | |
del A[0] | |
else: | |
acc.append(A[0]) | |
del A[0] | |
has_date2 = any(re_foo2.search(x) for x in acc) | |
has_date1 = any(re_foo1.search(x) for x in acc) | |
def get_year(path, date): | |
''' | |
Very very very ad hoc for my data only. I had files like 01.pdf up to 83.pdf, | |
where the lexicographically earlier file corresponds to an older statement. | |
Actually the first statements I had were from 2008 or so. But I left them out | |
completely because pdftotext (and text highlight + copy) would only produce garbled output. | |
It was not worth the effort for me to deal with them so I just left them out. | |
''' | |
day, month = date.split('.') | |
if path <= '16' and month != '01': return '2010' | |
elif path <= '28': return '2011' | |
elif path <= '40': return '2012' | |
elif path <= '52': return '2013' | |
else: return '2014' | |
import csv | |
W=csv.writer(sys.stdout) | |
def handle_thing(x): | |
a=[] | |
for y in x: | |
if re_term.search(y): | |
break | |
a.append(y) | |
entry_date, value_date = a[1].split()[:2] | |
to = ' '.join(a[1].split()[2:]) | |
year = get_year(a[0], value_date) | |
value_date = '{}.{}'.format(value_date, year) | |
payment_date = '{}.{}'.format(value_date, year) #whatever | |
entry_date = '{}.{}'.format(entry_date, year) | |
amounts = [x for x in a if re_asd.search(x)] | |
if False and amounts[1:]: | |
print(a) | |
return | |
else: | |
amount = amounts[0][:-1] if amounts[0][-1] == '+' else '-' + amounts[0][:-1] | |
amount = decimal.Decimal(amount.replace('.', '').replace(',','.')) | |
beneficiary_or_remitter = to | |
account_number = None | |
bic = None | |
reference_number = None # a[0] | |
originators_reference = None | |
message = ' '.join(a[4:]) if a[4:] else None | |
receipt = None | |
card_number = None | |
transaction = None | |
W.writerow([entry_date, value_date, payment_date, amount, beneficiary_or_remitter, account_number, bic, transaction, reference_number, originators_reference, message, card_number, receipt, None]) | |
L = list(get_legacy_pdf_stuff(sys.argv[1:])) | |
prev = None | |
for x in L: | |
if prev is not None and prev[0] != x[0]: | |
prev = prev[:-4] | |
if prev: handle_thing(prev) | |
prev = x | |
prev = prev[:-4] | |
handle_thing(prev) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment