Skip to content

Instantly share code, notes, and snippets.

@eliask
Last active November 23, 2016 06:27
Show Gist options
  • Save eliask/4dea2b18e68cd6e09db19e61e3e79f16 to your computer and use it in GitHub Desktop.
Save eliask/4dea2b18e68cd6e09db19e61e3e79f16 to your computer and use it in GitHub Desktop.
Scrape transaction data from Nordea web bank PDF account statements
#!/usr/bin/env python3
#
# Hacky but functional script for extracting data from (Finnish) Nordea web bank account statement PDF files.
#
# 1. Presumes you have some account statements to clean up (for hledger, for instance).
# 2. The normal {Transactions,Tapahtumat}_<IBAN>.txt CSV files are by far superior to work with,
# but they are no longer available after just a year or two (better archive them while you can!).
# 3. I wanted to have more history available, so I devised this hacky method to extract it
# sort of cleanly from the PDF account statements in Finnish.
#
# Usage: Get the account statements, pdftotext them, pass the .txt-ified versions as arguments to this script. And profit.
import sys,re,decimal,csv,glob,os,decimal
from itertools import chain
import ujson
import datetime as dt
def get_nordea_csv_rows():
seen=set()
for path in glob.glob('Transactions*.txt'):
#print(path)
A=list(csv.reader(open(path, 'rt'), delimiter='\t', quotechar='\x12')) # Nordea fails at CSV. Ignore quotes.
header = A[2]
for row in A[4::2]:
key = tuple(zip(header, row))
if key in seen:
print('WARN: seen already: ', key, file=sys.stderr)
continue
seen.add(key)
d = dict(key)
assert d['Beneficiary/Remitter'] or d['Card number'], d
if True or b'digital' in d['Beneficiary/Remitter'].encode('utf-8').lower():
amount = decimal.Decimal(d['Amount'].replace('.','').replace(',','.'))
msg = d['Message'].strip()
yield amount, d['Payment date'], msg, path
Forbidden='''
*1011
-8
-E
00530 HELSINKI
0200 3000 (pvm/mpm)
1
1011 Helsinki-Hakaniemi
101135-493712
2
3
4
Nordea Asiakaspalvelu
ma-su 24h/vrk
29.11.2014
9/10
==> append : to filename to view the PDF source
=E
=K
=M
=V
Check-in -asiakas
Erittely
Helsinki, Y-tunnus 1680235-8
Kausi
Kirjaus- Arvop
KÄÄNNÄ
Käyttötili
Loppusaldo
Maksullinen kuitti
Maksup Viesti
Maksuton kuitti
Määrä
Nordea Pankki Suomi Oyj, Aleksanterinkatu 36, 00020 NORDEA, kotipaikka
Nordea Pankki Suomi Oyj, Aleksanterinkatu 36, 00020 NORDEA, kotipaikka Helsinki, Y-tunnus 1680235-8
Otot yhteensä
Panot yhteensä
Puh. 0200 3000 (pvm/mpm)
Puh. 0200 3000 (pvm/mpm)
Pyydämme Teitä tarkistamaan tilillenne kirjatut erät ja säilyttämään ti
Pyydämme Teitä tarkistamaan tilillenne kirjatut erät ja säilyttämään tiliotteen kuittina maksuista.
Päivämäärä
Saaja ja tilinumero/Maksaja
Saldo
Siltasaarenkatu 12
Sivu
TILIOTE
Verotuskuitti
[G0Lqsw7a-1_1.png]
EtuTili uudistuu. Nyt voit nostaa varoja ja tehdä tilisiirtoja ilman
nostorajoituksia, liittää EtuTilin hoitotiliksi myös rahasto-salkkuun j
a
arvo-osuustilille. Lisätietoja saat nordea.fi, konttoristamme tai
soittamalla Nordea Asiakaspalveluun 0200 3000 (pvm/mpm) ma-pe klo 8-20.
Pyydämme Teitä tarkistamaan tilillenne kirjatut erät ja säilyttämään
tiliotteen kuittina maksuista.
Hintamuutoksia 1.11.2010 alkaen.
Lisätietoja konttoreistamme, www.nordea.fi ja
Asiakaspalvelusta 0200 3000 (pvm/mpm) ma-pe 8-20.
__________________________________________________________________
liotteen kuittina maksuista.
päivä
Puh.
K
.
9
V
i
*
'''.strip().lower().splitlines()
re_asd = re.compile('\d\d[+-]$|\d\d[+-][^0-9A-Za-z]')
re_foo = re.compile('\d\d\.\d\d\.\d\d$')
re_foo1 = re.compile('^\d\d\.\d\d\s+[^0-9]')
re_foo2 = re.compile('^\d\d\.\d\d \d\d\.\d\d')
re_term = re.compile('^\d\d/\d\d$')
def get_legacy_pdf_stuff(paths):
for path in paths:
if not os.path.exists(path): continue
A=open(path,'rb').read().decode('latin1').splitlines()
A = [x.strip() for x in A
if x.lower().strip() not in Forbidden
and not x.strip().startswith('[')
and not x.strip().startswith('==>')
and x
]
values = [(i,x) for i,x in enumerate(A) if re_asd.search(x)]
if not values:
print('ASD', A)
continue
A = A[1+values[0][0]:]
i=0
acc=[]
has_date2=False
has_date1=False
while A:
_has_date2 = re_foo2.search(A[0])
if acc and _has_date2:
yield [path] + acc
acc=[]
elif not A[1:]:
acc.append(A[0])
yield [path] + acc
del A[0]
else:
acc.append(A[0])
del A[0]
has_date2 = any(re_foo2.search(x) for x in acc)
has_date1 = any(re_foo1.search(x) for x in acc)
def get_year(path, date):
'''
Very very very ad hoc for my data only. I had files like 01.pdf up to 83.pdf,
where the lexicographically earlier file corresponds to an older statement.
Actually the first statements I had were from 2008 or so. But I left them out
completely because pdftotext (and text highlight + copy) would only produce garbled output.
It was not worth the effort for me to deal with them so I just left them out.
'''
day, month = date.split('.')
if path <= '16' and month != '01': return '2010'
elif path <= '28': return '2011'
elif path <= '40': return '2012'
elif path <= '52': return '2013'
else: return '2014'
import csv
W=csv.writer(sys.stdout)
def handle_thing(x):
a=[]
for y in x:
if re_term.search(y):
break
a.append(y)
entry_date, value_date = a[1].split()[:2]
to = ' '.join(a[1].split()[2:])
year = get_year(a[0], value_date)
value_date = '{}.{}'.format(value_date, year)
payment_date = '{}.{}'.format(value_date, year) #whatever
entry_date = '{}.{}'.format(entry_date, year)
amounts = [x for x in a if re_asd.search(x)]
if False and amounts[1:]:
print(a)
return
else:
amount = amounts[0][:-1] if amounts[0][-1] == '+' else '-' + amounts[0][:-1]
amount = decimal.Decimal(amount.replace('.', '').replace(',','.'))
beneficiary_or_remitter = to
account_number = None
bic = None
reference_number = None # a[0]
originators_reference = None
message = ' '.join(a[4:]) if a[4:] else None
receipt = None
card_number = None
transaction = None
W.writerow([entry_date, value_date, payment_date, amount, beneficiary_or_remitter, account_number, bic, transaction, reference_number, originators_reference, message, card_number, receipt, None])
L = list(get_legacy_pdf_stuff(sys.argv[1:]))
prev = None
for x in L:
if prev is not None and prev[0] != x[0]:
prev = prev[:-4]
if prev: handle_thing(prev)
prev = x
prev = prev[:-4]
handle_thing(prev)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment