eliask · November 23, 2016 06:27
diff --git a/nordea_scrape_pdf_dumps.py b/nordea_scrape_pdf_dumps.py
 #!/usr/bin/env python3
 #
 # Hacky but functional script for extracting data from (Finnish) Nordea web bank account statement PDF files.
 # 
 # 1. Presumes you have some account statements to clean up (for hledger, for instance).
 # 2. The normal {Transactions,Tapahtumat}_<IBAN>.txt CSV files are by far superior to work with,
 #    but they are no longer available after just a year or two (better archive them while you can!).
 # 3. I wanted to have more history available, so I devised this hacky method to extract it
 #    sort of cleanly from the PDF account statements in Finnish.
 #
 # Usage: Get the account statements, pdftotext them, pass the .txt-ified versions as arguments to this script. And profit.
 
 import sys,re,decimal,csv,glob,os,decimal
 from itertools import chain
 import ujson
 import datetime as dt

 def get_nordea_csv_rows():
    seen=set()
    for path in glob.glob('Transactions*.txt'):
        #print(path)
        A=list(csv.reader(open(path, 'rt'), delimiter='\t', quotechar='\x12')) # Nordea fails at CSV. Ignore quotes.
        header = A[2]
        for row in A[4::2]:
            key = tuple(zip(header, row))
            if key in seen:
                print('WARN: seen already: ', key, file=sys.stderr)
                continue

            seen.add(key)
            d = dict(key)
            assert d['Beneficiary/Remitter'] or d['Card number'], d

            if True or b'digital' in d['Beneficiary/Remitter'].encode('utf-8').lower():
                amount = decimal.Decimal(d['Amount'].replace('.','').replace(',','.'))
                msg = d['Message'].strip()
                yield  amount, d['Payment date'], msg, path


 Forbidden='''
 *1011
 -8
 -E
 00530 HELSINKI
 0200 3000 (pvm/mpm)
 1
 1011 Helsinki-Hakaniemi
 101135-493712
 2
 3
 4
 Nordea Asiakaspalvelu
 ma-su 24h/vrk
 29.11.2014
 9/10
 ==> append : to filename to view the PDF source
 =E
 =K
 =M
 =V
 Check-in -asiakas
 Erittely
 Helsinki, Y-tunnus 1680235-8
 Kausi
 Kirjaus- Arvop
 KÄÄNNÄ
 Käyttötili
 Loppusaldo
 Maksullinen kuitti
 Maksup Viesti
 Maksuton kuitti
 Määrä
 Nordea Pankki Suomi Oyj, Aleksanterinkatu 36, 00020 NORDEA, kotipaikka
 Nordea Pankki Suomi Oyj, Aleksanterinkatu 36, 00020 NORDEA, kotipaikka Helsinki, Y-tunnus 1680235-8
 Otot yhteensä
 Panot yhteensä
 Puh.  0200 3000 (pvm/mpm)
 Puh. 0200 3000 (pvm/mpm)
 Pyydämme Teitä tarkistamaan tilillenne kirjatut erät ja säilyttämään ti
 Pyydämme Teitä tarkistamaan tilillenne kirjatut erät ja säilyttämään tiliotteen kuittina maksuista.
 Päivämäärä
 Saaja ja tilinumero/Maksaja
 Saldo
 Siltasaarenkatu 12
 Sivu
 TILIOTE
 Verotuskuitti
 [G0Lqsw7a-1_1.png]
 EtuTili uudistuu. Nyt voit nostaa varoja ja tehdä tilisiirtoja ilman
 nostorajoituksia, liittää EtuTilin hoitotiliksi myös rahasto-salkkuun j
 a
 arvo-osuustilille. Lisätietoja saat nordea.fi, konttoristamme tai
 soittamalla Nordea Asiakaspalveluun 0200 3000 (pvm/mpm) ma-pe klo 8-20.
 Pyydämme Teitä tarkistamaan tilillenne kirjatut erät ja säilyttämään
 tiliotteen kuittina maksuista.
 Hintamuutoksia 1.11.2010 alkaen.
 Lisätietoja konttoreistamme, www.nordea.fi ja
 Asiakaspalvelusta 0200 3000 (pvm/mpm) ma-pe 8-20.
 __________________________________________________________________
 liotteen kuittina maksuista.
 päivä
 Puh.
 K
 .
 9
 V
 i
 *
 '''.strip().lower().splitlines()



 re_asd = re.compile('\d\d[+-]$|\d\d[+-][^0-9A-Za-z]')
 re_foo = re.compile('\d\d\.\d\d\.\d\d$')
 re_foo1 = re.compile('^\d\d\.\d\d\s+[^0-9]')
 re_foo2 = re.compile('^\d\d\.\d\d \d\d\.\d\d')
 re_term = re.compile('^\d\d/\d\d$')
 def get_legacy_pdf_stuff(paths):
    for path in paths:
        if not os.path.exists(path): continue
        A=open(path,'rb').read().decode('latin1').splitlines()

        A = [x.strip() for x in A
             if x.lower().strip() not in Forbidden
             and not x.strip().startswith('[')
             and not x.strip().startswith('==>')
             and x
        ]

        values = [(i,x) for i,x in enumerate(A) if re_asd.search(x)]
        if not values:
            print('ASD', A)
            continue
        A = A[1+values[0][0]:]

        i=0
        acc=[]
        has_date2=False
        has_date1=False
        while A:
            _has_date2 = re_foo2.search(A[0])
            if acc and _has_date2:
                yield [path] + acc
                acc=[]
            elif not A[1:]:
                acc.append(A[0])
                yield [path] + acc
                del A[0]
            else:
                acc.append(A[0])
                del A[0]
            has_date2 = any(re_foo2.search(x) for x in acc)
            has_date1 = any(re_foo1.search(x) for x in acc)

 def get_year(path, date):
    '''
    Very very very ad hoc for my data only. I had files like 01.pdf up to 83.pdf,
    where the lexicographically earlier file corresponds to an older statement.

    Actually the first statements I had were from 2008 or so. But I left them out
    completely because pdftotext (and text highlight + copy) would only produce garbled output.

    It was not worth the effort for me to deal with them so I just left them out.
    '''
    day, month = date.split('.')
    if path <= '16' and month != '01': return '2010'
    elif path <= '28': return '2011'
    elif path <= '40': return '2012'
    elif path <= '52': return '2013'
    else: return '2014'

 import csv
 W=csv.writer(sys.stdout)
 def handle_thing(x):
    a=[]
    for y in x:
        if re_term.search(y):
            break
        a.append(y)

    entry_date, value_date = a[1].split()[:2]
    to = ' '.join(a[1].split()[2:])
    year = get_year(a[0], value_date)
    value_date = '{}.{}'.format(value_date, year)
    payment_date = '{}.{}'.format(value_date, year) #whatever
    entry_date = '{}.{}'.format(entry_date, year)

    amounts = [x for x in a if re_asd.search(x)]

    if False and amounts[1:]:
        print(a)
        return
    else:
        amount = amounts[0][:-1] if amounts[0][-1] == '+' else '-' + amounts[0][:-1]
        amount = decimal.Decimal(amount.replace('.', '').replace(',','.'))
        beneficiary_or_remitter = to
        account_number = None
        bic = None
        reference_number = None # a[0]
        originators_reference = None
        message = ' '.join(a[4:]) if a[4:] else None
        receipt = None
        card_number = None
        transaction = None

    W.writerow([entry_date, value_date, payment_date, amount, beneficiary_or_remitter, account_number, bic, transaction, reference_number, originators_reference, message, card_number, receipt, None])


 L = list(get_legacy_pdf_stuff(sys.argv[1:]))
 prev = None
 for x in L:
    if prev is not None and prev[0] != x[0]:
        prev = prev[:-4]

    if prev: handle_thing(prev)
    prev = x

 prev = prev[:-4]
 handle_thing(prev)
	#!/usr/bin/env python3
	#
	# Hacky but functional script for extracting data from (Finnish) Nordea web bank account statement PDF files.
	#
	# 1. Presumes you have some account statements to clean up (for hledger, for instance).
	# 2. The normal {Transactions,Tapahtumat}_<IBAN>.txt CSV files are by far superior to work with,
	# but they are no longer available after just a year or two (better archive them while you can!).
	# 3. I wanted to have more history available, so I devised this hacky method to extract it
	# sort of cleanly from the PDF account statements in Finnish.
	#
	# Usage: Get the account statements, pdftotext them, pass the .txt-ified versions as arguments to this script. And profit.

	import sys,re,decimal,csv,glob,os,decimal
	from itertools import chain
	import ujson
	import datetime as dt

	def get_nordea_csv_rows():
	seen=set()
	for path in glob.glob('Transactions*.txt'):
	#print(path)
	A=list(csv.reader(open(path, 'rt'), delimiter='\t', quotechar='\x12')) # Nordea fails at CSV. Ignore quotes.
	header = A[2]
	for row in A[4::2]:
	key = tuple(zip(header, row))
	if key in seen:
	print('WARN: seen already: ', key, file=sys.stderr)
	continue

	seen.add(key)
	d = dict(key)
	assert d['Beneficiary/Remitter'] or d['Card number'], d

	if True or b'digital' in d['Beneficiary/Remitter'].encode('utf-8').lower():
	amount = decimal.Decimal(d['Amount'].replace('.','').replace(',','.'))
	msg = d['Message'].strip()
	yield amount, d['Payment date'], msg, path


	Forbidden='''
	*1011
	-8
	-E
	00530 HELSINKI
	0200 3000 (pvm/mpm)
	1
	1011 Helsinki-Hakaniemi
	101135-493712
	2
	3
	4
	Nordea Asiakaspalvelu
	ma-su 24h/vrk
	29.11.2014
	9/10
	==> append : to filename to view the PDF source
	=E
	=K
	=M
	=V
	Check-in -asiakas
	Erittely
	Helsinki, Y-tunnus 1680235-8
	Kausi
	Kirjaus- Arvop
	KÄÄNNÄ
	Käyttötili
	Loppusaldo
	Maksullinen kuitti
	Maksup Viesti
	Maksuton kuitti
	Määrä
	Nordea Pankki Suomi Oyj, Aleksanterinkatu 36, 00020 NORDEA, kotipaikka
	Nordea Pankki Suomi Oyj, Aleksanterinkatu 36, 00020 NORDEA, kotipaikka Helsinki, Y-tunnus 1680235-8
	Otot yhteensä
	Panot yhteensä
	Puh. 0200 3000 (pvm/mpm)
	Puh. 0200 3000 (pvm/mpm)
	Pyydämme Teitä tarkistamaan tilillenne kirjatut erät ja säilyttämään ti
	Pyydämme Teitä tarkistamaan tilillenne kirjatut erät ja säilyttämään tiliotteen kuittina maksuista.
	Päivämäärä
	Saaja ja tilinumero/Maksaja
	Saldo
	Siltasaarenkatu 12
	Sivu
	TILIOTE
	Verotuskuitti
	[G0Lqsw7a-1_1.png]
	EtuTili uudistuu. Nyt voit nostaa varoja ja tehdä tilisiirtoja ilman
	nostorajoituksia, liittää EtuTilin hoitotiliksi myös rahasto-salkkuun j
	a
	arvo-osuustilille. Lisätietoja saat nordea.fi, konttoristamme tai
	soittamalla Nordea Asiakaspalveluun 0200 3000 (pvm/mpm) ma-pe klo 8-20.
	Pyydämme Teitä tarkistamaan tilillenne kirjatut erät ja säilyttämään
	tiliotteen kuittina maksuista.
	Hintamuutoksia 1.11.2010 alkaen.
	Lisätietoja konttoreistamme, www.nordea.fi ja
	Asiakaspalvelusta 0200 3000 (pvm/mpm) ma-pe 8-20.
	__________________________________________________________________
	liotteen kuittina maksuista.
	päivä
	Puh.
	K
	.
	9
	V
	i
	*
	'''.strip().lower().splitlines()



	re_asd = re.compile('\d\d[+-]$\|\d\d[+-][^0-9A-Za-z]')
	re_foo = re.compile('\d\d\.\d\d\.\d\d$')
	re_foo1 = re.compile('^\d\d\.\d\d\s+[^0-9]')
	re_foo2 = re.compile('^\d\d\.\d\d \d\d\.\d\d')
	re_term = re.compile('^\d\d/\d\d$')
	def get_legacy_pdf_stuff(paths):
	for path in paths:
	if not os.path.exists(path): continue
	A=open(path,'rb').read().decode('latin1').splitlines()

	A = [x.strip() for x in A
	if x.lower().strip() not in Forbidden
	and not x.strip().startswith('[')
	and not x.strip().startswith('==>')
	and x
	]

	values = [(i,x) for i,x in enumerate(A) if re_asd.search(x)]
	if not values:
	print('ASD', A)
	continue
	A = A[1+values[0][0]:]

	i=0
	acc=[]
	has_date2=False
	has_date1=False
	while A:
	_has_date2 = re_foo2.search(A[0])
	if acc and _has_date2:
	yield [path] + acc
	acc=[]
	elif not A[1:]:
	acc.append(A[0])
	yield [path] + acc
	del A[0]
	else:
	acc.append(A[0])
	del A[0]
	has_date2 = any(re_foo2.search(x) for x in acc)
	has_date1 = any(re_foo1.search(x) for x in acc)

	def get_year(path, date):
	'''
	Very very very ad hoc for my data only. I had files like 01.pdf up to 83.pdf,
	where the lexicographically earlier file corresponds to an older statement.

	Actually the first statements I had were from 2008 or so. But I left them out
	completely because pdftotext (and text highlight + copy) would only produce garbled output.

	It was not worth the effort for me to deal with them so I just left them out.
	'''
	day, month = date.split('.')
	if path <= '16' and month != '01': return '2010'
	elif path <= '28': return '2011'
	elif path <= '40': return '2012'
	elif path <= '52': return '2013'
	else: return '2014'

	import csv
	W=csv.writer(sys.stdout)
	def handle_thing(x):
	a=[]
	for y in x:
	if re_term.search(y):
	break
	a.append(y)

	entry_date, value_date = a[1].split()[:2]
	to = ' '.join(a[1].split()[2:])
	year = get_year(a[0], value_date)
	value_date = '{}.{}'.format(value_date, year)
	payment_date = '{}.{}'.format(value_date, year) #whatever
	entry_date = '{}.{}'.format(entry_date, year)

	amounts = [x for x in a if re_asd.search(x)]

	if False and amounts[1:]:
	print(a)
	return
	else:
	amount = amounts[0][:-1] if amounts[0][-1] == '+' else '-' + amounts[0][:-1]
	amount = decimal.Decimal(amount.replace('.', '').replace(',','.'))
	beneficiary_or_remitter = to
	account_number = None
	bic = None
	reference_number = None # a[0]
	originators_reference = None
	message = ' '.join(a[4:]) if a[4:] else None
	receipt = None
	card_number = None
	transaction = None

	W.writerow([entry_date, value_date, payment_date, amount, beneficiary_or_remitter, account_number, bic, transaction, reference_number, originators_reference, message, card_number, receipt, None])


	L = list(get_legacy_pdf_stuff(sys.argv[1:]))
	prev = None
	for x in L:
	if prev is not None and prev[0] != x[0]:
	prev = prev[:-4]

	if prev: handle_thing(prev)
	prev = x

	prev = prev[:-4]
	handle_thing(prev)