Last active
September 16, 2019 09:40
-
-
Save artkpv/28982b2ba7af373607e6fcb21b099a91 to your computer and use it in GitHub Desktop.
Convert PDF to CSV for Rocketbank (Рокетбанк)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/python3
"""
Convert Rocketbank (https://rocketbank.ru) transactions from PDF to CSV.

The output is written in UTF-8.

Dependencies:
- Poppler. 'pdftotext' must be available in PATH.
  https://poppler.freedesktop.org/
  https://jlk.fjfi.cvut.cz/arch/manpages/man/pdftotext.1

Author: w1ld at inbox dot ru

Releases:
- Before 2019-09-16. Using SumatraPDF.
- 2019-09-16. Using Poppler. Adding checks for a day. Output to STDOUT.
"""
import csv
import re
import subprocess
import sys
from decimal import Decimal
from subprocess import getoutput
# Name of the Poppler command-line tool used to extract text from the PDF.
PDFTOTEXTCLI = 'pdftotext'

# sys.argv[0] is always the script name, so a missing PDF argument shows up
# as len(sys.argv) < 2 (the list is never empty).
if len(sys.argv) < 2:
    # sys.exit(str) prints the text to stderr, keeping stdout (where the CSV
    # goes) clean, and ends the script with a non-zero exit status.
    sys.exit('Usage: import-rocket-pdftocsv.py [pdf]')
def parse_amount(amount_raw):
    """Convert a statement amount such as '-1 234,56' into a Decimal.

    Spaces are removed and the decimal comma is replaced with a dot before
    the text is handed to Decimal.

    Raises:
        ValueError: if amount_raw is empty (or None).
        decimal.InvalidOperation: if the cleaned text is not a valid number.
    """
    if not amount_raw:
        # Explicit check rather than `assert`, which is stripped by `python -O`.
        raise ValueError('Amount is empty.')
    return Decimal(amount_raw.replace(' ', '').replace(',', '.'))
# Building blocks for the regular expressions below.
# Currency code: one to four capital Latin letters.
currency_p = '[A-Z]{1,4}'
# Amount: optional minus sign, digits optionally grouped with spaces, an
# optional ",<digits>" fraction, one space and the currency code.
# Raw string so that "\d" reaches the regex engine without relying on
# Python's handling of invalid string escapes.
amount_p = r'-?\d[\d ]*(,\d+)? ' + currency_p
def parse_amount_currency(amount_currency):
    """Split an '<amount> <currency>' string into its two parts.

    The currency is the text after the last space; everything before that
    space is converted with parse_amount().

    Returns:
        An ``(amount, currency)`` tuple.
    """
    amount_text, _, currency = amount_currency.rpartition(' ')
    return parse_amount(amount_text), currency
# All pattern literals are raw strings so that regex escapes (\s, \d, \.)
# are passed to the regex engine verbatim.

# One transaction: an empty line, then a line made of an indent (group 1),
# the first description line (d1), two or more spaces and the amount (a);
# then zero or more continuation lines starting with exactly the same indent
# (d2); then a newline followed by an empty line.
# "{{2,}}" is an escaped "{2,}" because the template goes through str.format.
transaction_p_base = r'(?m)^$^\n( +)(?P<d1>\S[^\n]+\S) {{2,}}(?P<a>{})(?P<d2>(\n\1\S[^\n]*)*)\n^$'
transaction_re = re.compile(transaction_p_base.format(amount_p))
# Column captions of the transactions table.
table_header_re = re.compile(r'\s*Дата\s+Описание\s+Расход\s+Приход\s+Входящий остаток\s*')
# Totals line ("Итог:") followed by two amounts.
table_bottom_re = re.compile(r'^\s*Итог:\s+(?P<income>' + amount_p + r')\s+(?P<outcom>' + amount_p + r')\s*$')
# A line consisting of a DD.MM.YYYY date followed by an amount.
date_re = re.compile(r'(?m)^(?P<date>\d{2}\.\d{2}\.\d{4})\s+(?P<d_amount>' + amount_p + ')$')
# Incoming ("Входящий остаток") and outgoing ("Исходящий остаток") balances.
total_in_re = re.compile(r'Входящий остаток:\s+(' + amount_p + ')')
total_out_re = re.compile(r'Исходящий остаток:\s+(' + amount_p + ')')
def _transaction_iterator(pdftext):
    """Yield one row per transaction found in the statement text.

    The text is scanned from the table header onwards, one date line at a
    time.  A running balance, seeded from the incoming balance, is compared
    with the amount printed on every date line and, at the end, with the
    outgoing balance.

    Args:
        pdftext: statement text as produced by ``pdftotext -layout``.

    Yields:
        ``[date, description, amount, currency, balance]`` lists, where
        ``balance`` is the running balance after the transaction.

    Raises:
        ValueError: if the table header or a total is missing, or if a
            transaction currency differs from the incoming balance currency.
        Exception: if the running balance disagrees with the statement.
    """
    # Explicit checks instead of `assert`, which is stripped by `python -O`.
    header_m = table_header_re.search(pdftext)
    if not header_m:
        raise ValueError('Transactions table header not found.')

    total_in_m = total_in_re.search(pdftext)
    if not total_in_m:
        raise ValueError('Incoming balance not found.')
    balance_amount, balance_currency = parse_amount_currency(total_in_m.group(1))

    total_out_m = total_out_re.search(pdftext)
    if not total_out_m:
        raise ValueError('Outgoing balance not found.')
    balance_out_amount, _ = parse_amount_currency(total_out_m.group(1))

    # Walk the date lines that follow the table header.
    date_m = date_re.search(pdftext, header_m.end(0))
    while date_m:
        date_balance, _ = parse_amount_currency(date_m.group('d_amount'))
        if balance_amount != date_balance:
            raise Exception('Invalid balance: {} != {}.'.format(balance_amount, date_balance))

        # The day's section runs up to the next date line, or to the end of
        # the text.  `endpos` is exclusive, so the end is len(pdftext).
        next_date_m = date_re.search(pdftext, date_m.end(0))
        from_ = date_m.start(0)
        to_ = next_date_m.start(0) if next_date_m else len(pdftext)
        date_val = date_m.group('date')

        # Iterate over all transactions in this day.
        for transaction_m in transaction_re.finditer(pdftext, pos=from_, endpos=to_):
            description = transaction_m.group('d1')
            amount, currency = parse_amount_currency(transaction_m.group('a'))
            if currency != balance_currency:
                raise ValueError(
                    'Unexpected currency: {} != {}.'.format(currency, balance_currency))
            # For checks: an expense comes with '-', an income is positive.
            balance_amount += amount

            d_remain = transaction_m.group('d2')
            if d_remain:
                # Fold the continuation lines into one line: collapse runs of
                # spaces (the indents), append, then drop the newlines.
                d_remain = re.sub(r' {2,}', r' ', d_remain.strip())
                description += ' ' + d_remain
                description = description.replace('\n', '')

            yield [date_val, description, amount, currency, balance_amount]

        date_m = next_date_m

    # Check balance for last day.
    if balance_amount != balance_out_amount:
        raise Exception('Invalid balance: {} != {}'.format(balance_amount, balance_out_amount))
# Script body: extract the text of the PDF with pdftotext and write the
# transactions to stdout as CSV.
pdffilename = sys.argv[1]

try:
    # The arguments are passed as a list (no shell), so a file name with
    # quotes or other shell metacharacters cannot break or alter the command.
    # "-enc UTF-8" fixes the output encoding so it can be decoded explicitly
    # instead of depending on the locale; "-" sends the text to stdout.
    pdfbytes = subprocess.check_output(
        [PDFTOTEXTCLI, '-layout', '-enc', 'UTF-8', pdffilename, '-'])
except OSError as err:
    sys.exit("Cannot run '{}': {}".format(PDFTOTEXTCLI, err))
except subprocess.CalledProcessError as err:
    sys.exit("'{}' failed with exit code {}.".format(PDFTOTEXTCLI, err.returncode))

# Match what subprocess.getoutput used to hand to the parser: newlines are
# normalised to '\n' and a single trailing newline is dropped.
pdftext = pdfbytes.decode('utf-8').replace('\r\n', '\n').replace('\r', '\n')
if pdftext.endswith('\n'):
    pdftext = pdftext[:-1]
if not pdftext:
    sys.exit("No text extracted from '{}'.".format(pdffilename))

writer = csv.writer(sys.stdout)
writer.writerow(['date', 'description', 'amount', 'currency', 'balance'])
for trn in _transaction_iterator(pdftext):
    writer.writerow(trn)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment