Created
October 29, 2013 12:37
-
-
Save sergray/7213857 to your computer and use it in GitHub Desktop.
Python PDF utilities
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf8 | |
""" | |
Generates a CSV report from Avangard Bank PDF reports about funds received
into a transit foreign-currency account.
By default it reads from standard input the data extracted from the PDF with
the textpdf.py utility.
The generated CSV report is printed to standard output and contains the following columns:
* date - date the funds were received
* doc - notice number
* usd - amount received, in dollars
* xchg - exchange rate on the date of receipt
* rub - amount received, in rubles
""" | |
import csv | |
import re | |
from decimal import Decimal | |
# Replace with your own account number (format: \d{21}\s{1}\d{2}).
ACCOUNT = ''

# Matches the first line of a notice record:
#   <prefix> TAB <dd/dd/dd date> SPACE <doc number> TAB <ACCOUNT> TAB <dollars>
# ACCOUNT is spliced into the pattern verbatim, so it is interpreted as a
# regular-expression fragment, not as literal text.
# Raw strings are used so that \d, \t and \. reach the regex engine as regex
# escapes instead of being (invalid) string-literal escapes.
# NOTE(review): the dollars group rejects a leading "0" and needs at least two
# characters before the decimal point, so amounts below 10.00 never match --
# confirm this is intended.
rx = re.compile(
    r"(?P<prefix>\d+)\t(?P<date>\d{2}/\d{2}/\d{2}) (?P<doc>\d+)\t"
    + ACCOUNT
    + r"\t(?P<dollars>[^0][\d ]+\.\d{2})"
)
def main(src):
    """Write a CSV report of received funds to standard output.

    ``src`` is an iterable of tab-delimited text lines.  Every line matched
    by the module-level ``rx`` pattern starts a record.  The line right after
    it (the "Итого" total line) is skipped, and the following three lines
    supply, in their third tab-separated field, the dollar amount, the ruble
    amount and the exchange rate, in that order.

    The header ``date, doc, usd, xchg, rub`` and the collected rows are
    written only when at least one record was found.

    Raises:
        AssertionError: if the dollar amount on the matched line differs
            from the one in the detail lines, or if ``rate * dollars``
            rounded to two decimal places differs from the ruble amount.
        StopIteration: if the input ends in the middle of a record.
    """
    # Imported locally: the module itself imports ``sys`` only under the
    # ``__main__`` guard, so relying on the global breaks ``import`` use.
    import sys

    delim = '\t'
    cent = Decimal('0.01')

    def decimal_value(text):
        # Amounts use spaces as thousands separators; drop them first.
        return Decimal(text.replace(' ', ''))

    def extract_field(line, col):
        return line.split(delim)[col]

    # One shared iterator lets the loop body consume the follow-up lines of
    # a record; ``iter()`` also makes plain lists acceptable as ``src``.
    lines = iter(src)
    rows = []
    for line in lines:
        mo = rx.match(line)
        if not mo:
            continue
        next(lines)  # skip the "Итого" (total) line
        dollars = decimal_value(extract_field(next(lines), 2))
        rubles = decimal_value(extract_field(next(lines), 2))
        exch_rate = decimal_value(extract_field(next(lines), 2))

        # Sanity checks.  Raised explicitly (rather than via ``assert``) so
        # they are not stripped when Python runs with -O.
        if decimal_value(mo.group('dollars')) != dollars:
            raise AssertionError(
                'dollar amount mismatch for document %s' % mo.group('doc'))
        calc_rub = exch_rate * dollars
        if calc_rub.quantize(cent) != rubles:
            raise AssertionError(
                'ruble amount mismatch for document %s' % mo.group('doc'))

        rows.append(
            [mo.group('date'), mo.group('doc'), dollars, exch_rate, rubles])

    if rows:
        out = csv.writer(sys.stdout)
        out.writerow(['date', 'doc', 'usd', 'xchg', 'rub'])
        out.writerows(rows)
if __name__ == '__main__':
    # Script entry point: convert the text arriving on standard input.
    # This must remain a plain module-level ``import sys``: main() looks the
    # name ``sys`` up as a module global when it creates the CSV writer.
    import sys

    main(sys.stdin)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Prints inline text boxes from PDF using pdfminer. | |
Assumes that the PDF is autogenerated and that inline boxes are at the same height and have the same font size/style.
Requires pdf2txt.py from pdfminer and lxml, so please do:: | |
pip install pdfminer lxml | |
Usage example:: | |
pdf2txt.py -t xml input.pdf | python text_pdf.py | |
""" | |
from __future__ import print_function | |
from lxml import etree | |
from itertools import groupby | |
def line2box(textline):
    """Turn a ``<textline>`` element into a sortable ``(-y, x, text)`` tuple.

    Args:
        textline: XML element carrying a ``bbox`` attribute made of exactly
            four comma-separated numbers, with one child element per piece
            of text.

    Returns:
        A tuple ``(-y0, x0, word)`` where ``x0`` and ``y0`` are the first
        and second bbox numbers and ``word`` is the concatenated text of the
        children.  Sorting these tuples orders boxes by descending ``y0``,
        then ascending ``x0``.

    Raises:
        ValueError: if ``bbox`` does not hold exactly four numbers.
    """
    # presumably pdfminer's (x0, y0, x1, y1) box with y growing upwards, so
    # negating y0 puts the top of the page first -- confirm against pdfminer.
    x0, y0, _x1, _y1 = [float(v) for v in textline.get('bbox').split(',')]
    # Iterate the element directly (getchildren() is deprecated) and treat
    # children without text as empty instead of crashing the join on None.
    word = u''.join(child.text or u'' for child in textline)
    return -y0, x0, word
def process(page):
    """Print the text boxes of one ``<page>`` element, one row per height.

    Every ``<textline>`` below ``page`` is converted with ``line2box`` and
    the results are sorted.  Boxes sharing exactly the same first tuple
    item (the negated vertical coordinate) are printed on a single line as
    tab-separated, right-stripped strings, prefixed with the page ``id``
    attribute.

    Args:
        page: element offering ``xpath()`` and ``get()``.
    """
    # sorted() works on both Python 2 and 3; ``map(...).sort()`` fails on
    # Python 3 where map() returns an iterator.
    boxes = sorted(line2box(textline) for textline in page.xpath('.//textline'))
    page_id = page.get('id')
    for _, row in groupby(boxes, lambda b: b[0]):
        words = [b[2].rstrip() for b in row]
        if str is bytes:
            # Python 2 only: encode explicitly so unicode text can be
            # written to a pipe.  On Python 3 print() encodes by itself and
            # bytes objects would show up as b'...' reprs.
            words = [w.encode('utf-8') for w in words]
        print(page_id, *words, sep='\t')
def main(src):
    """Parse the XML document in ``src`` and print each ``<page>`` in turn.

    Args:
        src: anything ``etree.parse`` accepts; the script passes standard
            input.
    """
    document = etree.parse(src)
    for page in document.xpath('//page'):
        process(page)
if __name__ == '__main__':
    # Script entry point: read the XML produced by pdf2txt.py from stdin.
    import sys

    main(sys.stdin)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment