Last active
August 9, 2017 18:03
-
-
Save kosua20/613c32d8be702284deb74e6164c8575f to your computer and use it in GitHub Desktop.
LCL PDF account sheets extraction. Usage: `extract-lcl.py file-to-extract.pdf`. Based on https://github.com/aaugustin/lcl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#from __future__ import unicode_literals | |
import collections | |
import datetime | |
import decimal | |
import sys | |
from pdfminer.converter import PDFPageAggregator | |
from pdfminer.layout import LAParams, LTTextBox | |
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager | |
from pdfminer.pdfpage import PDFPage | |
class CustomConverter(PDFPageAggregator): | |
# Horizontal coordinates of vertical lines that delimit columns. | |
boundaries = [38, 70, 358, 404, 484, 560] | |
# Create consecutive pairs. | |
boundaries = zip(boundaries[:-1], boundaries[1:]) | |
def __init__(self, *args, **kwargs): | |
laparams = LAParams(char_margin=1.2, line_margin=0.1) | |
kwargs.setdefault('laparams', laparams) | |
super(CustomConverter, self).__init__(*args, **kwargs) | |
# Public variable holding relevant rows when processing is complete. | |
self.rows = [] | |
def receive_layout(self, ltpage): | |
"""Extract data rows.""" | |
# Group pieces of text by vertical coordinate: | |
rows = collections.defaultdict(lambda: [[] for _ in self.boundaries]) | |
for item in ltpage: | |
# If an item on the page is a text box... | |
if isinstance(item, LTTextBox): | |
# And it falls inside one of the "boundaries" range... | |
for col, (left, right) in enumerate(self.boundaries): | |
if left - 1 < item.x0 < item.x1 < right + 1: | |
# Store it in the corresponding column. | |
# Columns elements are grouped based on their vertical position. | |
rows[int(item.y0)][col].append(item.get_text().strip()) | |
break | |
# Sort rows. | |
sorted_rows = sorted(rows.items(), reverse=True) | |
# Merge close rows: | |
for (y1, row1), (y2, row2) in zip(sorted_rows[:-1], sorted_rows[1:]): | |
# If they are almost vertically aligned (6px arbitrary threshold)... | |
if y1 - y2 < 6: | |
# Merge their elements column-wise. | |
rows[y1] = [item1 + item2 for item1, item2 in zip(row1, row2)] | |
# Remove merged row. | |
del rows[y2] | |
# Process relevant rows and store their data: | |
for _, row in sorted(rows.items(), reverse=True): | |
row = [' '.join(item) for item in row] | |
if not all(row[:3]): | |
continue | |
# Skip headers. | |
if row == ['DATE', 'LIBELLE', 'VALEUR', 'DEBIT', 'CREDIT']: | |
continue | |
# Extract label, date, amount? | |
label = row[1] | |
day, month, year = map(int, row[2].split('.')) | |
date = datetime.date(2000 + year, month, day) | |
amount = '-' + row[3] if ',' in row[3] else row[4] | |
amount = decimal.Decimal(amount.replace(' ', '').replace(',', '.')) | |
# Store the result. | |
self.rows.append((label, date, amount)) | |
def render_image(self, name, stream): | |
"""Ignore images.""" | |
def paint_path(self, gstate, stroke, fill, evenodd, path): | |
"""Ignore paths.""" | |
# Argument. | |
if len(sys.argv) < 2: | |
print("Usage: extract-lcl.py file-to-extract.pdf") | |
exit() | |
# Files | |
filePath = sys.argv[1] | |
fp = file(filePath, 'rb') | |
# We will write the raw result in a txt with the same name as the input file. | |
fout = open(filePath.replace('.pdf','.txt'), 'w') | |
# PDF Miner setup | |
rsrcmgr = PDFResourceManager() | |
device = CustomConverter(rsrcmgr) | |
interpreter = PDFPageInterpreter(rsrcmgr, device) | |
# Extraction | |
for page in PDFPage.get_pages(fp): | |
interpreter.process_page(page) | |
rows = device.rows | |
#Write rows to file. | |
for label, date, amount in rows: | |
fout.write("{} {:+8.2f} {}\n".format(date, amount, label)) | |
fout.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment