Created
February 21, 2019 13:46
-
-
Save gnardini/48ed9e4f1ac2ca925951d335fe1a7d51 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from xlrd import open_workbook | |
import re | |
import json | |
date_regex = re.compile('^[0-9][0-9]-[0-9][0-9]$') | |
new_page_regex = re.compile('^Página [0-9]+ de [0-9]+$') | |
def is_new_page(sheet, row, number_of_columns): | |
val = str(sheet.cell(row, 0).value) | |
if val.startswith('El importe adeudado'): | |
return True | |
for col in range(number_of_columns): | |
val = str(sheet.cell(row, col).value) | |
if new_page_regex.match(val): | |
return True | |
return False | |
def starting_new_items(sheet, row, description_col): | |
return 'SALDO ANTERIOR' == str(sheet.cell(row, description_col).value) | |
def find_raw_cost(sheet, row, cost_col, number_of_columns): | |
cost_val = sheet.cell(row, cost_col).value | |
if not cost_val: | |
return '' | |
for col in range(cost_col + 1, number_of_columns): | |
val = sheet.cell(row, col).value | |
if val: | |
return cost_val | |
return sheet.cell(row, cost_col - 1).value | |
def parse_real_xls(file_name): | |
wb = open_workbook(file_name) | |
date_col = 0 | |
description_col = 1 | |
cost_col = 2 | |
items = [] | |
for sheet in wb.sheets(): | |
for row in range(7, sheet.nrows): | |
date = str(sheet.cell(row, date_col).value) | |
description = str(sheet.cell(row, description_col).value).replace('\n', '') | |
raw_cost = str(sheet.cell(row, cost_col).value)[1:] | |
cost = float(raw_cost.replace('.', '').replace(',', '.')) | |
if cost > 0: | |
items.append({ | |
'date': date, | |
'cost': cost, | |
'description': description | |
}) | |
return items | |
def parse_xls_from_pdf(file_name): | |
wb = open_workbook(file_name) | |
description_col = -1 | |
cost_col = -1 | |
active_item = False | |
active_paused = False | |
items = [] | |
for sheet in wb.sheets(): | |
number_of_rows = sheet.nrows | |
number_of_columns = sheet.ncols | |
rows = [] | |
for row in range(number_of_rows): | |
date = str(sheet.cell(row, 0).value) | |
if date == 'Fecha': | |
for col in range(number_of_columns): | |
val = str(sheet.cell(row, col).value) | |
if val == 'Descripción': | |
description_col = col | |
elif val == 'Débito(-)': | |
cost_col = col | |
if date is not None and date_regex.match(date): | |
try: | |
description = sheet.cell(row, description_col).value | |
raw_cost = find_raw_cost(sheet, row, cost_col, number_of_columns) | |
cost = float(raw_cost.replace('.', '').replace(',', '.')) | |
items.append({ | |
'date': date, | |
'cost': cost, | |
'description': description | |
}) | |
active_item = True | |
except ValueError: | |
active_item = False | |
pass | |
elif is_new_page(sheet, row, number_of_columns): | |
if active_item: | |
active_item = False | |
active_paused = True | |
elif starting_new_items(sheet, row, description_col): | |
if active_paused: | |
active_item = True | |
active_paused = False | |
elif active_item: | |
desc_cell = str(sheet.cell(row, description_col).value) | |
date_cell = str(sheet.cell(row, 0).value) | |
last_item = items[-1] | |
last_item['description'] = last_item['description'] + desc_cell + date_cell | |
return items | |
def add_if(data, category, keyword, description, cost, log = False): | |
if keyword in description: | |
data[category] += cost | |
if log: | |
print(description) | |
return True | |
return False | |
def add_item(data, item): | |
description = item['description'] | |
classified = False | |
classified = classified or add_if(data, 'super', 'DIA%', description, item['cost']) | |
classified = classified or add_if(data, 'super', 'COTO', description, item['cost']) | |
classified = classified or add_if(data, 'gas', 'MTGA', description, item['cost']) | |
classified = classified or add_if(data, 'luz', 'ENOR', description, item['cost']) | |
classified = classified or add_if(data, 'agua', 'AYSA', description, item['cost']) | |
classified = classified or add_if(data, 'abl', 'MNBA', description, item['cost']) | |
classified = classified or add_if(data, 'internet', 'CABLEVISION SA', description, item['cost']) | |
classified = classified or add_if(data, 'internet', 'TELECOM ARGENTIN PAGO ABONO', description, item['cost']) | |
places = ['BURGERTIFY', 'WILLIAMSBURG', 'THE ALCHEMIST', 'THE BARBEER', 'MANIAS', 'Express', | |
'SAY MY NAME', 'DEAN & DENNYS', 'CONCEPTO GASTRONOMICO'] | |
for place in places: | |
classified = classified or add_if(data, 'salidas', place, description, item['cost']) | |
classified = classified or add_if(data, 'efectivo', 'EXTRACCION CAJERO', description, item['cost']) | |
classified = classified or add_if(data, 'rent', '123123123', description, item['cost']) | |
classified = classified or add_if(data, 'rent', '456456456', description, item['cost']) | |
classified = classified or add_if(data, 'tarjeta', 'PAGO TARJETA VISA', description, item['cost']) | |
classified = classified or add_if(data, 'banco', 'COM. CUSTODIA DE TIT.', description, item['cost']) | |
classified = classified or add_if(data, 'banco', 'COM. BANELCO OTROS BCOS', description, item['cost']) | |
classified = classified or add_if(data, 'banco', 'IVA', description, item['cost']) | |
should_ignore = False | |
to_ignore = ['nombredepersona', '098908098'] | |
for ignore in to_ignore: | |
if ignore in description: | |
should_ignore = True | |
if not should_ignore: | |
data['total'] += item['cost'] | |
if not classified: | |
data['otro'] += item['cost'] | |
print(item) | |
items = parse_xls_from_pdf('resumen-galicia.xlsx') | |
real_items = parse_real_xls('report-galicia.xlsx') | |
total = {} | |
for item in real_items: | |
month = item['date'][3:] | |
if month not in total: | |
total[month] = { | |
'total': 0, | |
'super': 0, | |
'gas': 0, | |
'luz': 0, | |
'agua': 0, | |
'abl': 0, | |
'internet': 0, | |
'salidas': 0, | |
'efectivo': 0, | |
'tarjeta': 0, | |
'banco': 0, | |
'rent': 0, | |
'otro': 0 | |
} | |
add_item(total[month], item) | |
print(json.dumps(total, indent=4)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment