Skip to content

Instantly share code, notes, and snippets.

@gnardini
Created February 21, 2019 13:46
Show Gist options
  • Save gnardini/48ed9e4f1ac2ca925951d335fe1a7d51 to your computer and use it in GitHub Desktop.
Save gnardini/48ed9e4f1ac2ca925951d335fe1a7d51 to your computer and use it in GitHub Desktop.
from xlrd import open_workbook
import re
import json
date_regex = re.compile('^[0-9][0-9]-[0-9][0-9]$')
new_page_regex = re.compile('^Página [0-9]+ de [0-9]+$')
def is_new_page(sheet, row, number_of_columns):
val = str(sheet.cell(row, 0).value)
if val.startswith('El importe adeudado'):
return True
for col in range(number_of_columns):
val = str(sheet.cell(row, col).value)
if new_page_regex.match(val):
return True
return False
def starting_new_items(sheet, row, description_col):
return 'SALDO ANTERIOR' == str(sheet.cell(row, description_col).value)
def find_raw_cost(sheet, row, cost_col, number_of_columns):
cost_val = sheet.cell(row, cost_col).value
if not cost_val:
return ''
for col in range(cost_col + 1, number_of_columns):
val = sheet.cell(row, col).value
if val:
return cost_val
return sheet.cell(row, cost_col - 1).value
def parse_real_xls(file_name):
wb = open_workbook(file_name)
date_col = 0
description_col = 1
cost_col = 2
items = []
for sheet in wb.sheets():
for row in range(7, sheet.nrows):
date = str(sheet.cell(row, date_col).value)
description = str(sheet.cell(row, description_col).value).replace('\n', '')
raw_cost = str(sheet.cell(row, cost_col).value)[1:]
cost = float(raw_cost.replace('.', '').replace(',', '.'))
if cost > 0:
items.append({
'date': date,
'cost': cost,
'description': description
})
return items
def parse_xls_from_pdf(file_name):
wb = open_workbook(file_name)
description_col = -1
cost_col = -1
active_item = False
active_paused = False
items = []
for sheet in wb.sheets():
number_of_rows = sheet.nrows
number_of_columns = sheet.ncols
rows = []
for row in range(number_of_rows):
date = str(sheet.cell(row, 0).value)
if date == 'Fecha':
for col in range(number_of_columns):
val = str(sheet.cell(row, col).value)
if val == 'Descripción':
description_col = col
elif val == 'Débito(-)':
cost_col = col
if date is not None and date_regex.match(date):
try:
description = sheet.cell(row, description_col).value
raw_cost = find_raw_cost(sheet, row, cost_col, number_of_columns)
cost = float(raw_cost.replace('.', '').replace(',', '.'))
items.append({
'date': date,
'cost': cost,
'description': description
})
active_item = True
except ValueError:
active_item = False
pass
elif is_new_page(sheet, row, number_of_columns):
if active_item:
active_item = False
active_paused = True
elif starting_new_items(sheet, row, description_col):
if active_paused:
active_item = True
active_paused = False
elif active_item:
desc_cell = str(sheet.cell(row, description_col).value)
date_cell = str(sheet.cell(row, 0).value)
last_item = items[-1]
last_item['description'] = last_item['description'] + desc_cell + date_cell
return items
def add_if(data, category, keyword, description, cost, log = False):
if keyword in description:
data[category] += cost
if log:
print(description)
return True
return False
def add_item(data, item):
description = item['description']
classified = False
classified = classified or add_if(data, 'super', 'DIA%', description, item['cost'])
classified = classified or add_if(data, 'super', 'COTO', description, item['cost'])
classified = classified or add_if(data, 'gas', 'MTGA', description, item['cost'])
classified = classified or add_if(data, 'luz', 'ENOR', description, item['cost'])
classified = classified or add_if(data, 'agua', 'AYSA', description, item['cost'])
classified = classified or add_if(data, 'abl', 'MNBA', description, item['cost'])
classified = classified or add_if(data, 'internet', 'CABLEVISION SA', description, item['cost'])
classified = classified or add_if(data, 'internet', 'TELECOM ARGENTIN PAGO ABONO', description, item['cost'])
places = ['BURGERTIFY', 'WILLIAMSBURG', 'THE ALCHEMIST', 'THE BARBEER', 'MANIAS', 'Express',
'SAY MY NAME', 'DEAN & DENNYS', 'CONCEPTO GASTRONOMICO']
for place in places:
classified = classified or add_if(data, 'salidas', place, description, item['cost'])
classified = classified or add_if(data, 'efectivo', 'EXTRACCION CAJERO', description, item['cost'])
classified = classified or add_if(data, 'rent', '123123123', description, item['cost'])
classified = classified or add_if(data, 'rent', '456456456', description, item['cost'])
classified = classified or add_if(data, 'tarjeta', 'PAGO TARJETA VISA', description, item['cost'])
classified = classified or add_if(data, 'banco', 'COM. CUSTODIA DE TIT.', description, item['cost'])
classified = classified or add_if(data, 'banco', 'COM. BANELCO OTROS BCOS', description, item['cost'])
classified = classified or add_if(data, 'banco', 'IVA', description, item['cost'])
should_ignore = False
to_ignore = ['nombredepersona', '098908098']
for ignore in to_ignore:
if ignore in description:
should_ignore = True
if not should_ignore:
data['total'] += item['cost']
if not classified:
data['otro'] += item['cost']
print(item)
items = parse_xls_from_pdf('resumen-galicia.xlsx')
real_items = parse_real_xls('report-galicia.xlsx')
total = {}
for item in real_items:
month = item['date'][3:]
if month not in total:
total[month] = {
'total': 0,
'super': 0,
'gas': 0,
'luz': 0,
'agua': 0,
'abl': 0,
'internet': 0,
'salidas': 0,
'efectivo': 0,
'tarjeta': 0,
'banco': 0,
'rent': 0,
'otro': 0
}
add_item(total[month], item)
print(json.dumps(total, indent=4))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment