gnardini · February 21, 2019 13:46
diff --git a/resumen-parser.py b/resumen-parser.py
 from xlrd import open_workbook
 import re
 import json

 date_regex = re.compile('^[0-9][0-9]-[0-9][0-9]$')
 new_page_regex = re.compile('^Página [0-9]+ de [0-9]+$')

 def is_new_page(sheet, row, number_of_columns):
    val = str(sheet.cell(row, 0).value)
    if val.startswith('El importe adeudado'):
        return True
    for col in range(number_of_columns):
        val = str(sheet.cell(row, col).value)
        if new_page_regex.match(val):
            return True
    return False

 def starting_new_items(sheet, row, description_col):
    return 'SALDO ANTERIOR' == str(sheet.cell(row, description_col).value)

 def find_raw_cost(sheet, row, cost_col, number_of_columns):
    cost_val = sheet.cell(row, cost_col).value
    if not cost_val:
        return ''
    for col in range(cost_col + 1, number_of_columns):
        val = sheet.cell(row, col).value
        if val:
            return cost_val
    return sheet.cell(row, cost_col - 1).value

 def parse_real_xls(file_name):
    wb = open_workbook(file_name)
    date_col = 0
    description_col = 1
    cost_col = 2

    items = []

    for sheet in wb.sheets():
        for row in range(7, sheet.nrows):
            date = str(sheet.cell(row, date_col).value)
            description = str(sheet.cell(row, description_col).value).replace('\n', '')
            raw_cost = str(sheet.cell(row, cost_col).value)[1:]
            cost = float(raw_cost.replace('.', '').replace(',', '.'))
            if cost > 0:
                items.append({
                    'date': date,
                    'cost': cost,
                    'description': description
                })

    return items

 def parse_xls_from_pdf(file_name):
    wb = open_workbook(file_name)

    description_col = -1
    cost_col = -1
    active_item = False
    active_paused = False

    items = []

    for sheet in wb.sheets():
        number_of_rows = sheet.nrows
        number_of_columns = sheet.ncols

        rows = []
        for row in range(number_of_rows):
            date = str(sheet.cell(row, 0).value)
            if date == 'Fecha':
                for col in range(number_of_columns):
                    val = str(sheet.cell(row, col).value)
                    if val == 'Descripción':
                        description_col = col
                    elif val == 'Débito(-)':
                        cost_col = col
            if date is not None and date_regex.match(date):
                try:
                    description = sheet.cell(row, description_col).value
                    raw_cost = find_raw_cost(sheet, row, cost_col, number_of_columns)
                    cost = float(raw_cost.replace('.', '').replace(',', '.'))
                    items.append({
                        'date': date,
                        'cost': cost,
                        'description': description
                    })
                    active_item = True
                except ValueError:
                    active_item = False
                    pass
            elif is_new_page(sheet, row, number_of_columns):
                if active_item:
                    active_item = False
                    active_paused = True
            elif starting_new_items(sheet, row, description_col):
                if active_paused:
                    active_item = True
                    active_paused = False
            elif active_item:
                desc_cell = str(sheet.cell(row, description_col).value)
                date_cell = str(sheet.cell(row, 0).value)
                last_item = items[-1]
                last_item['description'] = last_item['description'] + desc_cell + date_cell
    return items

 def add_if(data, category, keyword, description, cost, log = False):
    if keyword in description:
        data[category] += cost
        if log:
            print(description)
        return True
    return False

 def add_item(data, item):
    description = item['description']
    classified = False
    classified = classified or add_if(data, 'super', 'DIA%', description, item['cost'])
    classified = classified or add_if(data, 'super', 'COTO', description, item['cost'])
    classified = classified or add_if(data, 'gas', 'MTGA', description, item['cost'])
    classified = classified or add_if(data, 'luz', 'ENOR', description, item['cost'])
    classified = classified or add_if(data, 'agua', 'AYSA', description, item['cost'])
    classified = classified or add_if(data, 'abl', 'MNBA', description, item['cost'])
    classified = classified or add_if(data, 'internet', 'CABLEVISION SA', description, item['cost'])
    classified = classified or add_if(data, 'internet', 'TELECOM ARGENTIN PAGO ABONO', description, item['cost'])
    places = ['BURGERTIFY', 'WILLIAMSBURG', 'THE ALCHEMIST', 'THE BARBEER', 'MANIAS', 'Express',
        'SAY MY NAME', 'DEAN & DENNYS', 'CONCEPTO GASTRONOMICO']
    for place in places:
        classified = classified or add_if(data, 'salidas', place, description, item['cost'])
    classified = classified or add_if(data, 'efectivo', 'EXTRACCION CAJERO', description, item['cost'])
    classified = classified or add_if(data, 'rent', '123123123', description, item['cost'])
    classified = classified or add_if(data, 'rent', '456456456', description, item['cost'])
    classified = classified or add_if(data, 'tarjeta', 'PAGO TARJETA VISA', description, item['cost'])
    classified = classified or add_if(data, 'banco', 'COM. CUSTODIA DE TIT.', description, item['cost'])
    classified = classified or add_if(data, 'banco', 'COM. BANELCO OTROS BCOS', description, item['cost'])
    classified = classified or add_if(data, 'banco', 'IVA', description, item['cost'])

    should_ignore = False
    to_ignore = ['nombredepersona', '098908098']
    for ignore in to_ignore:
        if ignore in description:
            should_ignore = True
    if not should_ignore:
        data['total'] += item['cost']
        if not classified:
            data['otro'] += item['cost']
            print(item)

 items = parse_xls_from_pdf('resumen-galicia.xlsx')
 real_items = parse_real_xls('report-galicia.xlsx')

 total = {}
 for item in real_items:
    month = item['date'][3:]
    if month not in total:
        total[month] = {
            'total': 0,
            'super': 0,
            'gas': 0,
            'luz': 0,
            'agua': 0,
            'abl': 0,
            'internet': 0,
            'salidas': 0,
            'efectivo': 0,
            'tarjeta': 0,
            'banco': 0,
            'rent': 0,
            'otro': 0
        }
    add_item(total[month], item)

 print(json.dumps(total, indent=4))
	from xlrd import open_workbook
	import re
	import json

	date_regex = re.compile('^[0-9][0-9]-[0-9][0-9]$')
	new_page_regex = re.compile('^Página [0-9]+ de [0-9]+$')

	def is_new_page(sheet, row, number_of_columns):
	val = str(sheet.cell(row, 0).value)
	if val.startswith('El importe adeudado'):
	return True
	for col in range(number_of_columns):
	val = str(sheet.cell(row, col).value)
	if new_page_regex.match(val):
	return True
	return False

	def starting_new_items(sheet, row, description_col):
	return 'SALDO ANTERIOR' == str(sheet.cell(row, description_col).value)

	def find_raw_cost(sheet, row, cost_col, number_of_columns):
	cost_val = sheet.cell(row, cost_col).value
	if not cost_val:
	return ''
	for col in range(cost_col + 1, number_of_columns):
	val = sheet.cell(row, col).value
	if val:
	return cost_val
	return sheet.cell(row, cost_col - 1).value

	def parse_real_xls(file_name):
	wb = open_workbook(file_name)
	date_col = 0
	description_col = 1
	cost_col = 2

	items = []

	for sheet in wb.sheets():
	for row in range(7, sheet.nrows):
	date = str(sheet.cell(row, date_col).value)
	description = str(sheet.cell(row, description_col).value).replace('\n', '')
	raw_cost = str(sheet.cell(row, cost_col).value)[1:]
	cost = float(raw_cost.replace('.', '').replace(',', '.'))
	if cost > 0:
	items.append({
	'date': date,
	'cost': cost,
	'description': description
	})

	return items

	def parse_xls_from_pdf(file_name):
	wb = open_workbook(file_name)

	description_col = -1
	cost_col = -1
	active_item = False
	active_paused = False

	items = []

	for sheet in wb.sheets():
	number_of_rows = sheet.nrows
	number_of_columns = sheet.ncols

	rows = []
	for row in range(number_of_rows):
	date = str(sheet.cell(row, 0).value)
	if date == 'Fecha':
	for col in range(number_of_columns):
	val = str(sheet.cell(row, col).value)
	if val == 'Descripción':
	description_col = col
	elif val == 'Débito(-)':
	cost_col = col
	if date is not None and date_regex.match(date):
	try:
	description = sheet.cell(row, description_col).value
	raw_cost = find_raw_cost(sheet, row, cost_col, number_of_columns)
	cost = float(raw_cost.replace('.', '').replace(',', '.'))
	items.append({
	'date': date,
	'cost': cost,
	'description': description
	})
	active_item = True
	except ValueError:
	active_item = False
	pass
	elif is_new_page(sheet, row, number_of_columns):
	if active_item:
	active_item = False
	active_paused = True
	elif starting_new_items(sheet, row, description_col):
	if active_paused:
	active_item = True
	active_paused = False
	elif active_item:
	desc_cell = str(sheet.cell(row, description_col).value)
	date_cell = str(sheet.cell(row, 0).value)
	last_item = items[-1]
	last_item['description'] = last_item['description'] + desc_cell + date_cell
	return items

	def add_if(data, category, keyword, description, cost, log = False):
	if keyword in description:
	data[category] += cost
	if log:
	print(description)
	return True
	return False

	def add_item(data, item):
	description = item['description']
	classified = False
	classified = classified or add_if(data, 'super', 'DIA%', description, item['cost'])
	classified = classified or add_if(data, 'super', 'COTO', description, item['cost'])
	classified = classified or add_if(data, 'gas', 'MTGA', description, item['cost'])
	classified = classified or add_if(data, 'luz', 'ENOR', description, item['cost'])
	classified = classified or add_if(data, 'agua', 'AYSA', description, item['cost'])
	classified = classified or add_if(data, 'abl', 'MNBA', description, item['cost'])
	classified = classified or add_if(data, 'internet', 'CABLEVISION SA', description, item['cost'])
	classified = classified or add_if(data, 'internet', 'TELECOM ARGENTIN PAGO ABONO', description, item['cost'])
	places = ['BURGERTIFY', 'WILLIAMSBURG', 'THE ALCHEMIST', 'THE BARBEER', 'MANIAS', 'Express',
	'SAY MY NAME', 'DEAN & DENNYS', 'CONCEPTO GASTRONOMICO']
	for place in places:
	classified = classified or add_if(data, 'salidas', place, description, item['cost'])
	classified = classified or add_if(data, 'efectivo', 'EXTRACCION CAJERO', description, item['cost'])
	classified = classified or add_if(data, 'rent', '123123123', description, item['cost'])
	classified = classified or add_if(data, 'rent', '456456456', description, item['cost'])
	classified = classified or add_if(data, 'tarjeta', 'PAGO TARJETA VISA', description, item['cost'])
	classified = classified or add_if(data, 'banco', 'COM. CUSTODIA DE TIT.', description, item['cost'])
	classified = classified or add_if(data, 'banco', 'COM. BANELCO OTROS BCOS', description, item['cost'])
	classified = classified or add_if(data, 'banco', 'IVA', description, item['cost'])

	should_ignore = False
	to_ignore = ['nombredepersona', '098908098']
	for ignore in to_ignore:
	if ignore in description:
	should_ignore = True
	if not should_ignore:
	data['total'] += item['cost']
	if not classified:
	data['otro'] += item['cost']
	print(item)

	items = parse_xls_from_pdf('resumen-galicia.xlsx')
	real_items = parse_real_xls('report-galicia.xlsx')

	total = {}
	for item in real_items:
	month = item['date'][3:]
	if month not in total:
	total[month] = {
	'total': 0,
	'super': 0,
	'gas': 0,
	'luz': 0,
	'agua': 0,
	'abl': 0,
	'internet': 0,
	'salidas': 0,
	'efectivo': 0,
	'tarjeta': 0,
	'banco': 0,
	'rent': 0,
	'otro': 0
	}
	add_item(total[month], item)

	print(json.dumps(total, indent=4))