danieroux · April 6, 2016 11:18
diff --git a/parse_standard_bank_pdf_to_xero_csv.py b/parse_standard_bank_pdf_to_xero_csv.py
 #!/usr/bin/env python
 #
 # Rough but serviceable Standard Bank South Africa PDF statement to CSV extraction.
 # Exports to Xero's import format
 # 
 # Needs:
 # - poppler 
 # - lxml for python to be installed (pip install lxml)

 from lxml import etree as ET
 from subprocess import call

 import re
 import os
 import csv

 class ParseStandardBankPDF:
    """Extract transactions from a Standard Bank PDF statement into a CSV.

    The PDF is converted to XML with poppler's ``pdftohtml``; transaction
    cells are then read from every page that contains a
    "BALANCE BROUGHT FORWARD" text element and written out with the columns
    Date, Amount, Description and Reference (Xero's import format, per the
    file header).
    """

    def __init__(self, year, pdf_location):
        """Remember the input PDF and derive the sibling .csv/.xml paths.

        year -- year string prepended to every transaction date.
        pdf_location -- path of the statement PDF to convert.
        """
        self.pdf_location = pdf_location

        filename, _ext = os.path.splitext(pdf_location)
        self.filename = filename
        self.csv_location = filename + ".csv"
        self.xml_location = filename + ".xml"
        self.year = year

    def makeXmlTree(self, pdf_location):
        """Run ``pdftohtml -xml`` on pdf_location and parse the resulting XML.

        Returns the lxml ElementTree of the generated file.
        """
        # Pass an argument list instead of a shell string: a quote or other
        # shell metacharacter in the filename can no longer break (or inject
        # into) the command line.
        call(["pdftohtml", "-xml", "-nodrm", pdf_location])

        # The XML is expected next to the PDF that was just converted, so
        # derive its path from the argument rather than from instance state.
        # For the only in-file caller this equals self.xml_location.
        filename, _ext = os.path.splitext(pdf_location)
        return ET.parse(filename + ".xml")

    def parseTransactions(self):
        """Collect the raw transaction rows from every statement page.

        Returns a list of 5-item lists as produced by doPageWithEntries.
        """
        tree = self.makeXmlTree(self.pdf_location)
        transactions = []

        first_page_done = False
        for page in tree.getroot().iter('page'):
            found = page.xpath('.//text[text()="BALANCE BROUGHT FORWARD"]')
            if found:
                transactions += self.doPageWithEntries(found[0].itersiblings(), first_page_done)
                first_page_done = True

        return transactions

    def doPageWithEntries(self, cursor, first_page_done):
        """Read the transaction rows that follow a page's marker cell.

        cursor -- iterable of elements exposing ``.text``; in this file it is
                  the siblings following the "BALANCE BROUGHT FORWARD" cell.
        first_page_done -- False for the first matching page, which has one
                  extra leading cell to skip.

        Returns a list of
        [reference, description, amount, date, balance] text lists.
        """
        # next() works on Python 2.6+ and 3; the old cursor.next() spelling
        # does not exist on Python 3 iterators.
        cursor = iter(cursor)

        # Skip the cell(s) between the marker and the first transaction.
        # A default is supplied so a page with nothing after the marker
        # yields no rows instead of raising StopIteration.
        next(cursor, None)
        if not first_page_done:
            next(cursor, None)

        transactions_on_page = []
        while True:
            try:
                type_and_number = next(cursor)
                description = next(cursor)
                amount = next(cursor)
                # A '##' cell can sit where the amount is expected; the real
                # amount is then the following cell.
                if amount.text == '##':
                    amount = next(cursor)
                date_ish = next(cursor)
                balance = next(cursor)
            except StopIteration:
                # Out of cells: any partially read trailing row is dropped.
                break

            transactions_on_page.append([type_and_number.text, description.text, amount.text, date_ish.text, balance.text])

        return transactions_on_page

    def makeAmountFromMatch(self, matchobject):
        """Rebuild one matched amount as '<minus><rand>.<cents>'."""
        rand = matchobject.group(1).replace(".", "")  # drop thousands dots
        cents = matchobject.group(2)
        minus = matchobject.group(3)  # trailing '-' (or '') moves to the front
        return "{}{}.{}".format(minus, rand, cents)

    def makeAmount(self, string_amount):
        """Convert e.g. '1.234,56-' into '-1234.56'."""
        return re.sub(r'([\d.]*),(\d*)(-?)', self.makeAmountFromMatch, string_amount)

    def makeDate(self, string_date):
        """Turn a 'MM DD' statement date into '<year>-MM-DD'."""
        # split() without an argument tolerates repeated or surrounding
        # whitespace, which split(" ") turned into an unpacking error.
        month, day = string_date.split()
        return "{}-{}-{}".format(self.year, month, day)

    def makeLine(self, list):
        """Map one raw 5-item row onto the CSV column dictionary.

        The parameter keeps its original name for interface compatibility.
        The balance item is read but not exported.
        """
        type_and_number, description, amount, date_ish, _balance = list
        return {'Reference': type_and_number,
                'Description': description,
                'Amount': self.makeAmount(amount),
                'Date': self.makeDate(date_ish)}

    def writeTransactions(self, listOfList):
        """Write the rows to self.csv_location with a header line."""
        with open(self.csv_location, 'w') as csvfile:
            fieldnames = ['Date', 'Amount', 'Description', 'Reference']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            for row in listOfList:
                writer.writerow(self.makeLine(row))

    def go(self):
        """Parse the PDF and write its CSV in one step."""
        parsed = self.parseTransactions()
        self.writeTransactions(parsed)

 # Convert each statement PDF in turn; every run writes a .csv next to its PDF.
 for statement_pdf in ('Statement number 24.pdf',
                       'Statement number 25.pdf',
                       'Statement number 26.pdf',
                       'Statement number 27.pdf'):
     ParseStandardBankPDF("2015", statement_pdf).go()
	#!/usr/bin/env python
	#
	# Rough but serviceable Standard Bank South Africa PDF statement to CSV extraction.
	# Exports to Xero's import format
	#
	# Needs:
	# - poppler
	# - lxml for python to be installed (pip install lxml)

	from lxml import etree as ET
	from subprocess import call

	import re
	import os
	import csv

	class ParseStandardBankPDF:
	    """Extract transactions from a Standard Bank PDF statement into a CSV.

	    The PDF is converted to XML with poppler's ``pdftohtml``; transaction
	    cells are then read from every page that contains a
	    "BALANCE BROUGHT FORWARD" text element and written out with the columns
	    Date, Amount, Description and Reference (Xero's import format, per the
	    file header).
	    """

	    def __init__(self, year, pdf_location):
	        """Remember the input PDF and derive the sibling .csv/.xml paths.

	        year -- year string prepended to every transaction date.
	        pdf_location -- path of the statement PDF to convert.
	        """
	        self.pdf_location = pdf_location

	        filename, _ext = os.path.splitext(pdf_location)
	        self.filename = filename
	        self.csv_location = filename + ".csv"
	        self.xml_location = filename + ".xml"
	        self.year = year

	    def makeXmlTree(self, pdf_location):
	        """Run ``pdftohtml -xml`` on pdf_location and parse the resulting XML.

	        Returns the lxml ElementTree of the generated file.
	        """
	        # Pass an argument list instead of a shell string: a quote or other
	        # shell metacharacter in the filename can no longer break (or inject
	        # into) the command line.
	        call(["pdftohtml", "-xml", "-nodrm", pdf_location])

	        # The XML is expected next to the PDF that was just converted, so
	        # derive its path from the argument rather than from instance state.
	        # For the only in-file caller this equals self.xml_location.
	        filename, _ext = os.path.splitext(pdf_location)
	        return ET.parse(filename + ".xml")

	    def parseTransactions(self):
	        """Collect the raw transaction rows from every statement page.

	        Returns a list of 5-item lists as produced by doPageWithEntries.
	        """
	        tree = self.makeXmlTree(self.pdf_location)
	        transactions = []

	        first_page_done = False
	        for page in tree.getroot().iter('page'):
	            found = page.xpath('.//text[text()="BALANCE BROUGHT FORWARD"]')
	            if found:
	                transactions += self.doPageWithEntries(found[0].itersiblings(), first_page_done)
	                first_page_done = True

	        return transactions

	    def doPageWithEntries(self, cursor, first_page_done):
	        """Read the transaction rows that follow a page's marker cell.

	        cursor -- iterable of elements exposing ``.text``; in this file it is
	                  the siblings following the "BALANCE BROUGHT FORWARD" cell.
	        first_page_done -- False for the first matching page, which has one
	                  extra leading cell to skip.

	        Returns a list of
	        [reference, description, amount, date, balance] text lists.
	        """
	        # next() works on Python 2.6+ and 3; the old cursor.next() spelling
	        # does not exist on Python 3 iterators.
	        cursor = iter(cursor)

	        # Skip the cell(s) between the marker and the first transaction.
	        # A default is supplied so a page with nothing after the marker
	        # yields no rows instead of raising StopIteration.
	        next(cursor, None)
	        if not first_page_done:
	            next(cursor, None)

	        transactions_on_page = []
	        while True:
	            try:
	                type_and_number = next(cursor)
	                description = next(cursor)
	                amount = next(cursor)
	                # A '##' cell can sit where the amount is expected; the real
	                # amount is then the following cell.
	                if amount.text == '##':
	                    amount = next(cursor)
	                date_ish = next(cursor)
	                balance = next(cursor)
	            except StopIteration:
	                # Out of cells: any partially read trailing row is dropped.
	                break

	            transactions_on_page.append([type_and_number.text, description.text, amount.text, date_ish.text, balance.text])

	        return transactions_on_page

	    def makeAmountFromMatch(self, matchobject):
	        """Rebuild one matched amount as '<minus><rand>.<cents>'."""
	        rand = matchobject.group(1).replace(".", "")  # drop thousands dots
	        cents = matchobject.group(2)
	        minus = matchobject.group(3)  # trailing '-' (or '') moves to the front
	        return "{}{}.{}".format(minus, rand, cents)

	    def makeAmount(self, string_amount):
	        """Convert e.g. '1.234,56-' into '-1234.56'."""
	        # The '*' quantifiers are required: without them only one digit on
	        # each side of the comma is matched and longer amounts are corrupted.
	        return re.sub(r'([\d.]*),(\d*)(-?)', self.makeAmountFromMatch, string_amount)

	    def makeDate(self, string_date):
	        """Turn a 'MM DD' statement date into '<year>-MM-DD'."""
	        # split() without an argument tolerates repeated or surrounding
	        # whitespace, which split(" ") turned into an unpacking error.
	        month, day = string_date.split()
	        return "{}-{}-{}".format(self.year, month, day)

	    def makeLine(self, list):
	        """Map one raw 5-item row onto the CSV column dictionary.

	        The parameter keeps its original name for interface compatibility.
	        The balance item is read but not exported.
	        """
	        type_and_number, description, amount, date_ish, _balance = list
	        return {'Reference': type_and_number,
	                'Description': description,
	                'Amount': self.makeAmount(amount),
	                'Date': self.makeDate(date_ish)}

	    def writeTransactions(self, listOfList):
	        """Write the rows to self.csv_location with a header line."""
	        with open(self.csv_location, 'w') as csvfile:
	            fieldnames = ['Date', 'Amount', 'Description', 'Reference']
	            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

	            writer.writeheader()
	            for row in listOfList:
	                writer.writerow(self.makeLine(row))

	    def go(self):
	        """Parse the PDF and write its CSV in one step."""
	        parsed = self.parseTransactions()
	        self.writeTransactions(parsed)

	# Convert each statement PDF in turn; every run writes a .csv next to its PDF.
	for statement_pdf in ('Statement number 24.pdf',
	                      'Statement number 25.pdf',
	                      'Statement number 26.pdf',
	                      'Statement number 27.pdf'):
	    ParseStandardBankPDF("2015", statement_pdf).go()