mrtopf · October 2, 2010 23:01
diff --git a/convert_haushalt.py b/convert_haushalt.py
 import sys
 from elementtree.ElementTree import parse

 class Record(object):
    """One budget line: a function code, a purpose text and three amounts.

    Attributes:
        funktion: first constructor argument, stored unchanged.
        zweck: second constructor argument (purpose text), stored unchanged.
        soll11, soll10, ist09: the three amount arguments converted with
            ``_toint``; ``__str__`` labels them "Soll 2011", "Soll 2010"
            and "Ist 2009".
    """

    def _toint(self, s):
        """Convert a financial number string to ``int``.

        A lone ``"-"`` yields ``-1``.  Any other value has all spaces
        removed before conversion, so ``"1 234"`` becomes ``1234``.

        Raises:
            ValueError: if the text left after removing spaces is not a
                valid integer literal.
        """
        if s == "-":
            # NOTE(review): -1 cannot be told apart from a real amount of
            # -1; kept because callers may rely on this sentinel.
            return -1
        return int(s.replace(" ", ""))

    def __init__(self, funktion, zweck, soll11, soll10, ist09):
        """Store the two text fields and convert the three amounts.

        Raises:
            ValueError: propagated from ``_toint`` for an unparsable amount.
        """
        self.funktion = funktion
        self.zweck = zweck
        self.soll11 = self._toint(soll11)
        self.soll10 = self._toint(soll10)
        self.ist09 = self._toint(ist09)

    def __str__(self):
        """Return a one-line ``<Record: ...>`` summary.

        Formatting happens on text and is encoded only at the very end:
        mixing already-encoded bytes into a unicode template fails for
        non-ASCII text on Python 2 and shows up as ``b'...'`` on Python 3.
        """
        text = u"<Record: %s: %s, Soll 2011: %s, Soll 2010: %s, Ist 2009: %s>" % (
            self.funktion,
            self.zweck,
            self.soll11,
            self.soll10,
            self.ist09,
        )
        if str is bytes:
            # Python 2: __str__ has to return a byte string.  Use the
            # latin-1 codec and drop characters it cannot represent.
            return text.encode("latin-1", "ignore")
        return text

 class Row(object):
    """A single positioned text cell.

    Despite the name, one instance wraps exactly one element; ``Page.add``
    collects the instances that share the same ``top`` value into a list.
    """

    def __init__(self, line):
        """Copy position and text from ``line``.

        ``line`` must offer an ``attrib`` mapping whose ``'top'`` and
        ``'left'`` entries are accepted by ``int``, and a ``text`` attribute.
        """
        position = line.attrib
        self.top = int(position['top'])
        self.left = int(position['left'])
        self.text = line.text

 class Page(object):
    """Text cells of one page, grouped by their vertical position."""

    def __init__(self, no):
        """Create an empty page carrying the page number ``no``."""
        # maps int ``top`` -> list of Row objects in insertion order
        self.rows = {}
        self.no = no

    def add(self, line):
        """Wrap ``line`` in a ``Row`` and file it under its ``top`` value."""
        key = int(line.attrib['top'])
        bucket = self.rows.setdefault(key, [])
        bucket.append(Row(line))


 class HaushaltsParser(object):
    """Parse a budget ("Haushalt") XML file into ``Record`` objects.

    Typical use: construct with a filename, call ``read()`` to load the
    pages, then ``process()`` to extract the records.  Every record found
    is printed and also collected in ``self.records``.
    """

    # ``left`` position of the column with "F-" in front; a cell at this
    # position marks its row as a record row.
    RECORD_MARKER_LEFT = 68

    def __init__(self, filename):
        """Remember ``filename``; nothing is read until ``read()`` is called.

        The file is expected to contain ``<page number=...>`` elements with
        ``<text top=... left=...>`` children.
        NOTE(review): presumably produced by ``pdftohtml -xml <file.pdf>``
        (the earlier note said ``htmltopdf -x``) -- confirm.
        """
        self.filename = filename
        self.pages = []
        self.records = []  # the resulting records, filled by find_record()

    def read(self):
        """Read the file and create one ``Page`` (with its rows) per page.

        Exceptions raised by ``parse`` or by malformed attributes
        propagate to the caller.
        """
        root = parse(self.filename).getroot()
        for pageobj in root.findall("page"):
            page = Page(int(pageobj.attrib['number']))
            for line in pageobj.findall("text"):
                page.add(line)
            self.pages.append(page)

    def process(self):
        """Turn every row containing a marker column into a record."""
        for page in self.pages:
            # Visit rows by ascending ``top`` so the output order does not
            # depend on dictionary ordering.
            for top in sorted(page.rows):
                row = page.rows[top]
                # Iterate over a snapshot: find_record() sorts this very
                # list in place, which must not happen under a live iterator.
                for column in list(row):
                    if column.left == self.RECORD_MARKER_LEFT:
                        self.find_record(column, row, page)

    def find_record(self, column, row, page):
        """Build a ``Record`` from the row that ``column`` belongs to.

        The row is looked up via ``page.rows[column.top]`` and sorted in
        place by ``left``; the cell texts are then passed positionally to
        ``Record``.  The record is appended to ``self.records`` and printed.

        Raises:
            TypeError: if the row does not consist of exactly five cells.
            ValueError: if an amount cell cannot be converted to ``int``.
        """
        all_columns = page.rows[column.top]
        # sort columns by left pos
        all_columns.sort(key=lambda cell: cell.left)
        # extract the text
        cols_text = [cell.text for cell in all_columns]
        record = Record(*cols_text)
        self.records.append(record)
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print("%s %s %s %s %s" % (
            record.funktion,
            record.zweck,
            record.soll11,
            record.soll10,
            record.ist09,
        ))






 if __name__=="__main__":
    # Input file: first command-line argument if given, otherwise the
    # previously hard-coded default "epl01.xml".
    filename = sys.argv[1] if len(sys.argv) > 1 else "epl01.xml"
    parser = HaushaltsParser(filename)
    parser.read()
    parser.process()
	import sys
	from elementtree.ElementTree import parse

	class Record(object):
	    """One budget line: a function code, a purpose text and three amounts.

	    Attributes:
	        funktion: first constructor argument, stored unchanged.
	        zweck: second constructor argument (purpose text), stored unchanged.
	        soll11, soll10, ist09: the three amount arguments converted with
	            ``_toint``; ``__str__`` labels them "Soll 2011", "Soll 2010"
	            and "Ist 2009".
	    """

	    def _toint(self, s):
	        """Convert a financial number string to ``int``.

	        A lone ``"-"`` yields ``-1``.  Any other value has all spaces
	        removed before conversion, so ``"1 234"`` becomes ``1234``.

	        Raises:
	            ValueError: if the text left after removing spaces is not a
	                valid integer literal.
	        """
	        if s == "-":
	            # NOTE(review): -1 cannot be told apart from a real amount of
	            # -1; kept because callers may rely on this sentinel.
	            return -1
	        return int(s.replace(" ", ""))

	    def __init__(self, funktion, zweck, soll11, soll10, ist09):
	        """Store the two text fields and convert the three amounts.

	        Raises:
	            ValueError: propagated from ``_toint`` for an unparsable amount.
	        """
	        self.funktion = funktion
	        self.zweck = zweck
	        self.soll11 = self._toint(soll11)
	        self.soll10 = self._toint(soll10)
	        self.ist09 = self._toint(ist09)

	    def __str__(self):
	        """Return a one-line ``<Record: ...>`` summary.

	        Formatting happens on text and is encoded only at the very end:
	        mixing already-encoded bytes into a unicode template fails for
	        non-ASCII text on Python 2 and shows up as ``b'...'`` on Python 3.
	        """
	        text = u"<Record: %s: %s, Soll 2011: %s, Soll 2010: %s, Ist 2009: %s>" % (
	            self.funktion,
	            self.zweck,
	            self.soll11,
	            self.soll10,
	            self.ist09,
	        )
	        if str is bytes:
	            # Python 2: __str__ has to return a byte string.  Use the
	            # latin-1 codec and drop characters it cannot represent.
	            return text.encode("latin-1", "ignore")
	        return text

	class Row(object):
	    """A single positioned text cell.

	    Despite the name, one instance wraps exactly one element; ``Page.add``
	    collects the instances that share the same ``top`` value into a list.
	    """

	    def __init__(self, line):
	        """Copy position and text from ``line``.

	        ``line`` must offer an ``attrib`` mapping whose ``'top'`` and
	        ``'left'`` entries are accepted by ``int``, and a ``text`` attribute.
	        """
	        position = line.attrib
	        self.top = int(position['top'])
	        self.left = int(position['left'])
	        self.text = line.text

	class Page(object):
	    """Text cells of one page, grouped by their vertical position."""

	    def __init__(self, no):
	        """Create an empty page carrying the page number ``no``."""
	        # maps int ``top`` -> list of Row objects in insertion order
	        self.rows = {}
	        self.no = no

	    def add(self, line):
	        """Wrap ``line`` in a ``Row`` and file it under its ``top`` value."""
	        key = int(line.attrib['top'])
	        bucket = self.rows.setdefault(key, [])
	        bucket.append(Row(line))


	class HaushaltsParser(object):
	    """Parse a budget ("Haushalt") XML file into ``Record`` objects.

	    Typical use: construct with a filename, call ``read()`` to load the
	    pages, then ``process()`` to extract the records.  Every record found
	    is printed and also collected in ``self.records``.
	    """

	    # ``left`` position of the column with "F-" in front; a cell at this
	    # position marks its row as a record row.
	    RECORD_MARKER_LEFT = 68

	    def __init__(self, filename):
	        """Remember ``filename``; nothing is read until ``read()`` is called.

	        The file is expected to contain ``<page number=...>`` elements with
	        ``<text top=... left=...>`` children.
	        NOTE(review): presumably produced by ``pdftohtml -xml <file.pdf>``
	        (the earlier note said ``htmltopdf -x``) -- confirm.
	        """
	        self.filename = filename
	        self.pages = []
	        self.records = []  # the resulting records, filled by find_record()

	    def read(self):
	        """Read the file and create one ``Page`` (with its rows) per page.

	        Exceptions raised by ``parse`` or by malformed attributes
	        propagate to the caller.
	        """
	        root = parse(self.filename).getroot()
	        for pageobj in root.findall("page"):
	            page = Page(int(pageobj.attrib['number']))
	            for line in pageobj.findall("text"):
	                page.add(line)
	            self.pages.append(page)

	    def process(self):
	        """Turn every row containing a marker column into a record."""
	        for page in self.pages:
	            # Visit rows by ascending ``top`` so the output order does not
	            # depend on dictionary ordering.
	            for top in sorted(page.rows):
	                row = page.rows[top]
	                # Iterate over a snapshot: find_record() sorts this very
	                # list in place, which must not happen under a live iterator.
	                for column in list(row):
	                    if column.left == self.RECORD_MARKER_LEFT:
	                        self.find_record(column, row, page)

	    def find_record(self, column, row, page):
	        """Build a ``Record`` from the row that ``column`` belongs to.

	        The row is looked up via ``page.rows[column.top]`` and sorted in
	        place by ``left``; the cell texts are then passed positionally to
	        ``Record``.  The record is appended to ``self.records`` and printed.

	        Raises:
	            TypeError: if the row does not consist of exactly five cells.
	            ValueError: if an amount cell cannot be converted to ``int``.
	        """
	        all_columns = page.rows[column.top]
	        # sort columns by left pos
	        all_columns.sort(key=lambda cell: cell.left)
	        # extract the text
	        cols_text = [cell.text for cell in all_columns]
	        record = Record(*cols_text)
	        self.records.append(record)
	        # A single pre-formatted argument prints the same on Python 2 and 3.
	        print("%s %s %s %s %s" % (
	            record.funktion,
	            record.zweck,
	            record.soll11,
	            record.soll10,
	            record.ist09,
	        ))






	if __name__=="__main__":
	    # Input file: first command-line argument if given, otherwise the
	    # previously hard-coded default "epl01.xml".
	    filename = sys.argv[1] if len(sys.argv) > 1 else "epl01.xml"
	    parser = HaushaltsParser(filename)
	    parser.read()
	    parser.process()
No results found