jsfenfen · March 13, 2017 17:00
diff --git a/read_procurements.py b/read_procurements.py
 import csv
 import re

 """ Roseburg_Procurement.txt created with "$ pdftotext -layout Roseburg_Procurment.pdf """

 infile = open("Roseburg_Procurement.txt","r")

 outfile = open("Roseburg_Procurement_output.csv", "w")
 paymentfile = open("Roseburg_Procurement_payment.csv", "w")
 trashfile = open("Roseburg_Procurement_trashlines.txt", "w")

 writer = csv.writer(outfile)
 writer.writerow(["Page", "Payable Number", "Payable date", "payable description", "discount amount", "payable amount", "Vendor Number", "Vendor Name", "Payment Date","Payment Type","Discount Amount", "Payment Amount", "Number"])

 paymentwriter = csv.writer(paymentfile)
 # manual header rows... 
 paymentwriter.writerow(["Vendor Number", "Vendor Name", "Payment Date","Payment Type","Discount Amount", "Payment Amount", "Number"])

 # Regexes for parsing
 PAGE_RE = re.compile(r"Page (\d+) of 142")
 INVOICE_RE = re.compile("(.+?)\s+Invoice\s+(\d\d/\d\d/\d+)\s+(.+?)\s+([\d,]+\.\d+)\s+([\d,]+\.\d+)" )
 PAYMENT_RE = re.compile("(\d\d\d\d)\s+(.+?)\s+(\d\d/\d\d/\d+)\s+(.+?)\s+([\d,]+\.\d+)\s+([\-\d,]+\.\d+)\s+(\d\d\d\d\d)")
 HEADER1_RE = re.compile("Vendor Number\s+Vendor Name\s+Payment Date\s+Payment Type") # the line continues past this
 HEADER2_RE = re.compile("Payable \#\s+Payable Type\s+Payable Date")

 pagenum = 1
 linenum = 0
 last_master_line = ""

 for line in infile:
    # ignore weird chars
    linenum += 1
    line = re.sub(r'[\x80\xa6\xe2]', '', line)

    # ignore asterisks:
    line = line.replace("*","")

    if re.search("Invoice", line):
        invoice = re.search(INVOICE_RE, line)
        if invoice:
            writer.writerow([pagenum, invoice.group(1), invoice.group(2), invoice.group(3), invoice.group(4), invoice.group(5)] + last_master_line)
        else:
            print "No match for invoice: " + line
            trashfile.write("page=%s line=%s:  %s" % (pagenum, linenum, line))
            assert False # This shouldn't happen

    elif re.match("\d\d\d\d\s+", line): # If the line starts with a four digit number it's generally a payment
        payment_data = re.search(PAYMENT_RE, line)
        if payment_data:
            print("Payment " + line)
            last_master_line = [payment_data.group(1), payment_data.group(2), payment_data.group(3), payment_data.group(4), payment_data.group(5), payment_data.group(6), payment_data.group(7)]
            paymentwriter.writerow(last_master_line)
        else:
            print "%s. No payment match for %s" % (linenum, line)
            trashfile.write("page=%s line=%s: %s" % (pagenum, linenum, line))
            # unfortunately this does happen some

    elif re.match(HEADER1_RE, line) or re.search(HEADER2_RE, line):
        pass

    elif len(line)==1:
        pass

    else:
        if line:
            trashfile.write("page=%s line=%s: %s" % (pagenum, linenum, line))

    page = re.search(PAGE_RE, line)
    if page:
        print("Processed page %s" % ( page.group(1) ) )
        pagenum += 1
	import csv
	import re

	""" Roseburg_Procurement.txt created with "$ pdftotext -layout Roseburg_Procurment.pdf """

	infile = open("Roseburg_Procurement.txt","r")

	outfile = open("Roseburg_Procurement_output.csv", "w")
	paymentfile = open("Roseburg_Procurement_payment.csv", "w")
	trashfile = open("Roseburg_Procurement_trashlines.txt", "w")

	writer = csv.writer(outfile)
	writer.writerow(["Page", "Payable Number", "Payable date", "payable description", "discount amount", "payable amount", "Vendor Number", "Vendor Name", "Payment Date","Payment Type","Discount Amount", "Payment Amount", "Number"])

	paymentwriter = csv.writer(paymentfile)
	# manual header rows...
	paymentwriter.writerow(["Vendor Number", "Vendor Name", "Payment Date","Payment Type","Discount Amount", "Payment Amount", "Number"])

	# Regexes for parsing
	PAGE_RE = re.compile(r"Page (\d+) of 142")
	INVOICE_RE = re.compile("(.+?)\s+Invoice\s+(\d\d/\d\d/\d+)\s+(.+?)\s+([\d,]+\.\d+)\s+([\d,]+\.\d+)" )
	PAYMENT_RE = re.compile("(\d\d\d\d)\s+(.+?)\s+(\d\d/\d\d/\d+)\s+(.+?)\s+([\d,]+\.\d+)\s+([\-\d,]+\.\d+)\s+(\d\d\d\d\d)")
	HEADER1_RE = re.compile("Vendor Number\s+Vendor Name\s+Payment Date\s+Payment Type") # the line continues past this
	HEADER2_RE = re.compile("Payable \#\s+Payable Type\s+Payable Date")

	pagenum = 1
	linenum = 0
	last_master_line = ""

	for line in infile:
	# ignore weird chars
	linenum += 1
	line = re.sub(r'[\x80\xa6\xe2]', '', line)

	# ignore asterisks:
	line = line.replace("*","")

	if re.search("Invoice", line):
	invoice = re.search(INVOICE_RE, line)
	if invoice:
	writer.writerow([pagenum, invoice.group(1), invoice.group(2), invoice.group(3), invoice.group(4), invoice.group(5)] + last_master_line)
	else:
	print "No match for invoice: " + line
	trashfile.write("page=%s line=%s: %s" % (pagenum, linenum, line))
	assert False # This shouldn't happen

	elif re.match("\d\d\d\d\s+", line): # If the line starts with a four digit number it's generally a payment
	payment_data = re.search(PAYMENT_RE, line)
	if payment_data:
	print("Payment " + line)
	last_master_line = [payment_data.group(1), payment_data.group(2), payment_data.group(3), payment_data.group(4), payment_data.group(5), payment_data.group(6), payment_data.group(7)]
	paymentwriter.writerow(last_master_line)
	else:
	print "%s. No payment match for %s" % (linenum, line)
	trashfile.write("page=%s line=%s: %s" % (pagenum, linenum, line))
	# unfortunately this does happen some

	elif re.match(HEADER1_RE, line) or re.search(HEADER2_RE, line):
	pass

	elif len(line)==1:
	pass

	else:
	if line:
	trashfile.write("page=%s line=%s: %s" % (pagenum, linenum, line))

	page = re.search(PAGE_RE, line)
	if page:
	print("Processed page %s" % ( page.group(1) ) )
	pagenum += 1