Skip to content

Instantly share code, notes, and snippets.

@jsfenfen
Created March 13, 2017 17:00
Show Gist options
  • Save jsfenfen/a398a597064dc7e1a69c79f82c9c5f97 to your computer and use it in GitHub Desktop.
Save jsfenfen/a398a597064dc7e1a69c79f82c9c5f97 to your computer and use it in GitHub Desktop.
Read Roseburg, OR's pdf "check report"
import csv
import re
""" Roseburg_Procurement.txt created with "$ pdftotext -layout Roseburg_Procurment.pdf """
infile = open("Roseburg_Procurement.txt","r")
outfile = open("Roseburg_Procurement_output.csv", "w")
paymentfile = open("Roseburg_Procurement_payment.csv", "w")
trashfile = open("Roseburg_Procurement_trashlines.txt", "w")
writer = csv.writer(outfile)
writer.writerow(["Page", "Payable Number", "Payable date", "payable description", "discount amount", "payable amount", "Vendor Number", "Vendor Name", "Payment Date","Payment Type","Discount Amount", "Payment Amount", "Number"])
paymentwriter = csv.writer(paymentfile)
# manual header rows...
paymentwriter.writerow(["Vendor Number", "Vendor Name", "Payment Date","Payment Type","Discount Amount", "Payment Amount", "Number"])
# Regexes for parsing
PAGE_RE = re.compile(r"Page (\d+) of 142")
INVOICE_RE = re.compile("(.+?)\s+Invoice\s+(\d\d/\d\d/\d+)\s+(.+?)\s+([\d,]+\.\d+)\s+([\d,]+\.\d+)" )
PAYMENT_RE = re.compile("(\d\d\d\d)\s+(.+?)\s+(\d\d/\d\d/\d+)\s+(.+?)\s+([\d,]+\.\d+)\s+([\-\d,]+\.\d+)\s+(\d\d\d\d\d)")
HEADER1_RE = re.compile("Vendor Number\s+Vendor Name\s+Payment Date\s+Payment Type") # the line continues past this
HEADER2_RE = re.compile("Payable \#\s+Payable Type\s+Payable Date")
pagenum = 1
linenum = 0
last_master_line = ""
for line in infile:
# ignore weird chars
linenum += 1
line = re.sub(r'[\x80\xa6\xe2]', '', line)
# ignore asterisks:
line = line.replace("*","")
if re.search("Invoice", line):
invoice = re.search(INVOICE_RE, line)
if invoice:
writer.writerow([pagenum, invoice.group(1), invoice.group(2), invoice.group(3), invoice.group(4), invoice.group(5)] + last_master_line)
else:
print "No match for invoice: " + line
trashfile.write("page=%s line=%s: %s" % (pagenum, linenum, line))
assert False # This shouldn't happen
elif re.match("\d\d\d\d\s+", line): # If the line starts with a four digit number it's generally a payment
payment_data = re.search(PAYMENT_RE, line)
if payment_data:
print("Payment " + line)
last_master_line = [payment_data.group(1), payment_data.group(2), payment_data.group(3), payment_data.group(4), payment_data.group(5), payment_data.group(6), payment_data.group(7)]
paymentwriter.writerow(last_master_line)
else:
print "%s. No payment match for %s" % (linenum, line)
trashfile.write("page=%s line=%s: %s" % (pagenum, linenum, line))
# unfortunately this does happen some
elif re.match(HEADER1_RE, line) or re.search(HEADER2_RE, line):
pass
elif len(line)==1:
pass
else:
if line:
trashfile.write("page=%s line=%s: %s" % (pagenum, linenum, line))
page = re.search(PAGE_RE, line)
if page:
print("Processed page %s" % ( page.group(1) ) )
pagenum += 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment