Created
July 26, 2015 19:03
-
-
Save vinovator/71e001beafa93ee1768d to your computer and use it in GitHub Desktop.
From a set of invoice pdf files within a folder, extract the invoice number and client information and place them in an excel file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'Vinoth_Subramanian' | |
# Python3 | |
# pdfInvoiceMiner.py | |
# Program to extract the client info and invoice no from a bunch of invoice pdf files | |
# pdfminer3k library is used to extract text from pdf | |
# PyPDF2 library does not extract the text from pdf properly | |
# place all the invoice pdf files within a folder named "INVOICE" | |
# place an excel file named "invoice_info.xlsx" in the parent folder of "INVOICE" | |
# First column - invoice no; Second column - client details | |
import PyPDF2 | |
from pdfminer.pdfparser import PDFParser, PDFDocument | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import PDFPageAggregator | |
from pdfminer.layout import LAParams, LTTextBox, LTTextLine | |
import os | |
import openpyxl | |
# The folder path where are all the invoices are placed | |
invoice_path = ".\COMPANY_NAME\INVOICE" # Company name masked - replace (parent folder name) | |
# The excel file where the invoice information is extracted | |
invoice_info_xl = ".\COMPANY_NAME\invoice_info.xlsx" # Company name masked - replace (parent folder name) | |
# Within the pdf, the start block and end block within which the client information is placed | |
client_start = "Ph: 9999999999, 9999999999" # Phone number masked. Replace with actual phone number in invoice | |
client_end = "Qty Description" | |
# From the pdf file name, the start block and end block within which the invoice no is placed | |
invoice_start = "Invoice-" | |
invoice_end = ".pdf" | |
# Given a string, start block and end block; find the substrinng between them | |
def extract_info(strInfo, strStart, strEnd): | |
return strInfo[strInfo.find(strStart) + len(strStart) : strInfo.rfind(strEnd)] | |
# Open the excel file and update the invoice number in column 1 and client info in column 2 | |
def write_excel(client, invoice_no): | |
wb = openpyxl.load_workbook(invoice_info_xl) | |
sheet = wb.get_sheet_by_name("Sheet1") | |
last_entry = sheet.get_highest_row() | |
sheet.cell(row = last_entry+1, column=1).value = int(invoice_no) | |
sheet.cell(row = last_entry+1, column=2).value = client | |
wb.save(invoice_info_xl) | |
# Extract text from pdf using PyPDF2 library. Does not extract properly. Returns empty string | |
def read_invoice_PyPDF2(pdfFile): | |
pdfFileObj = open(os.path.join(invoice_path + "\\" + pdfFile), "rb") | |
pdfReader = PyPDF2.PdfFileReader(pdfFileObj) | |
pageObj = pdfReader.getPage(0) | |
print("extracting text from " + invoice_path + "\\" + pdfFile) | |
print(pageObj.extractText()) | |
# Extract text from pdf using pdfminer3k library. Complicated but does the job | |
def read_invoice_pdfminer3k(pdfFile): | |
fp = open(os.path.join(invoice_path + "\\" + pdfFile), "rb") | |
parser = PDFParser(fp) | |
doc = PDFDocument() | |
parser.set_document(doc) | |
doc.set_parser(parser) | |
doc.initialize("") | |
rsrcmgr = PDFResourceManager() | |
laparams = LAParams() | |
device = PDFPageAggregator(rsrcmgr, laparams=laparams) | |
interpreter = PDFPageInterpreter(rsrcmgr, device) | |
# Process each page contained in the document. | |
invoice_text = "" | |
for page in doc.get_pages(): | |
interpreter.process_page(page) | |
layout = device.get_result() | |
for lt_obj in layout: | |
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine): | |
invoice_text += lt_obj.get_text() | |
# Extract client info from the string extracted from pdf | |
client = extract_info(invoice_text, client_start, client_end) | |
print("client :" + client) | |
# Extract invoice no from the pdf file name | |
invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end) | |
print("invoice no :" + invoice_no) | |
# Pass the client info and invoice no to the method which writes to excel file | |
write_excel(client, invoice_no) | |
def main(): | |
invoice_lst = list() | |
for (dirpath, dirnames,filenames) in os.walk(invoice_path): | |
for filename in filenames: | |
invoice_lst.append(filename) | |
#print(invoice_lst) | |
for index in range(len(invoice_lst)): | |
print("File # {0}. Mining from the invoice file name {1}".format(index, invoice_lst[index])) | |
read_invoice_pdfminer3k(invoice_lst[index]) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment