Created
January 26, 2021 06:25
-
-
Save jasalt/f843566fc3fd6cac5a8af07930387181 to your computer and use it in GitHub Desktop.
Extract tables from pdf using pdfplumber and pytesseract
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Extracting tabular data from pdf using Python pdfplumber together with Tesseract OCR | |
| # Author Jarkko Saltiola 2021 (MIT License, Python 3.8.6) | |
| # Pdfplumber, tabula, camelot and probably some other PDF parser utilities have a hard | |
| # time parsing tables whose column data overlaps other columns, and | |
| # probably in many other cases too. | |
| # Pdfplumber gives a good level of control for splitting a pdf into parts which can | |
| # be read with its methods or be passed to pytesseract as a PIL image. | |
| # This can be a starting point for hooking these two together. | |
| # I abandoned this as it started getting too complex and there was a better workaround. | |
| # The main issue was matching the row values together, and the occasionally mismatched row counts returned by the two tools. | |
| # Could be improved by matching row values with some intelligence. | |
| # Or row positions could be hard coded so their count would stay the same. | |
| # And maybe run tesseract for all the pre-cropped cells. | |
| import pdfplumber | |
| import pytesseract | |
# Table-finder settings passed to pdfplumber's extract_table().
ts = {
    "vertical_strategy": "text",  # "lines" is the default option that didn't work
    "horizontal_strategy": "text",
    "keep_blank_chars": False,  # Try with True as well if False doesn't give good results
}

# Crop boxes given as (x0, top, x1, bottom).
# Select a crop box e.g. with macOS Preview: rectangular crop tool, then read
# the crop area from the info window.
EAN_BBOX = (32, 52, 110, 780)  # first (EAN) column
COUNT_BBOX = (350, 71, 410, 71 + 710)  # third (count) column

# Tesseract settings to read the column as one block of text and only digits.
OCR_CONFIG = "--psm 6 -c tessedit_char_whitelist=0123456789"


def _split_ocr_lines(ocr_text):
    """Split raw Tesseract output into rows, dropping trailing garbage.

    Tesseract usually terminates its output with a newline followed by a
    form feed ('\\x0c'). Stripping both from the end before splitting avoids a
    bogus last row regardless of which of them is present. Blank lines in
    the middle of the text are kept so that row positions are not shifted.

    Returns an empty list when nothing but trailing garbage was recognised.
    """
    cleaned = ocr_text.rstrip("\n\x0c")
    return cleaned.split("\n") if cleaned else []


# The context manager closes the PDF file handle even if a page fails.
with pdfplumber.open("inventory.pdf") as pdf:
    for page in pdf.pages:
        # Extract first (EAN) column with pdfplumber table extractor.
        crop_ean = page.crop(EAN_BBOX)
        # crop_ean_img = crop_ean.to_image()
        # crop_ean_img.original.show()  # preview the crop result

        # extract_table() returns None when no table is found in the crop;
        # fall back to an empty list so the row count below is still valid.
        crop_table_ean = crop_ean.extract_table(table_settings=ts) or []

        # Extract the third (count) column with Tesseract OCR.
        crop_count = page.crop(COUNT_BBOX)
        # Pass cropped image to pytesseract, use higher resolution
        # (default 72) for better accuracy.
        crop_count_img = crop_count.to_image(resolution=300)
        # crop_count_img.original.show()

        ocr = pytesseract.image_to_string(
            crop_count_img.original,
            config=OCR_CONFIG,
        )
        ocr_list = _split_ocr_lines(ocr)  # string to list, garbage removed

        # These counts should match reliably.
        print(f"{len(crop_table_ean)} {len(ocr_list)}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment