jasalt · January 26, 2021 06:25
diff --git a/pdfplumber-tesseract.py b/pdfplumber-tesseract.py
 # Extracting tabular data from pdf using Python pdfplumber together with Tesseract OCR
 # Author Jarkko Saltiola 2021 (MIT License, Python 3.8.6)

 # Pdfplumber, tabula, camelot and probably some other PDF parser utilities have a hard
 # time parsing tables in which one column's data overlaps other columns, and
 # probably in many other cases too.

 # Pdfplumber gives a good level of control for splitting a PDF into parts, which can
 # be read with its own methods or be passed to pytesseract as a PIL image.

 # This can be a starting point for hooking these two together.
 # I abandoned this as it started getting too complex and there was a better workaround. 
 # The main issue was matching the row values together, and the occasional mismatch between the row counts that the two tools returned.

 # Could be improved by matching row values with some intelligence. 
 # Or row positions could be hard coded so their count would stay the same.
 # And maybe run tesseract for all the pre-cropped cells.

 import pdfplumber
 import pytesseract

 # Table-extraction settings passed to pdfplumber's extract_table().
 ts = {
     "vertical_strategy": "text",  # "lines" is the default option that didn't work
     "horizontal_strategy": "text",
     "keep_blank_chars": False,  # Try with True as well if False doesn't give good results
 }

 # Open the PDF in a context manager so the underlying file is closed even if
 # table extraction or OCR fails part-way through.
 with pdfplumber.open("inventory.pdf") as pdf:
     for page in pdf.pages:
         # Extract first (EAN) column with pdfplumber table extractor.

         # Select crop box e.g. with macOS Preview: rectangular crop tool, then
         # read the crop area from the info window.
         crop_ean = page.crop([32, 52, 110, 780])
         # crop_ean_img = crop_ean.to_image()
         # crop_ean_img.original.show()  # preview the crop result

         # extract_table() returns None when no table is found in the crop;
         # fall back to an empty list so the len() call below cannot raise.
         crop_table_ean = crop_ean.extract_table(table_settings=ts) or []

         # Extract the third (count) column with Tesseract OCR.

         crop_count = page.crop([350, 71, 410, 71 + 710])
         # Pass cropped image to pytesseract, use higher resolution (default 72)
         # for better accuracy.
         crop_count_img = crop_count.to_image(resolution=300)
         # crop_count_img.original.show()

         # Tesseract setting to read columns and only numbers.
         ocr = pytesseract.image_to_string(
             crop_count_img.original,
             config="--psm 6 -c tessedit_char_whitelist=0123456789",
         )

         # String to list, one entry per OCR'd line. Whitespace-only entries
         # are dropped: this removes the trailing form feed ('\x0c') garbage
         # item as well as blank lines, neither of which is a table row.
         ocr_list = [line for line in ocr.split("\n") if line.strip()]

         print(f"{len(crop_table_ean)} {len(ocr_list)}")  # These counts should match reliably
	# Extracting tabular data from pdf using Python pdfplumber together with Tesseract OCR
	# Author Jarkko Saltiola 2021 (MIT License, Python 3.8.6)

	# Pdfplumber, tabula, camelot and probably some other PDF parser utilities have a hard
	# time parsing tables in which one column's data overlaps other columns, and
	# probably in many other cases too.

	# Pdfplumber gives a good level of control for splitting a PDF into parts, which can
	# be read with its own methods or be passed to pytesseract as a PIL image.

	# This can be a starting point for hooking these two together.
	# I abandoned this as it started getting too complex and there was a better workaround.
	# The main issue was matching the row values together, and the occasional mismatch between the row counts that the two tools returned.

	# Could be improved by matching row values with some intelligence.
	# Or row positions could be hard coded so their count would stay the same.
	# And maybe run tesseract for all the pre-cropped cells.

	import pdfplumber
	import pytesseract

	# Table-extraction settings passed to pdfplumber's extract_table().
	ts = {
	    "vertical_strategy": "text",  # "lines" is the default option that didn't work
	    "horizontal_strategy": "text",
	    "keep_blank_chars": False,  # Try with True as well if False doesn't give good results
	}

	# Open the PDF in a context manager so the underlying file is closed even if
	# table extraction or OCR fails part-way through.
	with pdfplumber.open("inventory.pdf") as pdf:
	    for page in pdf.pages:
	        # Extract first (EAN) column with pdfplumber table extractor.

	        # Select crop box e.g. with macOS Preview: rectangular crop tool, then
	        # read the crop area from the info window.
	        crop_ean = page.crop([32, 52, 110, 780])
	        # crop_ean_img = crop_ean.to_image()
	        # crop_ean_img.original.show()  # preview the crop result

	        # extract_table() returns None when no table is found in the crop;
	        # fall back to an empty list so the len() call below cannot raise.
	        crop_table_ean = crop_ean.extract_table(table_settings=ts) or []

	        # Extract the third (count) column with Tesseract OCR.

	        crop_count = page.crop([350, 71, 410, 71 + 710])
	        # Pass cropped image to pytesseract, use higher resolution (default 72)
	        # for better accuracy.
	        crop_count_img = crop_count.to_image(resolution=300)
	        # crop_count_img.original.show()

	        # Tesseract setting to read columns and only numbers.
	        ocr = pytesseract.image_to_string(
	            crop_count_img.original,
	            config="--psm 6 -c tessedit_char_whitelist=0123456789",
	        )

	        # String to list, one entry per OCR'd line. Whitespace-only entries
	        # are dropped: this removes the trailing form feed ('\x0c') garbage
	        # item as well as blank lines, neither of which is a table row.
	        ocr_list = [line for line in ocr.split("\n") if line.strip()]

	        print(f"{len(crop_table_ean)} {len(ocr_list)}")  # These counts should match reliably