Skip to content

Instantly share code, notes, and snippets.

@simonmesmith
Last active October 10, 2023 13:45
Show Gist options
  • Save simonmesmith/73face2a11e226f1cae2481a3927edf7 to your computer and use it in GitHub Desktop.
Save simonmesmith/73face2a11e226f1cae2481a3927edf7 to your computer and use it in GitHub Desktop.
Pytesseract table reader

Pytesseract tablereader

Here's some code that tries to solve a problem that I think I've identified with regard to converting tables in images into dataframes that we can work with programmatically.

Right now, I see two main solutions in market. One is to use third-party APIs like Amazon's and Google's products in this area, which can get expensive at scale. Another is to use or build upon complex code that uses image processing libraries like OpenCV to find gridlines and use these to determine table rows and columns.

My hypothesis is that we can use only Pytesseract to read tables, since it provides coordinates of text in images, and tables follow a standard structure (rows and columns). I've been working on the code here accordingly.

Usage is very simple:

img = "/path-to-your-image.jpg"
df = read(img)
import cv2
import numpy as np
from PIL import Image
import pandas as pd
import pytesseract
from sklearn.cluster import AgglomerativeClustering
# With thanks to these resources:
# - https://pyimagesearch.com/2022/02/28/multi-column-table-ocr/
# - https://stackoverflow.com/questions/33949831/how-to-remove-all-lines-and-borders-in-an-image-while-keeping-text-programmatica
def read(image_path: str, distance_threshold=25.0) -> pd.DataFrame:
# Preprocess the image.
img = preprocess(image_path)
# Read the image into a Pytesseract data frame.
img_df = pytesseract.image_to_data(img, output_type="data.frame")
# Drop any blank text.
img_df.dropna(inplace=True)
# Add row numbers to the dataframe. We do this by clustering rows according
# to their "top" value. We then determine the max "top" value for each row.
# Then we assign row numbers to the dataframe based on top values.
row_max_tops = get_row_max_tops(img_df, distance_threshold)
img_df["row_number"] = pd.Series([], dtype=object)
for row_number, row_max_top in row_max_tops:
if row_number > 0: lower_bound = row_max_tops[row_number - 1][1] + 1 # E.g. if the prior row has a max top of 50, the lower bound for the next row is 51
else: lower_bound = 0
upper_bound = row_max_top
img_df.loc[img_df["top"].between(lower_bound, upper_bound), "row_number"] = row_number
# Sort the dataframe by row number, left, and word_num so we can build table content logically.
img_df.sort_values(["row_number", "left", "word_num"], inplace=True)
# Build the table content.
table_content = []
for row_number in img_df["row_number"].unique():
row_content = []
cell_content = []
for _, word in img_df[img_df["row_number"] == row_number].iterrows():
if word["word_num"] == 1 and len(cell_content) > 0:
row_content.append(" ".join(cell_content))
cell_content = []
cell_content.append(word["text"])
row_content.append(" ".join(cell_content))
table_content.append(row_content)
# Convert the table content to a dataframe, and return it.
return pd.DataFrame(table_content)
def get_row_max_tops(img_df: pd.DataFrame, distance_threshold: float) -> list:
# Create coordinates to use for clustering top values for rows. Note that
# we use (0, y), where why is "top." We specify 0 for x because we don't
# care here about the left value, only the top value.
row_coordinates = [(0, row["top"]) for _, row in img_df.iterrows()]
# Cluster rows by top values.
row_clusters = AgglomerativeClustering(
n_clusters=None,
affinity="manhattan",
linkage="complete",
distance_threshold=distance_threshold)
row_clusters.fit(row_coordinates)
# Create max row tops values using row clusters and sort ascending.
row_max_tops = []
for row_index in np.unique(row_clusters.labels_):
row_coordinate_indexes = np.where(row_clusters.labels_ == row_index)[0]
row_max_top = max([row_coordinates[row_coordinate_index][1] for row_coordinate_index in row_coordinate_indexes])
row_max_tops.append(row_max_top)
row_max_tops.sort()
# Return the row index and max top for each row.
return [(i, row_max_top) for i, row_max_top in enumerate(row_max_tops)]
def preprocess(image_path: str) -> np.ndarray:
# Get the image.
img = cv2.imread(image_path)
# Convert the image to grayscale.
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Remove backgrounds.
bg_free_img = cv2.threshold(gray_img, 0, 255, cv2.THRESH_OTSU)[1]
# Create an inverse image to use for removing lines.
inverted_img = ~ bg_free_img
# Remove horizontal lines.
# TODO: Set line thickness dynamically.
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
remove_horizontal = cv2.morphologyEx(inverted_img, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
cnts = cv2.findContours(remove_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts: cv2.drawContours(bg_free_img, [c], -1, (255, 255, 255), 2)
# Remove vertical lines.
# TODO: Set line thickness dynamically.
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,40))
remove_vertical = cv2.morphologyEx(inverted_img, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
cnts = cv2.findContours(remove_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts: cv2.drawContours(bg_free_img, [c], -1, (255, 255, 255), 2)
# Return the output image.
return bg_free_img
@simonmesmith
Copy link
Author

simonmesmith commented Jul 18, 2022

To do:

  • Remove background noise before running OCR
  • Adjust approach so we’re not using an average row height, but calculating the height per row since some rows may be taller than others (requires us to cluster rows by top values, likely)

@simonmesmith
Copy link
Author

Added clustering to group rows and then calculate row numbers based on max top row values for clusters.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment