import cv2 |
import numpy as np |
from PIL import Image |
import pandas as pd |
import pytesseract |
from sklearn.cluster import AgglomerativeClustering |
# With thanks to these resources: |
# - https://pyimagesearch.com/2022/02/28/multi-column-table-ocr/ |
# - https://stackoverflow.com/questions/33949831/how-to-remove-all-lines-and-borders-in-an-image-while-keeping-text-programmatica |
def read(image_path: str, distance_threshold: float = 25.0) -> pd.DataFrame:
    """Read a table from an image and return its cells as a data frame.

    The image is preprocessed, passed through Tesseract, and the recognized
    words are grouped into rows by clustering their "top" coordinates. Within
    a row, words are ordered left to right and joined into cells; a new cell
    starts whenever Tesseract reports ``word_num == 1``.

    Args:
        image_path: Path of the image file to read.
        distance_threshold: Clustering distance threshold forwarded to
            ``get_row_max_tops``; it controls how far apart "top" values may
            be while still belonging to the same row.

    Returns:
        A data frame with one row per detected table row and one column per
        cell. An empty data frame is returned when no text is recognized.
    """
    # Preprocess the image.
    img = preprocess(image_path)

    # Read the image into a Pytesseract data frame.
    img_df = pytesseract.image_to_data(img, output_type="data.frame")

    # Drop any blank text.
    img_df.dropna(inplace=True)

    # Nothing was recognized: there are no rows to cluster, so return an
    # empty table instead of failing inside the clustering step.
    if img_df.empty:
        return pd.DataFrame([])

    # Add row numbers to the dataframe. We cluster words by their "top" value,
    # take the max "top" of each cluster, and then assign each word the number
    # of the first row whose max "top" is at or below it.
    row_max_tops = get_row_max_tops(img_df, distance_threshold)
    img_df["row_number"] = pd.Series([], dtype=object)
    lower_bound = 0
    for row_number, row_max_top in row_max_tops:
        in_row = img_df["top"].between(lower_bound, row_max_top)
        img_df.loc[in_row, "row_number"] = row_number
        # E.g. if this row has a max top of 50, the next row starts at 51.
        lower_bound = row_max_top + 1

    # Sort by row number, left, and word_num so the table content can be
    # built in reading order.
    img_df.sort_values(["row_number", "left", "word_num"], inplace=True)

    # Build the table content.
    table_content = []
    for row_number in img_df["row_number"].unique():
        row_words = img_df[img_df["row_number"] == row_number]
        row_content = []
        cell_content = []
        for word_num, text in zip(row_words["word_num"], row_words["text"]):
            # word_num restarts at 1 for the first word of a new cell.
            if word_num == 1 and cell_content:
                row_content.append(" ".join(cell_content))
                cell_content = []
            # The text column may have been parsed as numbers when every
            # recognized word is numeric; str.join needs strings.
            cell_content.append(str(text))
        row_content.append(" ".join(cell_content))
        table_content.append(row_content)

    # Convert the table content to a dataframe, and return it.
    return pd.DataFrame(table_content)
def get_row_max_tops(img_df: pd.DataFrame, distance_threshold: float) -> list:
    """Cluster words into rows and return each row's largest "top" value.

    Args:
        img_df: Data frame with a numeric "top" column, one entry per word.
        distance_threshold: Linkage distance at or above which clusters are
            not merged (passed to ``AgglomerativeClustering``).

    Returns:
        A list of ``(row_index, row_max_top)`` tuples sorted by ascending
        ``row_max_top``; ``row_index`` counts up from 0.
    """
    tops = img_df["top"].tolist()

    # Agglomerative clustering requires at least two samples. Zero words form
    # no rows and a single word forms exactly one row, so answer directly.
    if len(tops) < 2:
        return list(enumerate(tops))

    # Cluster on the "top" (y) value alone; the "left" value is irrelevant
    # for finding rows. With a single feature the default Euclidean metric
    # equals the Manhattan distance, so no metric argument is needed. This
    # avoids the `affinity` keyword, which newer scikit-learn releases no
    # longer accept.
    tops_array = np.asarray(tops)
    row_coordinates = tops_array.astype(float).reshape(-1, 1)
    row_clusters = AgglomerativeClustering(
        n_clusters=None,
        linkage="complete",
        distance_threshold=distance_threshold,
    )
    labels = row_clusters.fit(row_coordinates).labels_

    # Take the max "top" of every cluster and sort ascending so that row
    # indexes follow the vertical order of the rows.
    row_max_tops = sorted(
        tops_array[labels == label].max().item() for label in np.unique(labels)
    )

    # Return the row index and max top for each row.
    return list(enumerate(row_max_tops))
def preprocess(image_path: str) -> np.ndarray:
    """Load an image, binarize it, and paint over long straight lines.

    The image is converted to grayscale and thresholded with Otsu's method.
    Long horizontal and vertical runs are then located on the inverted image
    with a morphological opening, and their contours are drawn onto the
    thresholded image with value 255 so that table borders do not reach OCR.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The thresholded single-channel image with the detected lines drawn
        over.
    """
    source = cv2.imread(image_path)
    grayscale = cv2.cvtColor(source, cv2.COLOR_BGR2GRAY)

    # Otsu thresholding removes backgrounds.
    _, binary = cv2.threshold(grayscale, 0, 255, cv2.THRESH_OTSU)

    # Lines are detected on the inverse image; it is computed once, before
    # anything is drawn, so both passes see the same input.
    negative = np.invert(binary)

    fill_color = (255, 255, 255)
    # A 40x1 kernel picks out horizontal lines, a 1x40 kernel vertical ones.
    # TODO: Set line thickness dynamically.
    for kernel_size in ((40, 1), (1, 40)):
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
        line_mask = cv2.morphologyEx(
            negative, cv2.MORPH_OPEN, kernel, iterations=2
        )
        found = cv2.findContours(
            line_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        # findContours returns 2 or 3 values depending on the OpenCV version;
        # pick the contour list in either case.
        contours = found[0] if len(found) == 2 else found[1]
        for contour in contours:
            cv2.drawContours(binary, [contour], -1, fill_color, 2)

    return binary
# Change note: added clustering to group words into rows, then assign row numbers based on each cluster's maximum "top" value.