|
import cv2 |
|
import numpy as np |
|
from PIL import Image |
|
import pandas as pd |
|
import pytesseract |
|
from sklearn.cluster import AgglomerativeClustering |
|
|
|
# With thanks to these resources: |
|
# - https://pyimagesearch.com/2022/02/28/multi-column-table-ocr/ |
|
# - https://stackoverflow.com/questions/33949831/how-to-remove-all-lines-and-borders-in-an-image-while-keeping-text-programmatica |
|
|
|
def read(image_path: str, distance_threshold=25.0) -> pd.DataFrame:
    """Extract a table from an image with OCR and return it as a data frame.

    The image is cleaned up with ``preprocess`` and passed to Tesseract. The
    recognised words are then grouped into table rows according to their
    "top" coordinate, ordered left to right within each row, and joined into
    cells. A new cell starts at every word whose ``word_num`` is 1.

    Args:
        image_path: Path of the image file to read.
        distance_threshold: Threshold forwarded to ``get_row_max_tops``, which
            uses it when clustering "top" values into rows.

    Returns:
        A data frame with one row per detected table row and one column per
        cell. The frame is empty when no text was recognised.
    """
    # Preprocess the image.
    img = preprocess(image_path)

    # Read the image into a Pytesseract data frame.
    img_df = pytesseract.image_to_data(img, output_type="data.frame")

    # Drop any blank text.
    img_df.dropna(inplace=True)

    # With no recognised words there is nothing to cluster; return an empty
    # table instead of handing an empty frame to the clustering step.
    if img_df.empty:
        return pd.DataFrame([])

    # Add row numbers to the dataframe. Rows are found by clustering the
    # "top" values; each cluster is represented by its max "top". A word
    # belongs to the first row whose max "top" is at or below its own "top".
    row_max_tops = get_row_max_tops(img_df, distance_threshold)
    img_df["row_number"] = pd.Series([], dtype=object)
    lower_bound = 0
    for row_number, row_max_top in row_max_tops:
        in_row = img_df["top"].between(lower_bound, row_max_top)
        img_df.loc[in_row, "row_number"] = row_number
        # E.g. if this row has a max top of 50, the next row starts at 51.
        lower_bound = row_max_top + 1

    # Sort by row number, left, and word_num so we can build table content
    # logically.
    img_df.sort_values(["row_number", "left", "word_num"], inplace=True)

    # Build the table content one row at a time. sort=False keeps the groups
    # in the (already sorted) order in which they appear in the frame.
    table_content = []
    for _, row_words in img_df.groupby("row_number", sort=False):
        row_content = []
        cell_words = []
        for word_num, text in zip(row_words["word_num"], row_words["text"]):
            # word_num == 1 marks the first word of a new cell; flush the
            # words collected for the previous cell.
            if word_num == 1 and cell_words:
                row_content.append(" ".join(cell_words))
                cell_words = []
            # Coerce to str: str.join raises TypeError on non-string values
            # (presumably possible if the text column was parsed as numeric).
            cell_words.append(str(text))
        row_content.append(" ".join(cell_words))
        table_content.append(row_content)

    # Convert the table content to a dataframe, and return it.
    return pd.DataFrame(table_content)
|
|
|
def get_row_max_tops(img_df: pd.DataFrame, distance_threshold: float) -> list:
    """Cluster words into rows by their "top" value and return each row's max.

    Args:
        img_df: Data frame with a "top" column (one entry per word).
        distance_threshold: Linkage distance threshold passed to
            ``AgglomerativeClustering``; clusters are not merged at or above
            this distance.

    Returns:
        A list of ``(row_index, max_top)`` tuples sorted by ascending
        ``max_top``, where ``row_index`` counts up from 0. The list is empty
        when ``img_df`` has no rows.
    """
    tops = img_df["top"].tolist()

    # AgglomerativeClustering needs at least two samples, so handle the
    # trivial cases directly: no words means no rows, one word means one row.
    if not tops:
        return []
    if len(tops) == 1:
        return [(0, tops[0])]

    # Create coordinates to use for clustering top values for rows. Note that
    # we use (0, y), where y is "top." We specify 0 for x because we don't
    # care here about the left value, only the top value.
    row_coordinates = [(0, top) for top in tops]

    # Cluster rows by top values. scikit-learn renamed ``affinity`` to
    # ``metric`` (deprecated in 1.2, removed in 1.4), so prefer the new
    # keyword and fall back to the old one on releases that lack it.
    cluster_kwargs = {
        "n_clusters": None,
        "linkage": "complete",
        "distance_threshold": distance_threshold,
    }
    try:
        row_clusters = AgglomerativeClustering(metric="manhattan", **cluster_kwargs)
    except TypeError:
        row_clusters = AgglomerativeClustering(affinity="manhattan", **cluster_kwargs)
    row_clusters.fit(row_coordinates)

    # Find the max top of every cluster in a single pass over the words.
    max_top_by_label = {}
    for label, top in zip(row_clusters.labels_, tops):
        if label not in max_top_by_label or top > max_top_by_label[label]:
            max_top_by_label[label] = top

    # Return the row index and max top for each row, sorted ascending.
    return list(enumerate(sorted(max_top_by_label.values())))
|
|
|
def preprocess(image_path: str) -> np.ndarray:
    """Load an image and prepare it for OCR by removing ruling lines.

    The image is converted to grayscale, binarised with Otsu's threshold, and
    long horizontal and vertical lines are painted over in white so that only
    the text remains.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The thresholded image with detected lines painted white.

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    # Get the image. cv2.imread signals failure by returning None rather than
    # raising, so check explicitly to give a clear error.
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not read image: {image_path!r}")

    # Convert the image to grayscale.
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Remove backgrounds.
    bg_free_img = cv2.threshold(gray_img, 0, 255, cv2.THRESH_OTSU)[1]

    # Create an inverse image to use for removing lines. It is computed once,
    # before any drawing, so both passes detect lines on the same input.
    inverted_img = ~bg_free_img

    # Remove horizontal lines, then vertical lines.
    # TODO: Set line thickness dynamically.
    _erase_lines(bg_free_img, inverted_img, (40, 1))
    _erase_lines(bg_free_img, inverted_img, (1, 40))

    # Return the output image.
    return bg_free_img


def _erase_lines(binary_img, inverted_img, kernel_size):
    """Paint lines matching ``kernel_size`` in ``inverted_img`` white on ``binary_img``.

    ``binary_img`` is modified in place.
    """
    # Morphological opening with a long, thin rectangle keeps only the
    # structures that can contain that rectangle.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    detected = cv2.morphologyEx(inverted_img, cv2.MORPH_OPEN, kernel, iterations=2)

    # findContours returns a 2- or 3-element tuple depending on the OpenCV
    # version; pick out the contour list in either case.
    cnts = cv2.findContours(detected, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        cv2.drawContours(binary_img, [c], -1, (255, 255, 255), 2)
Added clustering to group words into rows, then calculate row numbers from the maximum "top" value of each cluster.