simonmesmith · October 10, 2023 13:45 · simonmesmith · Jul 18, 2022
diff --git a/pytesseract_tablereader.md b/pytesseract_tablereader.md
diff --git a/pytesseract_tablereader.py b/pytesseract_tablereader.py
 import cv2
 import numpy as np
 from PIL import Image
 import pandas as pd
 import pytesseract
 from sklearn.cluster import AgglomerativeClustering

 # With thanks to these resources:
 # - https://pyimagesearch.com/2022/02/28/multi-column-table-ocr/
 # - https://stackoverflow.com/questions/33949831/how-to-remove-all-lines-and-borders-in-an-image-while-keeping-text-programmatica

 def read(image_path: str, distance_threshold=25.0) -> pd.DataFrame:
     """Read a table from an image into a data frame of cell text.

     The image is cleaned with ``preprocess``, passed to Tesseract, and the
     recognised words are grouped into rows (by their "top" value) and then
     into cells (a word whose "word_num" is 1 starts a new cell).

     Args:
         image_path: Path of the image that contains the table.
         distance_threshold: Forwarded to ``get_row_max_tops``, which uses it
             when clustering "top" values into rows.

     Returns:
         A data frame with one row per detected table row and one column per
         detected cell. It is empty when no text was recognised.
     """
     # Preprocess the image.
     img = preprocess(image_path)

     # Read the image into a Pytesseract data frame. The "text" column is
     # forced to str: left to type inference, a table holding only numbers is
     # parsed as floats, which cannot be joined into cell strings below.
     img_df = pytesseract.image_to_data(
         img,
         output_type="data.frame",
         pandas_config={"dtype": {"text": str}},
     )

     # Drop any blank text (dropna removes every row with a missing value).
     img_df = img_df.dropna()
     if img_df.empty:
         # Nothing was recognised; clustering zero words would raise.
         return pd.DataFrame()

     # Add row numbers to the dataframe. Rows are clustered by "top" value and
     # each cluster is summarised by its max "top". A word belongs to the first
     # row whose max top is >= the word's top, which is exactly what a
     # left-sided binary search over the ascending max tops returns.
     row_max_tops = get_row_max_tops(img_df, distance_threshold)
     max_tops = [max_top for _, max_top in row_max_tops]
     img_df["row_number"] = np.searchsorted(
         max_tops, img_df["top"].to_numpy(), side="left"
     )

     # Sort by row number, left, and word_num so we can build table content
     # logically (left to right within each row).
     img_df = img_df.sort_values(["row_number", "left", "word_num"])

     # Build the table content one row at a time.
     table_content = []
     for _, row_words in img_df.groupby("row_number"):
         row_content = []
         cell_content = []
         for word_num, text in zip(row_words["word_num"], row_words["text"]):
             # A first word marks the start of a new cell, so flush the
             # previous one.
             if word_num == 1 and cell_content:
                 row_content.append(" ".join(cell_content))
                 cell_content = []
             cell_content.append(str(text))
         row_content.append(" ".join(cell_content))
         table_content.append(row_content)

     # Convert the table content to a dataframe, and return it.
     return pd.DataFrame(table_content)

 def get_row_max_tops(img_df: pd.DataFrame, distance_threshold: float) -> list:
     """Cluster word "top" values into rows and return each row's max top.

     Args:
         img_df: Data frame with a "top" column holding one value per word.
         distance_threshold: Passed to ``AgglomerativeClustering`` as its
             ``distance_threshold``.

     Returns:
         A list of ``(row_index, max_top)`` tuples ordered by ascending
         ``max_top``, with ``row_index`` counting up from 0. The list is empty
         when ``img_df`` has no rows.
     """
     tops = img_df["top"].tolist()

     # Hierarchical clustering needs at least two samples; with zero or one
     # word the answer is trivial, so return it without clustering.
     if len(tops) < 2:
         return list(enumerate(tops))

     # Create coordinates to use for clustering top values for rows. We use
     # (0, y), where y is "top". x is pinned to 0 because the left value is
     # irrelevant here; only the top value should affect the distance.
     row_coordinates = [(0, top) for top in tops]

     # Cluster rows by top values.
     cluster_options = {
         "n_clusters": None,
         "linkage": "complete",
         "distance_threshold": distance_threshold,
     }
     try:
         row_clusters = AgglomerativeClustering(
             metric="manhattan", **cluster_options)
     except TypeError:
         # Older scikit-learn releases only know this option as "affinity".
         row_clusters = AgglomerativeClustering(
             affinity="manhattan", **cluster_options)
     row_clusters.fit(row_coordinates)

     # Track the largest top seen for each cluster label.
     max_top_by_label = {}
     for label, top in zip(row_clusters.labels_, tops):
         if label not in max_top_by_label or top > max_top_by_label[label]:
             max_top_by_label[label] = top

     # Return the row index and max top for each row, sorted ascending.
     return list(enumerate(sorted(max_top_by_label.values())))

 def preprocess(image_path: str) -> np.ndarray:
     """Load an image and strip backgrounds and ruling lines ahead of OCR.

     The image is converted to grayscale and thresholded with Otsu's method.
     Long horizontal and vertical strokes are then located with a
     morphological opening on the inverted image and painted over in white.

     Args:
         image_path: Path of the image to load.

     Returns:
         The thresholded single-channel image with detected lines painted
         white.

     Raises:
         OSError: If OpenCV cannot read an image from ``image_path``.
     """
     # Get the image. cv2.imread signals failure by returning None rather than
     # raising, so fail here with a clear message.
     img = cv2.imread(image_path)
     if img is None:
         raise OSError(f"Could not read an image from {image_path!r}")

     # Convert the image to grayscale.
     gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

     # Remove backgrounds.
     bg_free_img = cv2.threshold(gray_img, 0, 255, cv2.THRESH_OTSU)[1]

     # Create an inverse image to use for removing lines.
     inverted_img = ~bg_free_img

     # Remove horizontal lines, then vertical lines. Both passes detect lines
     # on the same inverted image and paint over them in the output image.
     # TODO: Set line thickness dynamically.
     for kernel_size in ((40, 1), (1, 40)):
         kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
         detected_lines = cv2.morphologyEx(
             inverted_img, cv2.MORPH_OPEN, kernel, iterations=2)
         found = cv2.findContours(
             detected_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
         # findContours returns 2 or 3 values depending on the OpenCV version;
         # the contours are first in the former case, second in the latter.
         contours = found[0] if len(found) == 2 else found[1]
         # A contour index of -1 draws every contour in the list.
         cv2.drawContours(bg_free_img, contours, -1, (255, 255, 255), 2)

     # Return the output image.
     return bg_free_img
	import cv2
	import numpy as np
	from PIL import Image
	import pandas as pd
	import pytesseract
	from sklearn.cluster import AgglomerativeClustering

	# With thanks to these resources:
	# - https://pyimagesearch.com/2022/02/28/multi-column-table-ocr/
	# - https://stackoverflow.com/questions/33949831/how-to-remove-all-lines-and-borders-in-an-image-while-keeping-text-programmatica

	def read(image_path: str, distance_threshold=25.0) -> pd.DataFrame:
	    """Read a table from an image into a data frame of cell text.

	    The image is cleaned with ``preprocess``, passed to Tesseract, and the
	    recognised words are grouped into rows (by their "top" value) and then
	    into cells (a word whose "word_num" is 1 starts a new cell).

	    Args:
	        image_path: Path of the image that contains the table.
	        distance_threshold: Forwarded to ``get_row_max_tops``, which uses
	            it when clustering "top" values into rows.

	    Returns:
	        A data frame with one row per detected table row and one column
	        per detected cell. It is empty when no text was recognised.
	    """
	    # Preprocess the image.
	    img = preprocess(image_path)

	    # Read the image into a Pytesseract data frame. The "text" column is
	    # forced to str: left to type inference, a table holding only numbers
	    # is parsed as floats, which cannot be joined into cell strings below.
	    img_df = pytesseract.image_to_data(
	        img,
	        output_type="data.frame",
	        pandas_config={"dtype": {"text": str}},
	    )

	    # Drop any blank text (dropna removes every row with a missing value).
	    img_df = img_df.dropna()
	    if img_df.empty:
	        # Nothing was recognised; clustering zero words would raise.
	        return pd.DataFrame()

	    # Add row numbers to the dataframe. Rows are clustered by "top" value
	    # and each cluster is summarised by its max "top". A word belongs to
	    # the first row whose max top is >= the word's top, which is exactly
	    # what a left-sided binary search over the ascending max tops returns.
	    row_max_tops = get_row_max_tops(img_df, distance_threshold)
	    max_tops = [max_top for _, max_top in row_max_tops]
	    img_df["row_number"] = np.searchsorted(
	        max_tops, img_df["top"].to_numpy(), side="left"
	    )

	    # Sort by row number, left, and word_num so we can build table content
	    # logically (left to right within each row).
	    img_df = img_df.sort_values(["row_number", "left", "word_num"])

	    # Build the table content one row at a time.
	    table_content = []
	    for _, row_words in img_df.groupby("row_number"):
	        row_content = []
	        cell_content = []
	        for word_num, text in zip(row_words["word_num"], row_words["text"]):
	            # A first word marks the start of a new cell, so flush the
	            # previous one.
	            if word_num == 1 and cell_content:
	                row_content.append(" ".join(cell_content))
	                cell_content = []
	            cell_content.append(str(text))
	        row_content.append(" ".join(cell_content))
	        table_content.append(row_content)

	    # Convert the table content to a dataframe, and return it.
	    return pd.DataFrame(table_content)

	def get_row_max_tops(img_df: pd.DataFrame, distance_threshold: float) -> list:
	    """Cluster word "top" values into rows and return each row's max top.

	    Args:
	        img_df: Data frame with a "top" column holding one value per word.
	        distance_threshold: Passed to ``AgglomerativeClustering`` as its
	            ``distance_threshold``.

	    Returns:
	        A list of ``(row_index, max_top)`` tuples ordered by ascending
	        ``max_top``, with ``row_index`` counting up from 0. The list is
	        empty when ``img_df`` has no rows.
	    """
	    tops = img_df["top"].tolist()

	    # Hierarchical clustering needs at least two samples; with zero or one
	    # word the answer is trivial, so return it without clustering.
	    if len(tops) < 2:
	        return list(enumerate(tops))

	    # Create coordinates to use for clustering top values for rows. We use
	    # (0, y), where y is "top". x is pinned to 0 because the left value is
	    # irrelevant here; only the top value should affect the distance.
	    row_coordinates = [(0, top) for top in tops]

	    # Cluster rows by top values.
	    cluster_options = {
	        "n_clusters": None,
	        "linkage": "complete",
	        "distance_threshold": distance_threshold,
	    }
	    try:
	        row_clusters = AgglomerativeClustering(
	            metric="manhattan", **cluster_options)
	    except TypeError:
	        # Older scikit-learn releases only know this option as "affinity".
	        row_clusters = AgglomerativeClustering(
	            affinity="manhattan", **cluster_options)
	    row_clusters.fit(row_coordinates)

	    # Track the largest top seen for each cluster label.
	    max_top_by_label = {}
	    for label, top in zip(row_clusters.labels_, tops):
	        if label not in max_top_by_label or top > max_top_by_label[label]:
	            max_top_by_label[label] = top

	    # Return the row index and max top for each row, sorted ascending.
	    return list(enumerate(sorted(max_top_by_label.values())))

	def preprocess(image_path: str) -> np.ndarray:
	    """Load an image and strip backgrounds and ruling lines ahead of OCR.

	    The image is converted to grayscale and thresholded with Otsu's method.
	    Long horizontal and vertical strokes are then located with a
	    morphological opening on the inverted image and painted over in white.

	    Args:
	        image_path: Path of the image to load.

	    Returns:
	        The thresholded single-channel image with detected lines painted
	        white.

	    Raises:
	        OSError: If OpenCV cannot read an image from ``image_path``.
	    """
	    # Get the image. cv2.imread signals failure by returning None rather
	    # than raising, so fail here with a clear message.
	    img = cv2.imread(image_path)
	    if img is None:
	        raise OSError(f"Could not read an image from {image_path!r}")

	    # Convert the image to grayscale.
	    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

	    # Remove backgrounds.
	    bg_free_img = cv2.threshold(gray_img, 0, 255, cv2.THRESH_OTSU)[1]

	    # Create an inverse image to use for removing lines.
	    inverted_img = ~bg_free_img

	    # Remove horizontal lines, then vertical lines. Both passes detect
	    # lines on the same inverted image and paint over them in the output.
	    # TODO: Set line thickness dynamically.
	    for kernel_size in ((40, 1), (1, 40)):
	        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
	        detected_lines = cv2.morphologyEx(
	            inverted_img, cv2.MORPH_OPEN, kernel, iterations=2)
	        found = cv2.findContours(
	            detected_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
	        # findContours returns 2 or 3 values depending on the OpenCV
	        # version; the contours are first in the former case, second in
	        # the latter.
	        contours = found[0] if len(found) == 2 else found[1]
	        # A contour index of -1 draws every contour in the list.
	        cv2.drawContours(bg_free_img, contours, -1, (255, 255, 255), 2)

	    # Return the output image.
	    return bg_free_img