Last active
October 1, 2024 09:21
-
-
Save huks0/e48d604fc9dd91731bc687d6e3933db4 to your computer and use it in GitHub Desktop.
A table detection, cell recognition and text extraction algorithm to convert tables in images to excel files, using pytesseract and open cv.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cv2 | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import csv | |
try: | |
from PIL import Image | |
except ImportError: | |
import Image | |
import pytesseract | |
#read your file | |
file=r'/Users/marius/Desktop/Masterarbeit/Medium/Medium.png' | |
img = cv2.imread(file,0) | |
img.shape | |
#thresholding the image to a binary image | |
thresh,img_bin = cv2.threshold(img,128,255,cv2.THRESH_BINARY | cv2.THRESH_OTSU) | |
#inverting the image | |
img_bin = 255-img_bin | |
cv2.imwrite('/Users/marius/Desktop/cv_inverted.png',img_bin) | |
#Plotting the image to see the output | |
plotting = plt.imshow(img_bin,cmap='gray') | |
plt.show() | |
# countcol(width) of kernel as 100th of total width | |
kernel_len = np.array(img).shape[1]//100 | |
# Defining a vertical kernel to detect all vertical lines of image | |
ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len)) | |
# Defining a horizontal kernel to detect all horizontal lines of image | |
hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1)) | |
# A kernel of 2x2 | |
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) | |
#Use vertical kernel to detect and save the vertical lines in a jpg | |
image_1 = cv2.erode(img_bin, ver_kernel, iterations=3) | |
vertical_lines = cv2.dilate(image_1, ver_kernel, iterations=3) | |
cv2.imwrite("/Users/marius/Desktop/vertical.jpg",vertical_lines) | |
#Plot the generated image | |
plotting = plt.imshow(image_1,cmap='gray') | |
plt.show() | |
#Use horizontal kernel to detect and save the horizontal lines in a jpg | |
image_2 = cv2.erode(img_bin, hor_kernel, iterations=3) | |
horizontal_lines = cv2.dilate(image_2, hor_kernel, iterations=3) | |
cv2.imwrite("/Users/marius/Desktop/horizontal.jpg",horizontal_lines) | |
#Plot the generated image | |
plotting = plt.imshow(image_2,cmap='gray') | |
plt.show() | |
# Combine horizontal and vertical lines in a new third image, with both having same weight. | |
img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0) | |
#Eroding and thesholding the image | |
img_vh = cv2.erode(~img_vh, kernel, iterations=2) | |
thresh, img_vh = cv2.threshold(img_vh,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU) | |
cv2.imwrite("/Users/marius/Desktop/img_vh.jpg", img_vh) | |
bitxor = cv2.bitwise_xor(img,img_vh) | |
bitnot = cv2.bitwise_not(bitxor) | |
#Plotting the generated image | |
plotting = plt.imshow(bitnot,cmap='gray') | |
plt.show() | |
# Detect contours for following box detection | |
contours, hierarchy = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) | |
def sort_contours(cnts, method="left-to-right"): | |
# initialize the reverse flag and sort index | |
reverse = False | |
i = 0 | |
# handle if we need to sort in reverse | |
if method == "right-to-left" or method == "bottom-to-top": | |
reverse = True | |
# handle if we are sorting against the y-coordinate rather than | |
# the x-coordinate of the bounding box | |
if method == "top-to-bottom" or method == "bottom-to-top": | |
i = 1 | |
# construct the list of bounding boxes and sort them from top to | |
# bottom | |
boundingBoxes = [cv2.boundingRect(c) for c in cnts] | |
(cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes), | |
key=lambda b:b[1][i], reverse=reverse)) | |
# return the list of sorted contours and bounding boxes | |
return (cnts, boundingBoxes) | |
# Sort all the contours by top to bottom. | |
contours, boundingBoxes = sort_contours(contours, method="top-to-bottom") | |
#Creating a list of heights for all detected boxes | |
heights = [boundingBoxes[i][3] for i in range(len(boundingBoxes))] | |
#Get mean of heights | |
mean = np.mean(heights) | |
#Create list box to store all boxes in | |
box = [] | |
# Get position (x,y), width and height for every contour and show the contour on image | |
for c in contours: | |
x, y, w, h = cv2.boundingRect(c) | |
if (w<1000 and h<500): | |
image = cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2) | |
box.append([x,y,w,h]) | |
plotting = plt.imshow(image,cmap='gray') | |
plt.show() | |
#Creating two lists to define row and column in which cell is located | |
row=[] | |
column=[] | |
j=0 | |
#Sorting the boxes to their respective row and column | |
for i in range(len(box)): | |
if(i==0): | |
column.append(box[i]) | |
previous=box[i] | |
else: | |
if(box[i][1]<=previous[1]+mean/2): | |
column.append(box[i]) | |
previous=box[i] | |
if(i==len(box)-1): | |
row.append(column) | |
else: | |
row.append(column) | |
column=[] | |
previous = box[i] | |
column.append(box[i]) | |
print(column) | |
print(row) | |
#calculating maximum number of cells | |
countcol = 0 | |
for i in range(len(row)): | |
countcol = len(row[i]) | |
if countcol > countcol: | |
countcol = countcol | |
#Retrieving the center of each column | |
center = [int(row[i][j][0]+row[i][j][2]/2) for j in range(len(row[i])) if row[0]] | |
center=np.array(center) | |
center.sort() | |
print(center) | |
#Regarding the distance to the columns center, the boxes are arranged in respective order | |
finalboxes = [] | |
for i in range(len(row)): | |
lis=[] | |
for k in range(countcol): | |
lis.append([]) | |
for j in range(len(row[i])): | |
diff = abs(center-(row[i][j][0]+row[i][j][2]/4)) | |
minimum = min(diff) | |
indexing = list(diff).index(minimum) | |
lis[indexing].append(row[i][j]) | |
finalboxes.append(lis) | |
#from every single image-based cell/box the strings are extracted via pytesseract and stored in a list | |
outer=[] | |
for i in range(len(finalboxes)): | |
for j in range(len(finalboxes[i])): | |
inner='' | |
if(len(finalboxes[i][j])==0): | |
outer.append(' ') | |
else: | |
for k in range(len(finalboxes[i][j])): | |
y,x,w,h = finalboxes[i][j][k][0],finalboxes[i][j][k][1], finalboxes[i][j][k][2],finalboxes[i][j][k][3] | |
finalimg = bitnot[x:x+h, y:y+w] | |
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1)) | |
border = cv2.copyMakeBorder(finalimg,2,2,2,2, cv2.BORDER_CONSTANT,value=[255,255]) | |
resizing = cv2.resize(border, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC) | |
dilation = cv2.dilate(resizing, kernel,iterations=1) | |
erosion = cv2.erode(dilation, kernel,iterations=2) | |
out = pytesseract.image_to_string(erosion) | |
if(len(out)==0): | |
out = pytesseract.image_to_string(erosion, config='--psm 3') | |
inner = inner +" "+ out | |
outer.append(inner) | |
#Creating a dataframe of the generated OCR list | |
arr = np.array(outer) | |
dataframe = pd.DataFrame(arr.reshape(len(row), countcol)) | |
print(dataframe) | |
data = dataframe.style.set_properties(align="left") | |
#Converting it in a excel-file | |
data.to_excel("/Users/marius/Desktop/output.xlsx") |
How can we get the co-ordinates of the cells
Hi,
In 175 lines you have not declared the bitnot because when I run this code it give me an error Undefined bitnot . How solve it or which bitnot images .please tell me.
On line 182 and 184, wouldn't it be better to only OCR just once (just on line 182), by specifying a general configuration, like
pytesseract.image_to_string(erosion, config='-l eng --psm 3')
Then check if the length of the text equals zero and decide whether to append it into the existing variable string?
Cause the already existing solution you've created, perform OCR twice which increases the run time.
Hi your code worked perfectly thanks. just added a loop to to run through an image folder and append tables to excel file.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I ran it on your medium blog image it shows errors.
check here
Amount values are missing. Also, it read the text outside the table.