Created
September 12, 2019 15:23
-
-
Save mndrake/134b8e71f414ffbc4d34131a91aa82e0 to your computer and use it in GitHub Desktop.
HOCR output parsing from pytesseract
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# In[11]: | |
# dependencies | |
import pytesseract | |
from bs4 import BeautifulSoup | |
from PIL import Image | |
import re | |
import numpy as np | |
import math | |
import cv2 | |
import itertools | |
from collections import namedtuple | |
import matplotlib.pyplot as plt | |
# Notebook-only setup: applies the IPython ``%matplotlib inline`` line magic.
# NOTE(review): `get_ipython` is not imported in this file; presumably it is
# provided by the IPython/Jupyter session this script was exported from.
get_ipython().run_line_magic('matplotlib', 'inline')
# In[12]: | |
# utility methods | |
def pre_process(image):
    """
    Clean up a page image before further processing.

    The image is converted to greyscale, binarised with a fixed threshold of
    127, and then a 2x2 morphological closing is applied to the dark pixels.

    :param image: image object supporting ``convert('L')`` (e.g. a PIL image).
    :return: a new PIL image built from the cleaned array; the input object
        itself is not written to.
    """
    grey = np.array(image.convert('L'))

    # Fixed-level binarisation; only the thresholded image ([1]) is needed.
    binary = cv2.threshold(grey, 127, 255, cv2.THRESH_BINARY)[1]

    # The closing is applied to the inverted image so that the originally
    # dark pixels are the bright ones during the operation; the inversion is
    # undone on the way out.
    structuring_element = np.ones((2, 2), np.uint8)
    closed = cv2.morphologyEx(255 - binary, cv2.MORPH_CLOSE, structuring_element)
    return Image.fromarray(255 - closed)
def gray2rgb(image):
    """
    Return *image* as a numpy array converted with ``cv2.COLOR_GRAY2RGB``.

    :param image: anything ``np.array`` accepts (PIL image or ndarray).
        # assumes single-channel (greyscale) input -- TODO confirm for all callers
    """
    return cv2.cvtColor(np.array(image), cv2.COLOR_GRAY2RGB)
# In[13]: | |
# main methods | |
# Rectangle given by its four edges, in the order left, top, right, bottom.
BoundingBox = namedtuple('BoundingBox', ['left', 'top', 'right', 'bottom'])
class Word:
    """A single word built from an hOCR ``ocrx_word`` element."""

    # Class-level defaults; __init__ assigns all four on every instance.
    text = None
    conf = None
    bbox = None
    id = None

    def __init__(self, tag):
        """
        :param tag: parsed element exposing ``.text`` and ``.get()``. Its
            ``title`` attribute must contain ``bbox`` and ``x_wconf`` entries
            and it must have an ``id`` attribute.
        """
        self.text = tag.text
        self.conf = tag_conf(tag)
        self.bbox = tag_bbox(tag)
        # Keep only what follows the "word_" prefix of the element id.
        element_id = tag.get('id')
        self.id = element_id.replace('word_', '')

    def __str__(self):
        """The recognised text."""
        return self.text

    def __repr__(self):
        return f'Word<{self.id}>'

    def get_numeric(self):
        """Return the word's text parsed as a number (NaN when not numeric)."""
        return convert_to_numeric(self.text)
class Block:
    """
    A group of words treated as one unit.

    Holds the bounding box enclosing all member words, their texts joined by
    single spaces, and the member words in insertion order.
    """

    # Class-level defaults are all None. `words` is deliberately not a list
    # literal here: a class-level list would be one object shared by every
    # instance. Each instance gets its own list in __init__.
    bbox = None
    text = None
    words = None

    def __init__(self, word: 'Word'):
        """Start a block containing only *word*."""
        self.bbox = word.bbox
        self.text = word.text
        self.words = [word]

    def add(self, word):
        """Append *word* and grow the bounding box to enclose it."""
        self.bbox = BoundingBox(
            min(word.bbox.left, self.bbox.left),
            min(word.bbox.top, self.bbox.top),
            max(word.bbox.right, self.bbox.right),
            max(word.bbox.bottom, self.bbox.bottom))
        self.text += ' ' + word.text
        self.words.append(word)

    def get_numeric(self):
        """Return the block's joined text parsed as a number (NaN when not numeric)."""
        return convert_to_numeric(self.text)
class Line:
    """
    One text line built from an hOCR ``ocr_line`` element.

    Exposes the line's bounding box, its words, and those words grouped into
    blocks of horizontally close neighbours.
    """

    # Class-level defaults; __init__ assigns all four on every instance.
    bbox = None
    words = None
    blocks = None
    id = None

    def __init__(self, tag, h_toll=30):
        """
        :param tag: parsed line element; its ``title`` must contain a ``bbox``
            entry and it must have an ``id`` attribute.
        :param h_toll: horizontal tolerance, in the same units as the bbox
            coordinates. A word joins the current block when the gap between
            its left edge and the block's right edge is smaller than this.
        """
        self.bbox = tag_bbox(tag)
        self.words = [Word(word) for word in tag.find_all('span', 'ocrx_word')]
        self.id = tag.get('id').replace('line_', '')
        self.blocks = self._group_into_blocks(self.words, h_toll)

    @staticmethod
    def _group_into_blocks(words, h_toll):
        """Merge consecutive words whose horizontal gap is below *h_toll*.

        Returns an empty list for an empty *words* list, so a line element
        without any word children no longer raises IndexError.
        """
        blocks = []
        for word in words:
            if blocks and (word.bbox.left - blocks[-1].bbox.right) < h_toll:
                blocks[-1].add(word)
            else:
                blocks.append(Block(word))
        return blocks

    def __repr__(self):
        return f'Line<{self.id}>'

    def __str__(self):
        """All word texts joined by single spaces."""
        return ' '.join([str(word) for word in self.words])

    def get_text(self):
        """Same as ``str(self)``."""
        return str(self)
class Page:
    """
    OCR result for one page image.

    Construction runs Tesseract (via pytesseract) on the image, parses the
    returned hOCR with BeautifulSoup and builds one ``Line`` per ``ocr_line``
    element found inside the ``ocr_page`` element.
    """

    # Class-level defaults; __init__ assigns all four on every instance.
    image = None
    bbox = None
    lines = None
    id = None

    def __init__(self, image: Image, id=0, h_toll=30):
        """
        :param image: the page image, handed to pytesseract as-is and kept on
            the instance for the plot_* methods.
        :param id: identifier shown in ``repr``.
        :param h_toll: horizontal tolerance forwarded to every ``Line``.
        """
        self.id = id
        self.image = image
        hocr = pytesseract.image_to_pdf_or_hocr(
            image, lang='eng', config='--oem 3 --psm 4', extension='hocr')
        page_tag = BeautifulSoup(hocr, 'html.parser').find('div', 'ocr_page')
        self.bbox = tag_bbox(page_tag)
        self.lines = [
            Line(line_tag, h_toll)
            for line_tag in page_tag.find_all('span', 'ocr_line')
        ]

    def __repr__(self):
        return f'Page<{self.id}>'

    def get_words(self):
        """Yield every word of the page, line by line."""
        for line in self.lines:
            yield from line.words

    def get_blocks(self):
        """Yield every block of the page, line by line."""
        for line in self.lines:
            yield from line.blocks

    def plot_blocks(self):
        """Return a copy of the page with blocks shaded by numeric-ness.

        Blocks whose text parses as a number get (255,0,0); all others get
        (0,255,0).
        """
        canvas = gray2rgb(self.image)
        numeric_color = (255, 0, 0)
        other_color = (0, 255, 0)
        for block in self.get_blocks():
            not_numeric = np.isnan(block.get_numeric())
            shade = other_color if not_numeric else numeric_color
            draw_rect(canvas, block.bbox, shade, 0.2)
        return Image.fromarray(canvas)

    def plot_words(self):
        """Return a copy of the page with every word's box shaded."""
        canvas = gray2rgb(self.image)
        for word in self.get_words():
            draw_rect(canvas, word.bbox, (100, 100, 0), 0.2)
        return Image.fromarray(canvas)

    def plot_lines(self):
        """Return a copy of the page with every line's box shaded."""
        canvas = gray2rgb(self.image)
        for line in self.lines:
            draw_rect(canvas, line.bbox, (0, 100, 100), 0.2)
        return Image.fromarray(canvas)

    def get_text(self):
        """Return the text of all lines, one line per row."""
        return '\n'.join([line.get_text() for line in self.lines])
def tag_bbox(tag):
    """
    Read the ``bbox`` entry from an element's ``title`` attribute.

    :param tag: object whose ``get('title')`` returns a string containing
        ``bbox <int> <int> <int> <int>``.
    :return: ``BoundingBox`` built from the four integers, in the order they
        appear.
    """
    found = re.search(r'bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)', tag.get('title'))
    left, top, right, bottom = map(int, found.groups())
    return BoundingBox(left, top, right, bottom)
def tag_conf(tag):
    """
    Read the ``x_wconf`` entry from an element's ``title`` attribute.

    :param tag: object whose ``get('title')`` returns a string containing
        ``x_wconf <digits>``.
    :return: the digits as an ``int``.
    """
    found = re.search(r'x_wconf ([0-9]+)', tag.get('title'))
    return int(found[1])
def draw_rect(img, bb, color, alpha):
    """
    Draw rectangle *bb* onto *img* in place: a blended fill plus an outline.

    :param img: image array; it is modified and also returned.
    :param bb: object with ``left``, ``top``, ``right`` and ``bottom``.
    :param color: colour passed to ``cv2.rectangle``.
    :param alpha: weight of the filled copy in the blend; *img* gets
        ``1 - alpha``.
    :return: *img*.
    """
    top_left = (bb.left, bb.top)
    bottom_right = (bb.right, bb.bottom)

    # Fill the rectangle on a copy, then blend that copy back into img.
    filled = img.copy()
    cv2.rectangle(filled, top_left, bottom_right, color, -1)
    cv2.addWeighted(filled, alpha, img, 1 - alpha, 0, img)

    # Outline of thickness 2 drawn directly on img, after the blend.
    cv2.rectangle(img, top_left, bottom_right, color, 2)
    return img
def convert_to_numeric(value):
    """
    Parse *value* as a float, tolerating common currency formatting.

    ``str(value)`` is cleaned before parsing: spaces, ``$``, ``|``, ``,`` and
    ``)`` are removed and ``(`` becomes a minus sign, so ``"(1,200)"`` parses
    as ``-1200.0``.

    :param value: any object; its ``str()`` form is what gets parsed.
    :return: the parsed ``float``, or ``numpy.nan`` when the cleaned text is
        not a valid number.
    """
    cleaned = str(value)
    for unwanted in (" ", "$", "|", ",", ")"):
        cleaned = cleaned.replace(unwanted, "")
    cleaned = cleaned.replace("(", "-")

    # Builtin float is used instead of the removed `np.float` alias. With the
    # alias gone, a bare `except` would hide the AttributeError and turn every
    # input into NaN, so only the parse failure is caught here.
    try:
        return float(cleaned)
    except ValueError:
        return np.nan
# In[14]: | |
# Choose the page image to analyse. Exactly one path is active; the
# commented-out lines are alternative inputs.
#image = Image.open('working/png/00053475_12.png')
#image = Image.open('working/png/00178090_12.png')
image = Image.open('working/png/00468115_12.png')
#image = Image.open('working/png/00477955_10.png')
#image = Image.open('working/png/00542515_12.png')
#image = Image.open('working/png/00553535_11.png')
#image = Image.open('working/png/00782931_11.png')
# In[15]:
# Disabled alternative: load a different page and crop it before OCR.
#image = Image.open('working/png/00030177_17.png')
#image = image.crop((200,200, 2250,3200))
# In[16]:
# Run OCR on the chosen image; `page` is used by all the cells below.
page = Page(image)
# In[17]: | |
#print(page.get_text()) | |
#page.image | |
# In[18]: | |
#img = np.array(page.image) | |
#h_concentration = cv2.reduce(img, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32S) | |
#plt.plot(h_concentration[0]) | |
# In[19]: | |
#page.plot_blocks() | |
# In[20]: | |
#img = np.array(page.image) | |
#for block in page.get_blocks(): | |
# draw_rect(img, block.bbox, (0,0,0), 1) | |
#h_concentration = cv2.reduce(img, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32S) | |
#plt.plot(h_concentration[0]) | |
#Image.fromarray(img) | |
# In[21]: | |
#numeric_blocks = [block for block in page.get_blocks() if not np.isnan(block.get_numeric())] | |
# Build a mask the size of the page: all pixels set to 255, then every block
# painted with colour 0 at full opacity (alpha = 1).
img = np.array(page.image)
img.fill(255)
#for block in numeric_blocks:
for block in page.get_blocks():
    draw_rect(img, block.bbox, 0, 1)
#Image.fromarray(img)
# Average of the mask along dimension 0 (one value per x position).
h_concentration = cv2.reduce(img, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32S)
# Flatten to 1-D.
# NOTE(review): assumes the image width equals page.bbox.right -- confirm.
h_concentration = np.reshape(h_concentration, page.bbox.right)
#plt.plot(h_concentration[0])
# Average of the mask along dimension 1 (one value per y position).
v_concentration = cv2.reduce(img, 1, cv2.REDUCE_AVG, dtype=cv2.CV_32S)
# NOTE(review): assumes the image height equals page.bbox.bottom -- confirm.
v_concentration = np.reshape(v_concentration, page.bbox.bottom)
plt.plot(v_concentration)
# In[22]: | |
page.plot_blocks()
#page.plot_words()
# In[23]:
# Overlay the two profiles on the page: one line per x position, then one
# line per y position, coloured by whether the average exceeds `threshold`.
# NOTE(review): indentation was lost in this copy of the file; each
# cv2.addWeighted call is assumed to run once, after its loop -- confirm.
#img = np.array(page.image)
img = gray2rgb(page.image)
overlay = img.copy()
alpha = 0.4  # weight of `overlay` in the blend; `img` gets 1 - alpha
threshold = 230  # averages above this are drawn with colour 0
for i, x in enumerate(h_concentration):
    # Positions at or below the threshold are drawn in (0,255,0).
    color = 0 if int(x) > threshold else (0,255,0)
    cv2.line(overlay, (i,0), (i,page.bbox.bottom), color)
cv2.addWeighted(overlay, alpha, img, 1-alpha, 0, img)
# `overlay` is not reset here, so it still carries the vertical lines drawn
# above when the horizontal ones are added and blended a second time.
for i, x in enumerate(v_concentration):
    # Positions at or below the threshold are drawn in (255,0,0).
    color = 0 if int(x) > threshold else (255,0,0)
    cv2.line(overlay, (0,i), (page.bbox.right,i), color)
cv2.addWeighted(overlay, alpha, img, 1-alpha, 0, img)
Image.fromarray(img)
# In[ ]:
# In[ ]:
# In[ ]:
# In[24]:
page.plot_blocks()
# In[25]: | |
#! pip install scikit-learn | |
# In[ ]: | |
# In[26]: | |
from sklearn.cluster import KMeans | |
# In[27]: | |
# Vertical centre of every OCR line. The sum is parenthesised so the midpoint
# is (top + bottom) // 2; without the parentheses only `bottom` is halved.
y_centroids = [(line.bbox.top + line.bbox.bottom) // 2 for line in page.lines]
# Reshape to a single-column 2-D array, the sample layout KMeans.fit expects.
y_centroids = np.reshape(y_centroids, (-1, 1))
# In[28]:
y_centroids.shape
# In[41]:
# Cluster the lines by vertical position. At most 11 clusters (one per colour
# in the palette below), and never more clusters than there are lines, since
# KMeans rejects n_clusters > n_samples.
n_clusters = min(11, len(page.lines))
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(y_centroids)
# In[42]:
#page.lines
# In[43]:
#kmeans.labels_
# In[44]:
# Shade every line with the colour of the cluster it was assigned to.
img = gray2rgb(image)
colors = [(255,0,0), (0,255,0), (0,0,255),(100,100,0),(100,0,100),(0,100,100),(100,100,100), (255,255,0), (0,255,255), (0,150,150), (150,150,150)]
for i, line in enumerate(page.lines):
    color = colors[kmeans.labels_[i]]
    draw_rect(img, line.bbox, color, 0.2)
Image.fromarray(img)
# In[33]: | |
import os | |
import cv2 | |
import imutils | |
# In[49]: | |
# Find text regions directly from the pixels (no OCR): binarise, dilate the
# dark pixels into solid blobs, then keep contour boxes of plausible height.
img = np.array(image)
morph_size = (4, 12)  # size handed to getStructuringElement for the dilation
min_text_height_limit = 20  # boxes must be strictly taller than this
max_text_height_limit = 100  # ... and strictly shorter than this
# Otsu threshold
img = cv2.threshold(img, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# dilate the text to make it solid spot (done on the inverted image, then
# inverted back)
cpy = img.copy()
struct = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
cpy = cv2.dilate(~cpy, struct, anchor=(-1, -1), iterations=1)
img = ~cpy
contours = cv2.findContours(img, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
# findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
# (image, contours, hierarchy) on 3.x; the contour list is second-from-last
# in both layouts, whereas index 0 is the image on 3.x.
contours = contours[-2]
# Getting the texts bounding boxes based on the text size assumptions
boxes = []
for contour in contours:
    box = cv2.boundingRect(contour)  # (x, y, w, h)
    h = box[3]
    if min_text_height_limit < h < max_text_height_limit:
        boxes.append(box)
# Shade the surviving boxes on an RGB copy of the page.
img = gray2rgb(image)
for box in boxes:
    b = BoundingBox(box[0], box[1], box[0] + box[2], box[1] + box[3])
    draw_rect(img, b, (255,0,0), 0.2)
Image.fromarray(img)
# In[ ]: | |
# In[ ]: | |
# In[ ]: | |
# In[ ]: | |
# In[79]: | |
def pre_process_image(img, morph_size=(8, 8)):
    """
    Binarise *img* and dilate its dark pixels into solid blobs.

    :param img: image array handed straight to ``cv2.threshold``.
        # assumes a single-channel 8-bit image -- TODO confirm
    :param morph_size: size passed to ``cv2.getStructuringElement`` for the
        rectangular dilation kernel.
    :return: the processed image array; *img* is not modified.
    """
    # NOTE(review): THRESH_OTSU is set, so OpenCV presumably picks the
    # threshold itself and the 250 is not used -- confirm against the docs.
    binary = cv2.threshold(img, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    # Dilate the inverted image so the originally dark pixels are the ones
    # that grow, then invert back.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
    dilated = cv2.dilate(~binary, kernel, anchor=(-1, -1), iterations=1)
    return ~dilated
def find_text_boxes(pre, min_text_height_limit=6, max_text_height_limit=40):
    """
    Return bounding boxes of contours in *pre* whose height looks like text.

    :param pre: image array passed to ``cv2.findContours``.
    :param min_text_height_limit: boxes must be strictly taller than this.
    :param max_text_height_limit: boxes must be strictly shorter than this.
    :return: list of ``cv2.boundingRect`` results, i.e. ``(x, y, w, h)``
        tuples, in the order the contours were returned.
    """
    found = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    # OpenCV 2.x/4.x return (contours, hierarchy) while 3.x returns
    # (image, contours, hierarchy). Index -2 is the contour list in both
    # layouts; index 0 would be the image on 3.x.
    contours = found[-2]

    candidate_boxes = (cv2.boundingRect(contour) for contour in contours)
    return [
        box for box in candidate_boxes
        if min_text_height_limit < box[3] < max_text_height_limit
    ]
def find_table_in_boxes(boxes, cell_threshold=10, min_columns=2):
    """
    Group boxes into table rows by their y coordinate.

    Boxes whose ``y // cell_threshold`` is equal land in the same row. Rows
    with fewer than *min_columns* boxes are dropped.

    :param boxes: iterable of ``(x, y, w, h)`` tuples.
    :param cell_threshold: height of the y bucket used to decide that two
        boxes are on the same row.
    :param min_columns: minimum number of boxes a row needs to be kept.
    :return: list of rows ordered by the y of each row's first box; each row
        is a list of boxes sorted as tuples (so primarily by x).
    """
    # Only the row grouping feeds the result. The former per-column dict was
    # never read (and was filled under the wrong key), so it is gone.
    rows = {}
    for box in boxes:
        row_key = box[1] // cell_threshold
        rows.setdefault(row_key, []).append(box)

    table_cells = [sorted(row) for row in rows.values() if len(row) >= min_columns]
    table_cells.sort(key=lambda row: row[0][1])
    return table_cells
def build_lines(table_cells):
    """
    Compute the grid lines of a table from its cells.

    :param table_cells: list of rows, each a list of ``(x, y, w, h)`` boxes
        (the shape ``find_table_in_boxes`` returns). ``None`` or an empty
        list yields ``([], [])``.
    :return: ``(hor_lines, ver_lines)``; every line is ``(x1, y1, x2, y2)``.
        There is one horizontal line at the top of each row plus a closing
        one at the bottom, and one vertical line at the left of each cell of
        the first row plus a closing one at the right.
    """
    if table_cells is None or not len(table_cells):
        return [], []

    first_row = table_cells[0]
    last_row = table_cells[-1]

    # Right edge: taken from the last cell of the row whose last cell is the
    # widest. Bottom edge: taken from the tallest cell of the last row.
    # NOTE(review): these pick by width/height, not by the largest x+w / y+h;
    # confirm that is the intended extent.
    widest_last_cell = max(table_cells, key=lambda row: row[-1][2])[-1]
    max_x = widest_last_cell[0] + widest_last_cell[2]
    tallest_cell = max(last_row, key=lambda cell: cell[3])
    max_y = tallest_cell[1] + tallest_cell[3]

    hor_lines = [(row[0][0], row[0][1], max_x, row[0][1]) for row in table_cells]
    ver_lines = [(cell[0], cell[1], cell[0], max_y) for cell in first_row]

    # Closing lines: right border starts at the y of the first row's last
    # cell; bottom border starts at the x of the first row's first cell.
    ver_lines.append((max_x, first_row[-1][1], max_x, max_y))
    hor_lines.append((first_row[0][0], max_y, max_x, max_y))
    return hor_lines, ver_lines
# if __name__ == "__main__": | |
# in_file = os.path.join("data", "page.jpg") | |
# pre_file = os.path.join("data", "pre.png") | |
# out_file = os.path.join("data", "out.png") | |
# img = cv2.imread(os.path.join(in_file)) | |
# pre_processed = pre_process_image(img, pre_file) | |
# text_boxes = find_text_boxes(pre_processed) | |
# cells = find_table_in_boxes(text_boxes) | |
# hor_lines, ver_lines = build_lines(cells) | |
# # Visualize the result | |
# vis = img.copy() | |
# # for box in text_boxes: | |
# # (x, y, w, h) = box | |
# # cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (0, 255, 0), 1) | |
# for line in hor_lines: | |
# [x1, y1, x2, y2] = line | |
# cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1) | |
# for line in ver_lines: | |
# [x1, y1, x2, y2] = line | |
# cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1) | |
# cv2.imwrite(out_file, vis) | |
# In[91]: | |
# Run the contour-based table detection on the chosen page and draw the
# resulting grid lines.
img = np.array(image)
pre_processed = pre_process_image(img, morph_size=(10, 10))
text_boxes = find_text_boxes(pre_processed, min_text_height_limit=18, max_text_height_limit=100)
# Rows are bucketed by y // 100; rows with fewer than 2 boxes are dropped.
cells = find_table_in_boxes(text_boxes,cell_threshold=100, min_columns=2)
hor_lines, ver_lines = build_lines(cells)
# RGB copy of the page so the lines can be drawn in colour.
vis = gray2rgb(img.copy())
# for box in text_boxes:
#     (x, y, w, h) = box
#     cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (0, 255, 0), 1)
for line in hor_lines:
    [x1, y1, x2, y2] = line
    cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
for line in ver_lines:
    [x1, y1, x2, y2] = line
    cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
Image.fromarray(vis)
# In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment