Skip to content

Instantly share code, notes, and snippets.

@FANGOD
Forked from Sigmame/pdf_table_with Tesseract
Created September 6, 2022 04:11
Show Gist options
  • Save FANGOD/cc2dc3b306ed5eafc0cec641881dc836 to your computer and use it in GitHub Desktop.
Save FANGOD/cc2dc3b306ed5eafc0cec641881dc836 to your computer and use it in GitHub Desktop.
Extract data from a PDF table using Python (PIL), ImageMagick, and Tesseract
# Reference: http://craiget.com/extracting-table-data-from-pdfs-with-ocr/
import Image, ImageOps
import subprocess, sys, os, glob
# Minimum run of adjacent black pixels to call something a line.
# get_hlines compares a row's longest run against H_THRESH and get_vlines
# compares a column's longest run against V_THRESH; both use a strict ">".
H_THRESH = 300
V_THRESH = 300
def get_hlines(pix, w, h, thresh=None):
    """Find pixel rows that contain a long horizontal run of black pixels.

    Args:
        pix: pixel accessor indexed as ``pix[x, y]``; a pixel counts as
            black only when it equals the tuple ``(0, 0, 0)``.
        w: number of columns to scan.
        h: number of rows to scan.
        thresh: a row qualifies when its longest black run is strictly
            greater than this value; defaults to the module-level H_THRESH.

    Returns:
        A list of ``(x1, y, x2, y)`` tuples, one per qualifying row in
        top-to-bottom order, where x1 and x2 are the first and last black
        pixels found anywhere in that row.
    """
    if thresh is None:
        thresh = H_THRESH
    hlines = []
    for y in range(h):
        x1 = x2 = None
        black = 0
        run = 0
        for x in range(w):
            if pix[x, y] == (0, 0, 0):
                black += 1
                # Compare against None: column 0 is a valid start but falsy.
                if x1 is None:
                    x1 = x
                x2 = x
            else:
                run = max(run, black)
                black = 0
        # A run touching the right edge never hits the else branch above,
        # so fold the trailing count in before testing the threshold.
        run = max(run, black)
        if run > thresh:
            hlines.append((x1, y, x2, y))
    return hlines
def get_vlines(pix, w, h, thresh=None):
    """Find pixel columns that contain a long vertical run of black pixels.

    Args:
        pix: pixel accessor indexed as ``pix[x, y]``; a pixel counts as
            black only when it equals the tuple ``(0, 0, 0)``.
        w: number of columns to scan.
        h: number of rows to scan.
        thresh: a column qualifies when its longest black run is strictly
            greater than this value; defaults to the module-level V_THRESH.

    Returns:
        A list of ``(x, y1, x, y2)`` tuples, one per qualifying column in
        left-to-right order, where y1 and y2 are the first and last black
        pixels found anywhere in that column.
    """
    if thresh is None:
        thresh = V_THRESH
    vlines = []
    for x in range(w):
        y1 = y2 = None
        black = 0
        run = 0
        for y in range(h):
            if pix[x, y] == (0, 0, 0):
                black += 1
                # Compare against None: row 0 is a valid start but falsy.
                if y1 is None:
                    y1 = y
                y2 = y
            else:
                run = max(run, black)
                black = 0
        # A run touching the bottom edge never hits the else branch above,
        # so fold the trailing count in before testing the threshold.
        run = max(run, black)
        if run > thresh:
            vlines.append((x, y1, x, y2))
    return vlines
def get_cols(vlines):
    """Build column boxes from consecutive vertical lines.

    Each pair of neighbouring entries whose x positions differ by more than
    one pixel yields a box ``(left_x, left_y1, right_x, right_y2)``; pairs
    that are directly adjacent (the same thick line) are skipped.
    """
    neighbours = zip(vlines[:-1], vlines[1:])
    return [
        (left[0], left[1], right[2], right[3])
        for left, right in neighbours
        if right[0] - left[0] > 1
    ]
def get_rows(hlines):
    """Build row boxes from consecutive horizontal lines.

    Each pair of neighbouring entries where the lower line's y is more than
    one pixel below the upper line's y yields a box
    ``(upper_x1, upper_y, lower_x2, lower_y)``; directly adjacent pairs
    (the same thick line) are skipped.
    """
    neighbours = zip(hlines[:-1], hlines[1:])
    return [
        (upper[0], upper[1], lower[2], lower[3])
        for upper, lower in neighbours
        if lower[1] - upper[3] > 1
    ]
def get_cells(rows, cols):
    """Combine row and column boxes into a grid of cell boxes.

    Returns a nested dict where ``cells[i][j]`` is ``(x1, y1, x2, y2)``:
    the x extent comes from column ``j`` and the y extent from row ``i``.
    Every row index is present even when ``cols`` is empty.
    """
    return {
        i: {
            j: (col[0], row[1], col[2], row[3])
            for j, col in enumerate(cols)
        }
        for i, row in enumerate(rows)
    }
def ocr_cell(im, cells, x, y):
    """Return the first line of OCRed text found in one table cell.

    Args:
        im: page image; the cell is cut out of it with ``im.crop``.
        cells: nested mapping as built by get_cells; ``cells[x][y]`` is the
            ``(x1, y1, x2, y2)`` box to crop.
        x: first index into ``cells`` (the row index in get_cells' layout).
        y: second index into ``cells`` (the column index).

    Returns:
        The first line of Tesseract's text output with surrounding
        whitespace stripped, or ``""`` when the output file is empty.

    Side effects:
        Writes ``working/<x>-<y>.tif`` and lets Tesseract write
        ``working/<x>-<y>.txt``; the ``working`` directory must exist.

    Raises:
        OSError: if the ``tesseract`` executable cannot be started.
        IOError: if Tesseract did not produce the expected ``.txt`` file.
    """
    fbase = "working/%d-%d" % (x, y)
    ftif = "%s.tif" % fbase
    ftxt = "%s.txt" % fbase

    # Extract the cell, reduce to one channel, then threshold to pure
    # black/white so the histogram below has only two populated bins.
    region = im.crop(cells[x][y])
    region = ImageOps.grayscale(region)
    region = region.point(lambda p: 255 if p > 200 else 0)

    # Background colour is whichever of black/white is more common.
    histo = region.histogram()
    bgcolor = 0 if histo[0] > histo[255] else 255

    # Trim borders by walking both diagonals inward until a background
    # pixel is found. The bounds checks stop the walk on non-square cells
    # whose diagonal would otherwise leave the image.
    pix = region.load()
    x1, y1 = 0, 0
    x2, y2 = region.size
    x2, y2 = x2 - 1, y2 - 1
    while x1 < x2 and y1 < y2 and pix[x1, y1] != bgcolor:
        x1 += 1
        y1 += 1
    while x2 > x1 and y2 > y1 and pix[x2, y2] != bgcolor:
        x2 -= 1
        y2 -= 1

    # Save as TIFF and extract text with Tesseract OCR. An argument list
    # avoids shell quoting issues; stderr goes to the null device instead
    # of an unread pipe.
    trimmed = region.crop((x1, y1, x2, y2))
    trimmed.save(ftif, "TIFF")
    with open(os.devnull, "w") as devnull:
        subprocess.call(["tesseract", ftif, fbase], stderr=devnull)

    # readline() yields "" at end of file, so an empty result is returned
    # as "" rather than raising IndexError.
    with open(ftxt) as result:
        return result.readline().strip()
def get_image_data(filename):
    """Return OCRed table text as a list of rows, each a list of cell strings.

    Opens the image, detects horizontal and vertical rule lines, derives
    row/column boxes and the cell grid from them, then OCRs every cell.
    Counts of detected lines, rows and columns are reported on stderr.
    Line detection compares pixels with ``(0, 0, 0)``, so the image is
    presumably expected to load in RGB mode - TODO confirm.
    """
    page = Image.open(filename)
    pixels = page.load()
    page_w, page_h = page.size

    horizontal = get_hlines(pixels, page_w, page_h)
    sys.stderr.write("%s: hlines: %d\n" % (filename, len(horizontal)))
    vertical = get_vlines(pixels, page_w, page_h)
    sys.stderr.write("%s: vlines: %d\n" % (filename, len(vertical)))

    row_boxes = get_rows(horizontal)
    sys.stderr.write("%s: rows: %d\n" % (filename, len(row_boxes)))
    col_boxes = get_cols(vertical)
    sys.stderr.write("%s: cols: %d\n" % (filename, len(col_boxes)))

    grid = get_cells(row_boxes, col_boxes)
    n_rows = len(row_boxes)
    n_cols = len(col_boxes)

    table = []
    for r in range(n_rows):
        line = []
        for c in range(n_cols):
            line.append(ocr_cell(page, grid, r, c))
        table.append(line)
    return table
def _page_number(path):
    """Return the page index embedded in a 'NAME-<n>.png' path, or -1."""
    stem = os.path.splitext(os.path.basename(path))[0]
    digits = stem.rsplit("-", 1)[-1]
    return int(digits) if digits.isdigit() else -1


def split_pdf(filename, density=600):
    """Split a PDF into one PNG per page under ``working/``.

    Args:
        filename: path to the PDF; may include directories and any
            extension length.
        density: resolution passed to ImageMagick's ``-density`` option.

    Returns:
        The generated ``working/<name>-<n>.png`` paths, sorted by page
        number so that page 10 follows page 9.

    Raises:
        OSError: if the ``convert`` executable cannot be started.
    """
    # Use only the base name so a directory component in `filename` does
    # not leak into the output path, and splitext so the extension need
    # not be exactly four characters.
    prefix = os.path.splitext(os.path.basename(filename))[0]
    if not os.path.isdir("working"):
        os.makedirs("working")
    target = os.path.join("working", "%s-%%d.png" % prefix)

    # An argument list (no shell) keeps filenames with spaces or shell
    # metacharacters intact.
    status = subprocess.call(
        ["convert", "-density", str(density), filename, target]
    )
    if status != 0:
        sys.stderr.write(
            "convert exited with status %d for %s\n" % (status, filename)
        )

    # glob order is arbitrary; sort numerically to keep pages in order.
    pages = glob.glob(os.path.join("working", "%s-*.png" % prefix))
    return sorted(pages, key=lambda p: (_page_number(p), p))
def _remove_files(pattern):
    """Delete every regular file matching the glob pattern."""
    for path in glob.glob(pattern):
        if os.path.isfile(path):
            os.remove(path)


def extract_pdf(filename):
    """Extract table data from every page of a PDF.

    Splits the PDF into page images with split_pdf, runs get_image_data on
    each, and concatenates the resulting rows.

    Args:
        filename: path to the PDF file.

    Returns:
        A list of rows (each a list of cell strings) across all pages, in
        the order split_pdf returned the pages.

    Side effects:
        Writes the page count to stderr and deletes all regular files in
        ``working/`` before returning, even when a page fails.
    """
    pngfiles = split_pdf(filename)
    sys.stderr.write("Pages: %d\n" % len(pngfiles))

    data = []
    try:
        # extract table data from each page
        for pngfile in pngfiles:
            data.extend(get_image_data(pngfile))
            # remove temp files for this page
            _remove_files(os.path.join("working", "*.tif"))
            _remove_files(os.path.join("working", "*.txt"))
    finally:
        # remove split pages (and anything left over after a failure);
        # done in-process so it works without a POSIX `rm` and stays
        # quiet when nothing matches.
        _remove_files(os.path.join("working", "*"))
    return data
if __name__ == '__main__':
    # Script entry point: OCR the tables in one PDF and print them as
    # tab-separated rows on stdout.
    if len(sys.argv) != 2:
        # sys.exit with a string prints it to stderr and exits with
        # status 1, so callers can detect the usage error.
        sys.exit("Usage: ctocr.py FILENAME")
    # split target pdf into pages
    filename = sys.argv[1]
    data = extract_pdf(filename)
    for row in data:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("\t".join(row))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment