-
-
Save Sigmame/5102972757085e313252bfa1e2bb83be to your computer and use it in GitHub Desktop.
Extract data from PDF tables using Python (PIL), ImageMagick, and Tesseract
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reference: http://craiget.com/extracting-table-data-from-pdfs-with-ocr/
import Image, ImageOps | |
import subprocess, sys, os, glob | |
# Run-length thresholds, in pixels. A row (H_THRESH) or column (V_THRESH) is
# treated as a table line only when its run of adjacent black pixels is
# strictly greater than the threshold.
H_THRESH = 300
V_THRESH = 300
def get_hlines(pix, w, h, thresh=None):
    """Find image rows that contain a long horizontal run of black pixels.

    Each of the ``h`` rows is scanned left to right. A row qualifies when its
    longest unbroken run of pure-black ``(0, 0, 0)`` pixels is strictly
    longer than ``thresh``.

    Args:
        pix: Pixel accessor indexed as ``pix[x, y]``; each pixel is compared
            against the tuple ``(0, 0, 0)``.
        w: Number of columns to scan.
        h: Number of rows to scan.
        thresh: Run length that must be exceeded. Defaults to the
            module-level ``H_THRESH`` when omitted.

    Returns:
        A list of ``(x1, y, x2, y)`` tuples, one per qualifying row, where
        ``x1`` and ``x2`` are the first and last black pixel found anywhere
        in that row (not only inside the longest run).
    """
    if thresh is None:
        thresh = H_THRESH
    hlines = []
    for y in range(h):
        first = last = None
        current = longest = 0
        for x in range(w):
            if pix[x, y] == (0, 0, 0):
                current += 1
                # Compare against None: column 0 is a valid but falsy start.
                if first is None:
                    first = x
                last = x
            else:
                longest = max(longest, current)
                current = 0
        # A run that touches the right edge never reaches the else branch,
        # so fold the still-open run in here.
        longest = max(longest, current)
        if longest > thresh:
            hlines.append((first, y, last, y))
    return hlines
def get_vlines(pix, w, h, thresh=None):
    """Find image columns that contain a long vertical run of black pixels.

    Each of the ``w`` columns is scanned top to bottom. A column qualifies
    when its longest unbroken run of pure-black ``(0, 0, 0)`` pixels is
    strictly longer than ``thresh``.

    Args:
        pix: Pixel accessor indexed as ``pix[x, y]``; each pixel is compared
            against the tuple ``(0, 0, 0)``.
        w: Number of columns to scan.
        h: Number of rows to scan.
        thresh: Run length that must be exceeded. Defaults to the
            module-level ``V_THRESH`` when omitted.

    Returns:
        A list of ``(x, y1, x, y2)`` tuples, one per qualifying column, where
        ``y1`` and ``y2`` are the first and last black pixel found anywhere
        in that column (not only inside the longest run).
    """
    if thresh is None:
        thresh = V_THRESH
    vlines = []
    for x in range(w):
        first = last = None
        current = longest = 0
        for y in range(h):
            if pix[x, y] == (0, 0, 0):
                current += 1
                # Compare against None: row 0 is a valid but falsy start.
                if first is None:
                    first = y
                last = y
            else:
                longest = max(longest, current)
                current = 0
        # A run that touches the bottom edge never reaches the else branch,
        # so fold the still-open run in here.
        longest = max(longest, current)
        if longest > thresh:
            vlines.append((x, first, x, last))
    return vlines
def get_cols(vlines):
    """Derive column boxes from an ordered list of vertical lines.

    Neighbouring lines whose x coordinates differ by more than one pixel
    bound a column. Each box is ``(x1, y1, x2, y2)``: the x and start y of
    the left line, then the x and end y of the right line.
    """
    return [
        (left[0], left[1], right[2], right[3])
        for left, right in zip(vlines, vlines[1:])
        if right[0] - left[0] > 1
    ]
def get_rows(hlines):
    """Derive row boxes from an ordered list of horizontal lines.

    Neighbouring lines whose y coordinates differ by more than one pixel
    bound a row. Each box is ``(x1, y1, x2, y2)``: the start x and y of the
    upper line, then the end x and y of the lower line.
    """
    return [
        (upper[0], upper[1], lower[2], lower[3])
        for upper, lower in zip(hlines, hlines[1:])
        if lower[1] - upper[3] > 1
    ]
def get_cells(rows, cols):
    """Intersect row and column boxes into per-cell bounding boxes.

    Returns a nested dict ``cells[row_index][col_index]`` holding
    ``(x1, y1, x2, y2)``, where the x values come from the column box and
    the y values come from the row box.
    """
    return {
        r: {
            c: (col_box[0], row_box[1], col_box[2], row_box[3])
            for c, col_box in enumerate(cols)
        }
        for r, row_box in enumerate(rows)
    }
def ocr_cell(im, cells, x, y):
    """Return the first line of OCR text found in one table cell.

    The cell is cropped from ``im``, converted to grayscale, thresholded to
    black/white, trimmed of leading/trailing non-background pixels along its
    diagonal, saved as ``working/<x>-<y>.tif`` and passed to the
    ``tesseract`` command-line tool, which writes ``working/<x>-<y>.txt``.

    Args:
        im: Image object providing ``crop``; presumably a PIL image.
        cells: Nested mapping ``cells[x][y] -> (x1, y1, x2, y2)`` crop box.
        x: First-level key into ``cells`` (callers pass the row index).
        y: Second-level key into ``cells`` (callers pass the column index).

    Returns:
        The first line of Tesseract's output with surrounding whitespace
        stripped, or ``""`` when the output file is empty.
    """
    fbase = "working/%d-%d" % (x, y)
    ftif = "%s.tif" % fbase
    ftxt = "%s.txt" % fbase

    # extract cell from whole image, grayscale (1-color channel), monochrome
    region = im.crop(cells[x][y])
    region = ImageOps.grayscale(region)
    region = region.point(lambda p: p > 200 and 255)

    # determine background color (most used color)
    histo = region.histogram()
    bgcolor = 0 if histo[0] > histo[255] else 255

    # trim borders by walking the diagonal inward from both corners until a
    # background pixel is found; the bounds checks stop the walk from running
    # off the image when no background pixel lies on the diagonal
    pix = region.load()
    x1, y1 = 0, 0
    x2, y2 = region.size
    x2, y2 = x2 - 1, y2 - 1
    while x1 < x2 and y1 < y2 and pix[x1, y1] != bgcolor:
        x1 += 1
        y1 += 1
    while x2 > x1 and y2 > y1 and pix[x2, y2] != bgcolor:
        x2 -= 1
        y2 -= 1

    # save as TIFF and extract text with Tesseract OCR
    trimmed = region.crop((x1, y1, x2, y2))
    trimmed.save(ftif, "TIFF")
    # Argument list instead of a shell string; stderr goes to the null device
    # because an unread PIPE can block the child once the pipe buffer fills.
    with open(os.devnull, "w") as devnull:
        subprocess.call(["tesseract", ftif, fbase], stderr=devnull)

    with open(ftxt) as handle:
        lines = [line.strip() for line in handle]
    # An empty cell produces an empty file; report it as empty text.
    return lines[0] if lines else ""
def get_image_data(filename):
    """Return OCR text as ``data[row][col]`` for a spreadsheet-like image.

    Detects horizontal and vertical ruling lines, turns them into row and
    column boxes, and OCRs every resulting cell. Counts of lines, rows and
    columns are reported on stderr along the way.
    """
    log = sys.stderr.write

    im = Image.open(filename)
    pix = im.load()
    width, height = im.size

    hlines = get_hlines(pix, width, height)
    log("%s: hlines: %d\n" % (filename, len(hlines)))
    vlines = get_vlines(pix, width, height)
    log("%s: vlines: %d\n" % (filename, len(vlines)))

    rows = get_rows(hlines)
    log("%s: rows: %d\n" % (filename, len(rows)))
    cols = get_cols(vlines)
    log("%s: cols: %d\n" % (filename, len(cols)))

    cells = get_cells(rows, cols)

    # OCR cells row by row, left to right
    data = []
    for r, _ in enumerate(rows):
        line = []
        for c, _ in enumerate(cols):
            line.append(ocr_cell(im, cells, r, c))
        data.append(line)
    return data
def split_pdf(filename):
    """Split a PDF into one PNG per page and return the PNG paths in page order.

    Runs ImageMagick's ``convert`` at 600 dpi, writing
    ``working/<name>-<page>.png`` where ``<name>`` is the PDF's base name
    without directory or extension. The ``working`` directory is created if
    it does not exist.

    Args:
        filename: Path to the PDF file.

    Returns:
        Paths of the generated page images, sorted by numeric page index.
        The list is empty when ``convert`` produced nothing.
    """
    # Base name only: a directory component would otherwise leak into the
    # output path, and slicing off four characters assumes a ".pdf" suffix.
    prefix = os.path.splitext(os.path.basename(filename))[0]
    if not os.path.isdir("working"):
        os.makedirs("working")

    target = os.path.join("working", "%s-%%d.png" % prefix)
    # Argument list, no shell: file names with spaces or shell
    # metacharacters are passed through untouched.
    try:
        status = subprocess.call(
            ["convert", "-density", "600", filename, target])
    except OSError as err:
        sys.stderr.write("convert could not be run: %s\n" % err)
    else:
        if status != 0:
            sys.stderr.write("convert exited with status %d\n" % status)

    # Collect "<prefix>-<n>.png" and order by n, so page 10 follows page 9
    # rather than page 1; directory listing order is not guaranteed.
    head = prefix + "-"
    tail = ".png"
    pages = []
    for name in os.listdir("working"):
        if name.startswith(head) and name.endswith(tail):
            number = name[len(head):-len(tail)]
            if number.isdigit():
                pages.append((int(number), os.path.join("working", name)))
    pages.sort()
    return [path for _, path in pages]
def _remove_files(pattern):
    """Delete every regular file matching the glob ``pattern``."""
    for path in glob.glob(pattern):
        if os.path.isfile(path):
            try:
                os.remove(path)
            except OSError as err:
                sys.stderr.write("could not remove %s: %s\n" % (path, err))


def extract_pdf(filename):
    """Extract table data from every page of a PDF.

    The PDF is split into page images, each page is OCRed cell by cell, and
    the rows of all pages are concatenated in page order. Temporary files
    under ``working/`` are removed afterwards, including when a page fails.

    Args:
        filename: Path to the PDF file.

    Returns:
        A list of rows, each row being a list of cell strings.
    """
    pngfiles = split_pdf(filename)
    sys.stderr.write("Pages: %d\n" % len(pngfiles))

    # extract table data from each page
    data = []
    try:
        for pngfile in pngfiles:
            try:
                data.extend(get_image_data(pngfile))
            finally:
                # remove temp files for this page
                _remove_files("working/*.tif")
                _remove_files("working/*.txt")
    finally:
        # remove split pages (and any other files left in working/)
        _remove_files("working/*")
    return data
if __name__ == '__main__':
    # Command-line entry point: OCR the tables in one PDF and print them as
    # tab-separated rows on stdout.
    if len(sys.argv) != 2:
        # Usage errors go to stderr with a non-zero status so that shell
        # pipelines can detect the failure.
        sys.stderr.write("Usage: ctocr.py FILENAME\n")
        sys.exit(1)

    # split target pdf into pages
    filename = sys.argv[1]
    data = extract_pdf(filename)
    for row in data:
        # sys.stdout.write works under both Python 2 and Python 3
        sys.stdout.write("\t".join(row) + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment