Created
March 29, 2020 20:51
-
-
Save Ahanmr/f8ecfb6278d1f235d0efd72861713895 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TableFinder(object):
    """
    Given a PDF page, finds table structures.

    Construction runs the whole pipeline and stores each stage on the
    instance: ``edges`` (from ``get_edges``), ``intersections``, ``cells``
    and finally ``tables`` (a list of ``Table`` objects).
    """

    def __init__(self, page, settings=None):
        """
        Validate ``settings``, resolve tolerance fallbacks and find tables.

        Parameters:
            page: the page to analyse. This class reads ``page.bbox`` and
                ``page.edges`` and calls ``page.extract_words(...)``.
            settings: optional mapping of overrides applied on top of
                ``DEFAULT_TABLE_SETTINGS``. ``None`` (the default) means
                "no overrides".

        Raises:
            ValueError: if ``settings`` contains a key that is not present
                in ``DEFAULT_TABLE_SETTINGS``, or if ``get_edges`` rejects
                the resolved strategy settings.
        """
        # A None default avoids sharing one mutable dict between all calls.
        if settings is None:
            settings = {}

        for key in settings:
            if key not in DEFAULT_TABLE_SETTINGS:
                raise ValueError(
                    "Unrecognized table setting: '{0}'".format(key)
                )

        self.page = page
        # Copy the defaults so that per-instance overrides never leak back
        # into the module-level dict.
        self.settings = dict(DEFAULT_TABLE_SETTINGS)
        self.settings.update(settings)

        # Axis-specific tolerances left as None inherit the generic one.
        for var, fallback in [
            ("text_x_tolerance", "text_tolerance"),
            ("text_y_tolerance", "text_tolerance"),
            ("intersection_x_tolerance", "intersection_tolerance"),
            ("intersection_y_tolerance", "intersection_tolerance"),
        ]:
            if self.settings[var] is None:
                self.settings[var] = self.settings[fallback]

        self.edges = self.get_edges()
        self.intersections = edges_to_intersections(
            self.edges,
            self.settings["intersection_x_tolerance"],
            self.settings["intersection_y_tolerance"],
        )
        self.cells = intersections_to_cells(self.intersections)
        self.tables = [
            Table(self.page, t) for t in cells_to_tables(self.cells)
        ]

    @staticmethod
    def _explicit_vertical_edge(desc, page_bbox):
        """Build a vertical edge dict from an explicit line description.

        ``desc`` is either a dict (``x0``/``x1`` fall back to its ``x``
        entry; ``top``/``bottom`` fall back to ``page_bbox[1]`` and
        ``page_bbox[3]``) or a bare x-coordinate, in which case the edge
        spans from ``page_bbox[1]`` to ``page_bbox[3]``.
        """
        if isinstance(desc, dict):
            edge = {
                "x0": desc.get("x0", desc.get("x")),
                "x1": desc.get("x1", desc.get("x")),
                "top": desc.get("top", page_bbox[1]),
                "bottom": desc.get("bottom", page_bbox[3]),
            }
        else:
            edge = {
                "x0": desc,
                "x1": desc,
                "top": page_bbox[1],
                "bottom": page_bbox[3],
            }
        edge["height"] = edge["bottom"] - edge["top"]
        edge["orientation"] = "v"
        return edge

    @staticmethod
    def _explicit_horizontal_edge(desc, page_bbox):
        """Build a horizontal edge dict from an explicit line description.

        ``desc`` is either a dict (``x0``/``x1`` fall back to
        ``page_bbox[0]`` and ``page_bbox[2]``; ``top`` and ``bottom`` fall
        back to each other) or a bare y-coordinate, in which case the edge
        spans from ``page_bbox[0]`` to ``page_bbox[2]``.
        """
        if isinstance(desc, dict):
            edge = {
                "x0": desc.get("x0", page_bbox[0]),
                "x1": desc.get("x1", page_bbox[2]),
                "top": desc.get("top", desc.get("bottom")),
                "bottom": desc.get("bottom", desc.get("top")),
            }
        else:
            edge = {
                "x0": page_bbox[0],
                "x1": page_bbox[2],
                "top": desc,
                "bottom": desc,
            }
        edge["width"] = edge["x1"] - edge["x0"]
        edge["orientation"] = "h"
        return edge

    def get_edges(self):
        """
        Collect the vertical and horizontal edges used to find tables.

        For each orientation the base edges come from the configured
        strategy ("lines", "lines_strict", "text" or "explicit"); any
        explicitly configured lines are always appended. The combined
        edges are passed through ``merge_edges`` when either
        ``snap_tolerance`` or ``join_tolerance`` is positive, and finally
        filtered by ``edge_min_length``.

        Returns:
            The result of ``utils.filter_edges`` on the combined edges.

        Raises:
            ValueError: if a strategy is not in ``TABLE_STRATEGIES``, or
                if a strategy is "explicit" and fewer than two explicit
                lines are configured for that orientation.
        """
        settings = self.settings

        for name in ("vertical", "horizontal"):
            strategy_key = name + "_strategy"
            lines_key = "explicit_" + name + "_lines"
            strategy = settings[strategy_key]
            if strategy not in TABLE_STRATEGIES:
                raise ValueError("{0} must be one of {{{1}}}".format(
                    strategy_key,
                    ",".join(TABLE_STRATEGIES)
                ))
            if strategy == "explicit" and len(settings[lines_key]) < 2:
                # Name the actual setting (e.g. "vertical_strategy"); the
                # strategy *value* is always "explicit" on this path.
                raise ValueError(
                    "If {0} == 'explicit', {1} must be specified as "
                    "list/tuple of two or more floats/ints.".format(
                        strategy_key,
                        lines_key,
                    )
                )

        v_strat = settings["vertical_strategy"]
        h_strat = settings["horizontal_strategy"]

        # Words are only needed (and only extracted) for the "text" strategy.
        words = None
        if v_strat == "text" or h_strat == "text":
            xt = settings["text_x_tolerance"]
            if xt is None:
                xt = settings["text_tolerance"]
            yt = settings["text_y_tolerance"]
            if yt is None:
                yt = settings["text_tolerance"]
            words = self.page.extract_words(
                x_tolerance=xt,
                y_tolerance=yt,
                keep_blank_chars=settings["keep_blank_chars"]
            )

        v_explicit = [
            self._explicit_vertical_edge(desc, self.page.bbox)
            for desc in settings["explicit_vertical_lines"]
        ]

        if v_strat == "lines":
            v_base = utils.filter_edges(self.page.edges, "v")
        elif v_strat == "lines_strict":
            v_base = utils.filter_edges(self.page.edges, "v",
                edge_type="lines")
        elif v_strat == "text":
            v_base = words_to_edges_v(words,
                word_threshold=settings["min_words_vertical"])
        elif v_strat == "explicit":
            v_base = []

        v = v_base + v_explicit

        h_explicit = [
            self._explicit_horizontal_edge(desc, self.page.bbox)
            for desc in settings["explicit_horizontal_lines"]
        ]

        if h_strat == "lines":
            h_base = utils.filter_edges(self.page.edges, "h")
        elif h_strat == "lines_strict":
            h_base = utils.filter_edges(self.page.edges, "h",
                edge_type="lines")
        elif h_strat == "text":
            h_base = words_to_edges_h(words,
                word_threshold=settings["min_words_horizontal"])
        elif h_strat == "explicit":
            h_base = []

        h = h_base + h_explicit

        edges = list(v) + list(h)

        if settings["snap_tolerance"] > 0 or settings["join_tolerance"] > 0:
            edges = merge_edges(edges,
                snap_tolerance=settings["snap_tolerance"],
                join_tolerance=settings["join_tolerance"],
            )
        return utils.filter_edges(edges,
            min_length=settings["edge_min_length"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment