Last active
October 10, 2023 13:25
-
-
Save josemarcosrf/5e40936fa4fc6c1490c41cc36e4dd263 to your computer and use it in GitHub Desktop.
Quick-and-dirty implementation of text and bounding-box extraction from PDFs using pdfminer.six==20191110
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
import fire | |
from pdfminer.converter import PDFPageAggregator | |
from pdfminer.layout import LAParams | |
from pdfminer.layout import LTChar | |
from pdfminer.layout import LTTextBox | |
from pdfminer.layout import LTTextLine | |
from pdfminer.pdfinterp import PDFPageInterpreter | |
from pdfminer.pdfinterp import PDFResourceManager | |
from pdfminer.pdfpage import PDFPage | |
from rich import print as rprint | |
def get_char_bboxes(pdf_path: str):
    """Collect the bounding box and text of every character in a PDF.

    Every page is run through pdfminer's layout analysis. For each
    ``LTTextBox`` in the resulting layout, the children of each text line
    are scanned and every ``LTChar`` is recorded; line children that are not
    ``LTChar`` instances are skipped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``defaultdict(list)`` mapping the zero-based page index to a list
        of ``(bbox, text)`` tuples, in the order the layout yields them.
        A page on which no character was found gets no entry.
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    per_page_tboxes = defaultdict(list)

    # The context manager closes the file even if parsing raises; the file
    # stays open for the whole loop because pages are read while iterating.
    with open(pdf_path, "rb") as fp:
        for i, page in enumerate(PDFPage.get_pages(fp)):
            print("Processing next page...")
            interpreter.process_page(page)
            layout = device.get_result()
            for lobj in layout:
                if not isinstance(lobj, LTTextBox):
                    continue
                for tline in lobj:
                    for tchar in tline:
                        if isinstance(tchar, LTChar):
                            per_page_tboxes[i].append(
                                (tchar.bbox, tchar.get_text())
                            )
    return per_page_tboxes
def _split_into_words(tbox_list):
    """Split one line of ``(bbox, char)`` tuples into words with their boxes.

    Characters are accumulated until a ``" "`` character is met, which ends
    the current word. The word still pending when the input is exhausted is
    emitted too, so a line that does not end with a space keeps its last word.

    Args:
        tbox_list: Iterable of ``((x0, y0, x1, y1), char)`` tuples.

    Returns:
        A list of ``(word, (x0, y0, x1, y1))`` tuples, where the box is the
        smallest rectangle enclosing every character of the word.
    """
    words = []
    chars = []
    bb = []

    def flush():
        # Emit the pending word (if any) and reset the accumulators.
        if bb:
            x0s, y0s, x1s, y1s = zip(*bb)
            enclosing = (min(x0s), min(y0s), max(x1s), max(y1s))
            words.append(("".join(chars), enclosing))
        chars.clear()
        bb.clear()

    for tbox in tbox_list:
        try:
            (x0, y0, x1, y1), c = tbox
        except (TypeError, ValueError) as e:
            # Best effort: report the malformed entry and keep going.
            w = "".join(chars)
            rprint(f"[red] Error: {e}[/red]")
            rprint(f"[red] w: {w} | bb: {bb}[/red]")
            continue
        if c != " ":
            chars.append(c)
            bb.append((x0, y0, x1, y1))
        else:
            flush()

    # Without this final flush the last word of the line would be lost
    # whenever the line does not end with a space character.
    flush()
    return words


def textract(pdf_path: str):
    """Print the words of a PDF, line by line, with their bounding boxes.

    Characters returned by ``get_char_bboxes`` are grouped per page by their
    exact ``y0`` coordinate; each group is treated as one line, split into
    words on ``" "`` characters, and printed with ``rich``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        None. The results are only printed.
    """
    per_page_bboxes = get_char_bboxes(pdf_path)
    for p_idx, page_bboxes in per_page_bboxes.items():
        # 1. Group by vertical position (exact y0 match, insertion-ordered)
        groups = defaultdict(list)
        for tbox in page_bboxes:
            (_, y0, _, _), _ = tbox
            groups[y0].append(tbox)

        # 2. Split by space and get the enclosing box
        for i, tbox_list in enumerate(groups.values()):
            tboxes = _split_into_words(tbox_list)
            rprint(f"PAGE {p_idx} - LINE: {i}")
            rprint(tboxes)
if __name__ == "__main__":
    # Pinned dependencies this script was written against:
    #   fire==0.5.0
    #   pdfminer.six==20191110
    #   rich==13.6.0
    # Expose `textract` on the command line as the `pdf` sub-command.
    commands = {"pdf": textract}
    fire.Fire(commands)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment