Created
April 26, 2023 00:22
-
-
Save turicas/22219672d78e9fb3dfd3a66bf2e0923c to your computer and use it in GitHub Desktop.
Plot PDF text/rect objects using rows + Pillow
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install pillow cached-property pdfminer.six https://github.com/turicas/rows/archive/develop.zip
#
# Plot the objects of a single PDF page twice: first exactly as yielded by
# the PDFMiner backend, then again with every TextObject replaced by the
# result of `split_object_lines`.
#
# Usage: script.py [--width W] [--height H] [--page-number N] pdf_filename
import argparse

from rows.plugins.plugin_pdf import (
    RectObject,
    TextObject,
    PDFMinerBackend,
    group_objects,
    YGroupsAlgorithm,
    plot_objects,
    split_object_lines,
)
from PIL.ImageShow import register, DisplayViewer

register(DisplayViewer(), 0)  # Use `display` command on Image.show()

parser = argparse.ArgumentParser()
# Without `type=`, argparse hands these over as strings.
# NOTE(review): assumes plot_objects expects integer pixel sizes -- confirm.
parser.add_argument("--width", type=int)
parser.add_argument("--height", type=int)
parser.add_argument("--page-number", type=int, default=1)
parser.add_argument("pdf_filename")
args = parser.parse_args()

# Pages are counted from 1, so anything lower can never match; fail before
# the document is opened.
if args.page_number < 1:
    parser.error("--page-number must be 1 or greater")

doc = PDFMinerBackend(args.pdf_filename)
for counter, page in enumerate(doc.objects(), start=1):
    if counter == args.page_number:
        selected_page = page
        break
else:
    # The loop finished without a match: the document has fewer pages than
    # requested. Exit with a clear message instead of passing None along.
    parser.error(
        "page {} not found in {}".format(args.page_number, args.pdf_filename)
    )

# First plot: the page objects as extracted.
img1 = plot_objects(selected_page, width=args.width, height=args.height)
img1.show()

# Second plot: text objects are expanded through split_object_lines, every
# other object is kept unchanged.
page_split = []
for obj in selected_page:
    if isinstance(obj, TextObject):
        page_split.extend(split_object_lines(obj))
    else:
        page_split.append(obj)
img2 = plot_objects(page_split, width=args.width, height=args.height)
img2.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment