Created
October 6, 2021 07:43
-
-
Save cmin764/7ddfada0d541ceaa91d724964403a0a3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env python3 | |
| import logging | |
| import sys | |
| from io import StringIO | |
| from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, \ | |
| PDFPageAggregator | |
| from pdfminer.image import ImageWriter | |
| from pdfminer.layout import LAParams | |
| from pdfminer.pdfdevice import TagExtractor | |
| from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
| from pdfminer.pdfpage import PDFPage | |
| from pdfminer.utils import open_filename | |
def iter_text_per_page(pdf_file, password='', page_numbers=None, maxpages=0,
                       caching=True, codec='utf-8', laparams=None):
    """Yield the text extracted from a PDF file, one page at a time.

    This is a generator: the PDF stays open while iteration is in progress
    and is closed once the generator is exhausted or closed.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a generator of ``(index, text)`` tuples. ``index`` is a 1-based
        running count of the pages yielded so far and ``text`` is the text
        extracted from that page. NOTE(review): when ``page_numbers`` selects
        a subset, ``index`` presumably differs from the page's position in
        the document -- confirm against ``PDFPage.get_pages``.
    """
    if laparams is None:
        laparams = LAParams()

    with open_filename(pdf_file, "rb") as fp:
        # One resource manager is shared by every page of the document.
        rsrcmgr = PDFResourceManager(caching=caching)
        pages = PDFPage.get_pages(
            fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for idx, page in enumerate(pages, start=1):
            # A fresh buffer and converter per page keeps each page's text
            # separate from the others.
            with StringIO() as output_string:
                device = TextConverter(rsrcmgr, output_string, codec=codec,
                                       laparams=laparams)
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    interpreter.process_page(page)
                    page_text = output_string.getvalue()
                finally:
                    # Release the converter even if the page fails to parse.
                    device.close()
            # Yield after the buffer is closed so it is not held open while
            # the consumer works on the text.
            yield idx, page_text
def main():
    """Search a PDF for a text snippet and print each page that contains it.

    Expects two command-line arguments: the PDF file path and the text to
    search for. For every page whose extracted text contains the search
    text, prints the page count together with up to 128 characters before
    the first match and up to 128 characters starting at the match.

    :raises SystemExit: with a usage message when fewer than two arguments
        are supplied.
    """
    args = sys.argv[1:3]
    if len(args) != 2:
        # Exit with a readable usage line instead of an unpacking ValueError.
        sys.exit(f"Usage: {sys.argv[0]} PDF_FILE SEARCH_TEXT")
    pdf_file, search = args

    for count, page_text in iter_text_per_page(pdf_file):
        idx = page_text.find(search)
        if idx == -1:
            continue
        # Clamp the context window to the bounds of the page text.
        lo = max(0, idx - 128)
        hi = min(len(page_text), idx + 128)
        content = page_text[lo:hi]
        # Keep scanning after a hit so every matching page is reported.
        print(f"Found text at page {count}:\n{content}")


if __name__ == "__main__":
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Call it with:
% ./pdf2text-pages.py NASDAQ_TSLA_2019.pdf "Form 10-K"
Output: