cmin764 · October 6, 2021 07:43 · cmin764 · Oct 6, 2021
diff --git a/pdf2text-pages.py b/pdf2text-pages.py
 #! /usr/bin/env python3


 import logging
 import sys
 from io import StringIO

 from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, \
    PDFPageAggregator
 from pdfminer.image import ImageWriter
 from pdfminer.layout import LAParams
 from pdfminer.pdfdevice import TagExtractor
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 from pdfminer.pdfpage import PDFPage
 from pdfminer.utils import open_filename


 def iter_text_per_page(pdf_file, password='', page_numbers=None, maxpages=0,
                        caching=True, codec='utf-8', laparams=None):
     """Yield the text of a PDF file one page at a time.

     :param pdf_file: Either a file path or a file-like object for the PDF file
         to be worked on.
     :param password: For encrypted PDFs, the password to decrypt.
     :param page_numbers: List of zero-indexed page numbers to extract.
     :param maxpages: The maximum number of pages to parse
     :param caching: If resources should be cached
     :param codec: Text decoding codec
     :param laparams: An LAParams object from pdfminer.layout. If None, a
         default-constructed ``LAParams()`` is used.
     :return: a generator of ``(index, text)`` tuples. ``index`` is a running
         count of the pages yielded, starting at 1, and ``text`` is the text
         extracted from that page alone. NOTE(review): when ``page_numbers``
         selects a subset, ``index`` is presumably not the page's position
         in the document -- confirm against callers.
     """
     if laparams is None:
         laparams = LAParams()

     with open_filename(pdf_file, "rb") as fp:
         rsrcmgr = PDFResourceManager(caching=caching)
         pages = PDFPage.get_pages(
             fp,
             page_numbers,
             maxpages=maxpages,
             password=password,
             caching=caching,
         )
         for idx, page in enumerate(pages, start=1):
             # A fresh buffer and device per page keeps each page's text
             # separate instead of accumulating the whole document.
             with StringIO() as output_string:
                 device = TextConverter(rsrcmgr, output_string, codec=codec,
                                        laparams=laparams)
                 interpreter = PDFPageInterpreter(rsrcmgr, device)
                 interpreter.process_page(page)
                 yield idx, output_string.getvalue()


 def main():
     """Search a PDF page by page for a string and print each hit.

     Takes two command-line arguments: the PDF file and the text to search
     for. For every page whose text contains the search string, prints the
     page index and an excerpt running from up to 128 characters before the
     start of the first match to up to 128 characters after that start.

     Exits with a usage message when fewer than two arguments are given.
     """
     if len(sys.argv) < 3:
         # Fail with a readable message instead of an unpacking ValueError.
         prog = sys.argv[0] if sys.argv else "pdf2text-pages.py"
         sys.exit(f"Usage: {prog} <pdf_file> <search_text>")

     pdf_file, search = sys.argv[1:3]
     for count, page_text in iter_text_per_page(pdf_file):
         idx = page_text.find(search)
         if idx == -1:
             continue
         lo = max(0, idx - 128)
         hi = min(len(page_text), idx + 128)
         content = page_text[lo:hi]
         print(f"Found text at page {count}:\n{content}")


 # Run the search only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
	#! /usr/bin/env python3


	import logging
	import sys
	from io import StringIO

	from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, \
	PDFPageAggregator
	from pdfminer.image import ImageWriter
	from pdfminer.layout import LAParams
	from pdfminer.pdfdevice import TagExtractor
	from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
	from pdfminer.pdfpage import PDFPage
	from pdfminer.utils import open_filename


	def iter_text_per_page(pdf_file, password='', page_numbers=None, maxpages=0,
	                       caching=True, codec='utf-8', laparams=None):
	    """Yield the text of a PDF file one page at a time.

	    :param pdf_file: Either a file path or a file-like object for the PDF
	        file to be worked on.
	    :param password: For encrypted PDFs, the password to decrypt.
	    :param page_numbers: List of zero-indexed page numbers to extract.
	    :param maxpages: The maximum number of pages to parse
	    :param caching: If resources should be cached
	    :param codec: Text decoding codec
	    :param laparams: An LAParams object from pdfminer.layout. If None, a
	        default-constructed ``LAParams()`` is used.
	    :return: a generator of ``(index, text)`` tuples. ``index`` is a running
	        count of the pages yielded, starting at 1, and ``text`` is the text
	        extracted from that page alone. NOTE(review): when ``page_numbers``
	        selects a subset, ``index`` is presumably not the page's position
	        in the document -- confirm against callers.
	    """
	    if laparams is None:
	        laparams = LAParams()

	    with open_filename(pdf_file, "rb") as fp:
	        rsrcmgr = PDFResourceManager(caching=caching)
	        pages = PDFPage.get_pages(
	            fp,
	            page_numbers,
	            maxpages=maxpages,
	            password=password,
	            caching=caching,
	        )
	        for idx, page in enumerate(pages, start=1):
	            # A fresh buffer and device per page keeps each page's text
	            # separate instead of accumulating the whole document.
	            with StringIO() as output_string:
	                device = TextConverter(rsrcmgr, output_string, codec=codec,
	                                       laparams=laparams)
	                interpreter = PDFPageInterpreter(rsrcmgr, device)
	                interpreter.process_page(page)
	                yield idx, output_string.getvalue()


	def main():
	    """Search a PDF page by page for a string and print each hit.

	    Takes two command-line arguments: the PDF file and the text to search
	    for. For every page whose text contains the search string, prints the
	    page index and an excerpt running from up to 128 characters before the
	    start of the first match to up to 128 characters after that start.

	    Exits with a usage message when fewer than two arguments are given.
	    """
	    if len(sys.argv) < 3:
	        # Fail with a readable message instead of an unpacking ValueError.
	        prog = sys.argv[0] if sys.argv else "pdf2text-pages.py"
	        sys.exit(f"Usage: {prog} <pdf_file> <search_text>")

	    pdf_file, search = sys.argv[1:3]
	    for count, page_text in iter_text_per_page(pdf_file):
	        idx = page_text.find(search)
	        if idx == -1:
	            continue
	        lo = max(0, idx - 128)
	        hi = min(len(page_text), idx + 128)
	        content = page_text[lo:hi]
	        print(f"Found text at page {count}:\n{content}")


	# Run the search only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()
No results found