Skip to content

Instantly share code, notes, and snippets.

@84adam
Created January 2, 2020 17:43
Show Gist options
  • Select an option

  • Save 84adam/e76a1b4590acc32cfe26bc9b549e3c7e to your computer and use it in GitHub Desktop.

Select an option

Save 84adam/e76a1b4590acc32cfe26bc9b549e3c7e to your computer and use it in GitHub Desktop.
Extract text from PDF files
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import os
def convert_pdf_to_txt(path, pages=None):
    """Extract the text of a PDF file and return it as a single string.

    Args:
        path: Filesystem path of the PDF to read; opened in binary mode.
        pages: Optional iterable of page numbers to extract. It is turned
            into a set and handed to ``PDFPage.get_pages``. A falsy value
            (``None`` or an empty iterable) becomes an empty set.
            NOTE(review): pdfminer appears to treat an empty set as "all
            pages" and to expect zero-based page numbers -- confirm against
            the installed pdfminer version.

    Returns:
        The text written by pdfminer's ``TextConverter`` for the processed
        pages.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()

    # The context managers and the try/finally guarantee that the buffer,
    # the converter and the input file are released even when parsing fails
    # part-way through a damaged or encrypted document.
    with StringIO() as output:
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer only after the converter is closed, and before the
        # enclosing `with` closes the StringIO itself.
        return output.getvalue()
if __name__ == '__main__':
    # Script entry point: prompt for a PDF path on stdin, then write the
    # extracted text to stdout.
    pdf_path = input("Enter name of PDF file from which to extract text: ")
    print(convert_pdf_to_txt(pdf_path))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment