Skip to content

Instantly share code, notes, and snippets.

@skhani
Created June 23, 2020 19:43
Show Gist options
  • Save skhani/c2ceaa8c9ed7c980ac8571a56ac656a4 to your computer and use it in GitHub Desktop.
Save skhani/c2ceaa8c9ed7c980ac8571a56ac656a4 to your computer and use it in GitHub Desktop.
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
def pdf_to_text(path):
    """Extract the text of every page of a PDF file.

    Each page yielded by ``PDFPage.get_pages`` is run through a
    ``PDFPageInterpreter`` whose output device is a ``TextConverter``
    writing into an in-memory ``BytesIO`` buffer.

    Args:
        path: Filesystem path of the PDF; it is opened in binary mode.

    Returns:
        The accumulated contents of the buffer as ``bytes`` (the value of
        ``BytesIO.getvalue()``).
        NOTE(review): the byte encoding is whatever ``TextConverter`` uses
        by default -- presumably UTF-8; confirm against the installed
        pdfminer version before decoding.

    Raises:
        OSError: If ``path`` cannot be opened.
        Any exception raised by pdfminer while parsing is propagated
        unchanged; the file, buffer and device are still closed.
    """
    manager = PDFResourceManager()
    layout = LAParams(all_texts=True)

    # The context managers guarantee the input file and the buffer are
    # closed even when pdfminer raises part-way through a document.
    with open(path, 'rb') as pdf_file, BytesIO() as buffer:
        device = TextConverter(manager, buffer, laparams=layout)
        try:
            interpreter = PDFPageInterpreter(manager, device)
            for page in PDFPage.get_pages(pdf_file, check_extractable=True):
                interpreter.process_page(page)
            # Snapshot the bytes before the buffer is closed on exit.
            return buffer.getvalue()
        finally:
            device.close()
if __name__ == "__main__":
    # Script entry point: extract the text of the placeholder file and
    # write it to standard output.
    print(pdf_to_text("yourfile.pdf"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment