Created
January 2, 2020 18:12
-
-
Save 84adam/396ac0952c1c7877fef9fd9a0b2f04e6 to your computer and use it in GitHub Desktop.
Extract text from a PDF given its URL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
| from pdfminer.converter import TextConverter | |
| from pdfminer.layout import LAParams | |
| from pdfminer.pdfpage import PDFPage | |
| from io import StringIO, BytesIO | |
def convert_pdf_to_txt(url, pages=None, timeout=30):
    """Download the PDF at *url* and return its extracted text.

    Args:
        url: Address of the PDF document; fetched with ``requests.get``.
        pages: Optional iterable of page numbers to extract. It is
            converted to a set and handed to ``PDFPage.get_pages`` as its
            page filter. A falsy value (``None`` or an empty collection)
            passes an empty set.
            NOTE(review): pdfminer is understood to treat an empty filter
            as "all pages" and the numbers as zero-based indices; confirm
            against the installed pdfminer version.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the caller forever.
            Pass ``None`` to wait indefinitely.

    Returns:
        The text written by pdfminer's ``TextConverter`` for the processed
        pages, as a single string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For other download failures, including
            timeouts.
    """
    pagenums = set(pages) if pages else set()

    response = requests.get(url, timeout=timeout)
    # Fail fast on an HTTP error instead of feeding an error page to the
    # PDF parser, which would surface as a confusing parse failure.
    response.raise_for_status()

    manager = PDFResourceManager()
    # The context managers and the try/finally release every buffer and the
    # converter even when pdfminer raises part-way through the document.
    with StringIO() as output, BytesIO(response.content) as infile:
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        finally:
            converter.close()
        # Read the text only after the converter is closed, and before the
        # buffer itself is closed by the enclosing ``with``.
        return output.getvalue()
if __name__ == '__main__':
    # Interactive entry point: ask for a PDF URL on stdin and print the
    # text extracted from that document.
    #
    # Example URL: https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf
    # Output:
    # >>> Dummy PDF file
    pdf_url = input("Enter URL of PDF from which to extract text: ")
    print(convert_pdf_to_txt(pdf_url))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment