Skip to content

Instantly share code, notes, and snippets.

@84adam
Created January 2, 2020 18:12
Show Gist options
  • Select an option

  • Save 84adam/396ac0952c1c7877fef9fd9a0b2f04e6 to your computer and use it in GitHub Desktop.

Select an option

Save 84adam/396ac0952c1c7877fef9fd9a0b2f04e6 to your computer and use it in GitHub Desktop.
Extract text from a PDF given its URL
import requests
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO, BytesIO
def convert_pdf_to_txt(url, pages=None, timeout=30):
    """Download the PDF at ``url`` and return the text extracted from it.

    Args:
        url: Address of the PDF; fetched with a single HTTP GET.
        pages: Optional iterable of page numbers to extract. It is converted
            to a set and handed to ``PDFPage.get_pages``. ``None`` or an
            empty iterable results in an empty set being passed
            (NOTE(review): pdfminer appears to treat an empty set as "all
            pages" and to number pages from 0 -- confirm against the
            pdfminer documentation).
        timeout: Seconds to wait for the server before giving up, forwarded
            to ``requests.get``. Pass ``None`` to wait indefinitely, which
            was the behaviour before this parameter existed.

    Returns:
        The text written by pdfminer's ``TextConverter`` for the selected
        pages, as a single string.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The download failed or timed out.
    """
    pagenums = set(pages) if pages else set()

    # Download first so a network failure never leaves parser objects open.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of feeding an HTML error page
    # to the PDF parser.
    response.raise_for_status()

    # The context managers close both buffers even if parsing raises.
    with BytesIO(response.content) as infile, StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        finally:
            converter.close()
        # Read the text while ``output`` is still open; the ``with`` block
        # closes it on the way out.
        return output.getvalue()
if __name__ == '__main__':
    # Interactive entry point: ask for a PDF URL on stdin, then print the
    # text extracted from that document.
    #
    # Example URL: https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf
    # Output:
    # >>> Dummy PDF file
    pdf_url = input("Enter URL of PDF from which to extract text: ")
    print(convert_pdf_to_txt(pdf_url))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment