Created
January 22, 2025 08:18
-
-
Save yeiichi/807b5e38d1e11417ca09e77742815d89 to your computer and use it in GitHub Desktop.
Extract strings directly from a PDF file URL.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from io import BytesIO | |
import requests | |
from pdfminer.high_level import extract_text | |
def extract_str_fm_pdf_url(url_pdf, timeout=30):
    """Extract strings directly from a PDF file URL.

    The PDF is downloaded into memory and handed to pdfminer's
    ``extract_text``; nothing is written to disk.

    Args:
        url_pdf (str): URL of the target PDF file.
        timeout (float | tuple): Timeout in seconds forwarded to
            ``requests.get``. Defaults to 30 so that an unresponsive
            server cannot block the caller forever.

    Returns:
        The extracted text on success. If the server answers with an HTTP
        error status, or the text extraction fails, the caught exception
        object is returned instead of being raised.

    Raises:
        Any exception raised by ``requests.get`` itself (e.g. a connection
        failure or a timeout) is not caught here and propagates to the
        caller.
    """
    response = requests.get(url_pdf, timeout=timeout)
    try:
        # Reject 4xx/5xx responses up front: otherwise an HTML error page
        # would be fed to the PDF parser and reported as a parse failure.
        response.raise_for_status()
        return extract_text(BytesIO(response.content))
    except Exception as err_msg:
        # Deliberately broad: existing callers receive the error as the
        # return value rather than as a raised exception.
        return err_msg
if __name__ == '__main__':
    # Script entry point: ask for a PDF URL on stdin and print whatever
    # extract_str_fm_pdf_url returns for it.
    target_url = input('Target PDF URL? >> ')
    extraction_result = extract_str_fm_pdf_url(target_url)
    print(extraction_result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment