Last active
October 1, 2021 20:04
-
-
Save mmahbub/6f840fdf35c5447dd1c27a3c7049305f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install PyPDF2          -> read and parse the PDF content
# pip install requests        -> download the PDF over HTTP
# pip install beautifulsoup4  -> parse HTML (this is the package that provides `bs4`)
import io

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileReader
def getPdfMeta(pdfLink, timeout=30):
    """Download a PDF and print its document metadata.

    The PDF is fetched over HTTP, parsed in memory, and a summary
    (author, creator, producer, subject, title, page count) is printed
    to stdout.

    Args:
        pdfLink: URL of the PDF document to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.

    NOTE(review): this uses the legacy PyPDF2 names (``PdfFileReader``,
    ``getDocumentInfo``, ``getNumPages``); newer PyPDF2 releases renamed
    them -- confirm the installed version still provides these.
    """
    response = requests.get(pdfLink, timeout=timeout)
    # Fail early on an error page instead of handing HTML to the PDF parser.
    response.raise_for_status()

    with io.BytesIO(response.content) as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

    # A PDF without an info dictionary yields no metadata object at all;
    # getattr with a default keeps every field printable as "None" then.
    author = getattr(information, "author", None)
    creator = getattr(information, "creator", None)
    producer = getattr(information, "producer", None)
    subject = getattr(information, "subject", None)
    title = getattr(information, "title", None)

    # Each field is followed by a blank line, as in the original template.
    txt = (
        "\n"
        f"Author: {author}\n\n"
        f"Creator: {creator}\n\n"
        f"Producer: {producer}\n\n"
        f"Subject: {subject}\n\n"
        f"Title: {title}\n\n"
        f"Number of pages: {number_of_pages}\n"
    )

    # Here the metadata of your pdf
    print(f'Metadata:\n {txt}')
def getPdfContent(pdfLink, numpage, timeout=30):
    """Download a PDF and return the extracted text of one page.

    Args:
        pdfLink: URL of the PDF document to download.
        numpage: Zero-based index of the page to extract.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The text of the requested page as produced by PyPDF2's
        ``extractText``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.

    NOTE(review): this uses the legacy PyPDF2 names (``PdfFileReader``,
    ``getPage``, ``extractText``); newer PyPDF2 releases renamed them --
    confirm the installed version still provides these.
    """
    response = requests.get(pdfLink, timeout=timeout)
    # Fail early on an error page instead of handing HTML to the PDF parser.
    response.raise_for_status()

    with io.BytesIO(response.content) as f:
        pdf = PdfFileReader(f)
        page = pdf.getPage(numpage)
        # Extract while the in-memory stream is still open; the reader may
        # fetch page objects from it on demand.
        page_content = page.extractText()

    # NOTE: the raw text is returned unmodified. Earlier drafts stripped
    # newlines and mapped the characters '˜' -> 'fi' and '˚' -> 'ff';
    # callers that need that clean-up have to apply it themselves.
    return page_content
# Example run: print the metadata of a sample PDF, then extract the text of
# its first page (index 0).
pdfLink1 = "https://www.osce.org/files/f/documents/9/d/499420.pdf"
getPdfMeta(pdfLink1)
# The returned text is not stored or printed here; it is only shown when this
# line is the last expression of an interactive session / notebook cell.
getPdfContent(pdfLink1, 0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment