Skip to content

Instantly share code, notes, and snippets.

@mmahbub
Last active October 1, 2021 20:04
Show Gist options
  • Save mmahbub/6f840fdf35c5447dd1c27a3c7049305f to your computer and use it in GitHub Desktop.
Save mmahbub/6f840fdf35c5447dd1c27a3c7049305f to your computer and use it in GitHub Desktop.
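# pip install "PyPDF2<3.0"    -> read and parse the PDF content (this code uses the pre-3.0 PdfFileReader API)
# pip install requests        -> download the PDF over HTTP
# pip install beautifulsoup4  -> parse HTML (this package provides the `bs4` module imported below)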
from PyPDF2 import PdfFileReader
import requests
import io
from bs4 import BeautifulSoup
def getPdfMeta(pdfLink, timeout=30):
    """Download a PDF and print its document metadata.

    The response body is parsed in memory; the author, creator, producer,
    subject, title and page count are printed to stdout. Nothing is returned.

    Args:
        pdfLink: URL of the PDF document to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled server would block the call forever.

    Raises:
        requests.HTTPError: If the server replies with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(pdfLink, timeout=timeout)
    # Fail early on an error page instead of handing its HTML to the PDF parser.
    response.raise_for_status()

    with io.BytesIO(response.content) as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

        # getDocumentInfo() can return None when the PDF has no info
        # dictionary; getattr with a default keeps the report printable
        # ("None" per field) instead of raising AttributeError.
        # The fields are read while the buffer is still open because the
        # reader may resolve values lazily from the stream.
        author = getattr(information, "author", None)
        creator = getattr(information, "creator", None)
        producer = getattr(information, "producer", None)
        subject = getattr(information, "subject", None)
        title = getattr(information, "title", None)

    # Same layout as before: each field is followed by a blank line.
    txt = (
        f"\nAuthor: {author}\n\n"
        f"Creator: {creator}\n\n"
        f"Producer: {producer}\n\n"
        f"Subject: {subject}\n\n"
        f"Title: {title}\n\n"
        f"Number of pages: {number_of_pages}\n"
    )
    print(f'Metadata:\n {txt}')
def getPdfContent(pdfLink, numpage, timeout=30):
    """Download a PDF and return the extracted text of a single page.

    Args:
        pdfLink: URL of the PDF document to download.
        numpage: Page index handed to ``PdfFileReader.getPage`` (PyPDF2
            numbers pages from zero).
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled server would block the call forever.

    Returns:
        The string produced by ``extractText()`` for the requested page,
        with no post-processing applied.

    Raises:
        requests.HTTPError: If the server replies with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(pdfLink, timeout=timeout)
    # Fail early on an error page instead of handing its HTML to the PDF parser.
    response.raise_for_status()

    with io.BytesIO(response.content) as f:
        pdf = PdfFileReader(f)
        page = pdf.getPage(numpage)
        # Extract while the buffer is open: the reader pulls page data from it.
        page_content = page.extractText()

    # NOTE: the original carried disabled clean-up steps (dropping '\n',
    # mapping '˜' -> 'fi' and '˚' -> 'ff', nltk sentence tokenizing). They
    # stay disabled, so callers receive the raw extracted text.
    return page_content
# Example run against a publicly hosted PDF. Each call below performs its own
# HTTP download of the document.
pdfLink1 = "https://www.osce.org/files/f/documents/9/d/499420.pdf"
# Prints the metadata report to stdout.
getPdfMeta(pdfLink1)
# Extracts the text of page index 0.
# NOTE(review): the returned string is discarded here; it is only displayed
# when this line is the last expression of a notebook/REPL cell. Wrap it in
# print() if output is expected when running as a script.
getPdfContent(pdfLink1, 0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment