mmahbub · October 1, 2021 20:04
diff --git a/extract_text_from_pdf b/extract_text_from_pdf
 # pip install "PyPDF2<3"     -> read and parse the PDF content (PdfFileReader was removed in PyPDF2 3.0)
 # pip install requests       -> download the PDF over HTTP
 # pip install beautifulsoup4 -> parse HTML (this is the package that provides `bs4`)

 from PyPDF2 import PdfFileReader
 import requests
 import io
 from bs4 import BeautifulSoup

 def getPdfMeta(pdfLink, timeout=30):
  """Download a PDF and print its document metadata.

  Prints the author, creator, producer, subject, title and page count
  to stdout. Nothing is returned.

  Args:
    pdfLink: URL of the PDF to download.
    timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
      stalled server cannot block the call forever.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
  """
  response = requests.get(pdfLink, timeout=timeout)
  # Fail on HTTP errors here instead of handing an error page to the PDF parser.
  response.raise_for_status()
  with io.BytesIO(response.content) as f:
    pdf = PdfFileReader(f)
    information = pdf.getDocumentInfo()
    number_of_pages = pdf.getNumPages()
    # getDocumentInfo() returns None for a PDF without an /Info dictionary;
    # getattr keeps the report printable ("None" per field) in that case.
    author, creator, producer, subject, title = (
      getattr(information, field, None)
      for field in ("author", "creator", "producer", "subject", "title")
    )
    txt = f"""
    Author: {author}\n
    Creator: {creator}\n
    Producer: {producer}\n
    Subject: {subject}\n
    Title: {title}\n
    Number of pages: {number_of_pages}
    """
    # Here the metadata of your pdf
    print(f'Metadata:\n {txt}')


 def getPdfContent(pdfLink, numpage, timeout=30):
  """Download a PDF and return the extracted text of one page.

  Args:
    pdfLink: URL of the PDF to download.
    numpage: Page index handed unchanged to ``PdfFileReader.getPage``
      (zero-based per the PyPDF2 documentation).
    timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
      stalled server cannot block the call forever.

  Returns:
    The text produced by ``extractText()`` for the requested page.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
  """
  response = requests.get(pdfLink, timeout=timeout)
  # Fail on HTTP errors here instead of handing an error page to the PDF parser.
  response.raise_for_status()
  with io.BytesIO(response.content) as f:
    pdf = PdfFileReader(f)
    page = pdf.getPage(numpage)
    page_content = page.extractText()
    # Optional clean-up the original author left disabled:
    #page_content = page_content.replace('\n', '')
    #page_content = page_content.replace('˜', 'fi')
    #page_content = page_content.replace('˚', 'ff')
    #tokenized_content = nltk.sent_tokenize(page_content)

  return page_content

 # Example run against one sample PDF. Each call below downloads the file again.
 pdfLink1 = "https://www.osce.org/files/f/documents/9/d/499420.pdf"
 # Prints the metadata report to stdout.
 getPdfMeta(pdfLink1)
 # Extracts the text of page index 0; the returned string is not stored or printed here.
 getPdfContent(pdfLink1, 0)
	# pip install "PyPDF2<3"     -> read and parse the PDF content (PdfFileReader was removed in PyPDF2 3.0)
	# pip install requests       -> download the PDF over HTTP
	# pip install beautifulsoup4 -> parse HTML (this is the package that provides `bs4`)

	from PyPDF2 import PdfFileReader
	import requests
	import io
	from bs4 import BeautifulSoup

	def getPdfMeta(pdfLink, timeout=30):
	    """Download a PDF and print its document metadata.

	    Prints the author, creator, producer, subject, title and page count
	    to stdout. Nothing is returned.

	    Args:
	        pdfLink: URL of the PDF to download.
	        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so
	            a stalled server cannot block the call forever.

	    Raises:
	        requests.HTTPError: If the server answers with a 4xx/5xx status.
	    """
	    response = requests.get(pdfLink, timeout=timeout)
	    # Fail on HTTP errors here instead of handing an error page to the PDF parser.
	    response.raise_for_status()
	    with io.BytesIO(response.content) as f:
	        pdf = PdfFileReader(f)
	        information = pdf.getDocumentInfo()
	        number_of_pages = pdf.getNumPages()
	        # getDocumentInfo() returns None for a PDF without an /Info dictionary;
	        # getattr keeps the report printable ("None" per field) in that case.
	        author, creator, producer, subject, title = (
	            getattr(information, field, None)
	            for field in ("author", "creator", "producer", "subject", "title")
	        )
	        txt = f"""
	Author: {author}\n
	Creator: {creator}\n
	Producer: {producer}\n
	Subject: {subject}\n
	Title: {title}\n
	Number of pages: {number_of_pages}
	"""
	        # Here the metadata of your pdf
	        print(f'Metadata:\n {txt}')


	def getPdfContent(pdfLink, numpage, timeout=30):
	    """Download a PDF and return the extracted text of one page.

	    Args:
	        pdfLink: URL of the PDF to download.
	        numpage: Page index handed unchanged to ``PdfFileReader.getPage``
	            (zero-based per the PyPDF2 documentation).
	        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so
	            a stalled server cannot block the call forever.

	    Returns:
	        The text produced by ``extractText()`` for the requested page.

	    Raises:
	        requests.HTTPError: If the server answers with a 4xx/5xx status.
	    """
	    response = requests.get(pdfLink, timeout=timeout)
	    # Fail on HTTP errors here instead of handing an error page to the PDF parser.
	    response.raise_for_status()
	    with io.BytesIO(response.content) as f:
	        pdf = PdfFileReader(f)
	        page = pdf.getPage(numpage)
	        page_content = page.extractText()
	        # Optional clean-up the original author left disabled:
	        #page_content = page_content.replace('\n', '')
	        #page_content = page_content.replace('˜', 'fi')
	        #page_content = page_content.replace('˚', 'ff')
	        #tokenized_content = nltk.sent_tokenize(page_content)

	    return page_content

	# Example run against one sample PDF. Each call below downloads the file again.
	pdfLink1 = "https://www.osce.org/files/f/documents/9/d/499420.pdf"
	# Prints the metadata report to stdout.
	getPdfMeta(pdfLink1)
	# Extracts the text of page index 0; the returned string is not stored or printed here.
	getPdfContent(pdfLink1, 0)