code-simple · January 24, 2023 12:25
diff --git a/PDF_Downloader.py b/PDF_Downloader.py
 # Import libraries
 import shlex
 import subprocess
 from urllib.parse import urljoin

 from bs4 import BeautifulSoup
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from webdriver_manager.chrome import ChromeDriverManager

 # URL from which PDFs are to be downloaded
 PAGE_URL = "https://www.geeksforgeeks.org/how-to-extract-pdf-tables-in-python/"

 # Chrome is started with the '--headless' flag; the Service is built from
 # whatever ChromeDriverManager().install() returns
 options = Options()
 options.add_argument('--headless')
 s = Service(ChromeDriverManager().install())
 driver = webdriver.Chrome(options=options, service=s)

 # Load the page through the browser (Selenium) rather than a plain HTTP request
 driver.get(PAGE_URL)

 # Parse the page source obtained from the browser
 soup = BeautifulSoup(driver.page_source, 'html.parser')

 # Collect every hyperlink (<a> tag) present on the page
 links = soup.find_all('a')

 # Counter of PDF links processed so far
 i = 0


 # This helper executes system commands; it is used here to run wget.
 # Selenium is used instead of requests because many websites return an error when accessed through a requests response.
 # Selenium solves that, but its page_source is a str, not bytes, and bytes are needed to write a PDF file; Selenium cannot return a bytes response.
 # As far as I can tell, delegating the download to wget is the only solution.
 def runcmd(cmd, verbose = False, *args, **kwargs):
     """Run an external command, wait for it to finish, and return its exit status.

     Args:
         cmd: The command to run. A string is handed to the system shell, so
             callers must quote any untrusted part of it. A list or tuple of
             arguments is executed directly, without a shell.
         verbose: When true, print the captured stdout (stripped) and stderr.
         *args: Accepted for backward compatibility; ignored.
         **kwargs: Accepted for backward compatibility; ignored.

     Returns:
         The integer exit status of the process, so callers can detect
         failures instead of having them silently discarded.
     """
     process = subprocess.Popen(
         cmd,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         text=True,
         # Only a command string needs the shell; combining shell=True with an
         # argument list would drop the arguments on POSIX.
         shell=isinstance(cmd, str),
     )
     # communicate() drains both pipes and waits for the process to exit
     std_out, std_err = process.communicate()
     if verbose:
         print(std_out.strip(), std_err)
     return process.returncode



 # From all links check for pdf link and
 # if present download file
 try:
     # Relative hrefs are resolved against the address of the loaded page
     base_url = driver.current_url
     for link in links:
         href = link.get('href')
         # Skip anchors without an href and links that do not mention '.pdf'
         if not href or '.pdf' not in href:
             continue
         i += 1
         print("Downloading file: ", i)
         # urljoin leaves absolute URLs untouched. shlex.quote (POSIX shell
         # quoting) stops shell metacharacters in a scraped URL from being
         # executed, because runcmd receives a command string.
         runcmd('wget ' + shlex.quote(urljoin(base_url, href)))

     print("All PDF files downloaded")
 finally:
     # quit() ends the whole driver session; it runs even if a download step raises
     driver.quit()
	# Import libraries
	import subprocess
	from selenium.webdriver.chrome.service import Service
	from selenium import webdriver
	from selenium.webdriver.chrome.options import Options
	from selenium.webdriver.common.by import By
	from webdriver_manager.chrome import ChromeDriverManager
	from bs4 import BeautifulSoup

	# URL from which PDFs are to be downloaded
	PAGE_URL = "https://www.geeksforgeeks.org/how-to-extract-pdf-tables-in-python/"

	# Chrome is started with the '--headless' flag; the Service is built from
	# whatever ChromeDriverManager().install() returns
	options = Options()
	options.add_argument('--headless')
	s = Service(ChromeDriverManager().install())
	driver = webdriver.Chrome(options=options, service=s)

	# Load the page through the browser (Selenium) rather than a plain HTTP request
	driver.get(PAGE_URL)

	# Parse the page source obtained from the browser
	soup = BeautifulSoup(driver.page_source, 'html.parser')

	# Collect every hyperlink (<a> tag) present on the page
	links = soup.find_all('a')

	# Counter of PDF links processed so far
	i = 0


	# This helper executes system commands; it is used here to run wget.
	# Selenium is used instead of requests because many websites return an error when accessed through a requests response.
	# Selenium solves that, but its page_source is a str, not bytes, and bytes are needed to write a PDF file; Selenium cannot return a bytes response.
	# As far as I can tell, delegating the download to wget is the only solution.
	def runcmd(cmd, verbose = False, *args, **kwargs):
	    """Run an external command, wait for it to finish, and return its exit status.

	    Args:
	        cmd: The command to run. A string is handed to the system shell, so
	            callers must quote any untrusted part of it. A list or tuple of
	            arguments is executed directly, without a shell.
	        verbose: When true, print the captured stdout (stripped) and stderr.
	        *args: Accepted for backward compatibility; ignored.
	        **kwargs: Accepted for backward compatibility; ignored.

	    Returns:
	        The integer exit status of the process, so callers can detect
	        failures instead of having them silently discarded.
	    """
	    process = subprocess.Popen(
	        cmd,
	        stdout=subprocess.PIPE,
	        stderr=subprocess.PIPE,
	        text=True,
	        # Only a command string needs the shell; combining shell=True with an
	        # argument list would drop the arguments on POSIX.
	        shell=isinstance(cmd, str),
	    )
	    # communicate() drains both pipes and waits for the process to exit
	    std_out, std_err = process.communicate()
	    if verbose:
	        print(std_out.strip(), std_err)
	    return process.returncode



	# From all links check for pdf link and
	# if present download file
	try:
	    # Relative hrefs are resolved against the address of the loaded page
	    base_url = driver.current_url
	    for link in links:
	        href = link.get('href')
	        # Skip anchors without an href and links that do not mention '.pdf'
	        if not href or '.pdf' not in href:
	            continue
	        i += 1
	        print("Downloading file: ", i)
	        # urljoin leaves absolute URLs untouched. shlex.quote (POSIX shell
	        # quoting) stops shell metacharacters in a scraped URL from being
	        # executed, because runcmd receives a command string.
	        runcmd('wget ' + shlex.quote(urljoin(base_url, href)))

	    print("All PDF files downloaded")
	finally:
	    # quit() ends the whole driver session; it runs even if a download step raises
	    driver.quit()