Skip to content

Instantly share code, notes, and snippets.

@code-simple
Last active January 24, 2023 12:25
Show Gist options
  • Save code-simple/38918e17f25981d4b69137e758114c7d to your computer and use it in GitHub Desktop.
Save code-simple/38918e17f25981d4b69137e758114c7d to your computer and use it in GitHub Desktop.
# Import libraries
import shlex
import subprocess
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# Running count of PDF links handled; incremented by the download loop below.
i = 0

# Run Chrome headless, i.e. without opening a visible browser window.
options = Options()
options.add_argument('--headless')

# Let webdriver_manager provide the chromedriver that the Service wraps.
driver_path = ChromeDriverManager().install()
s = Service(driver_path)
driver = webdriver.Chrome(service=s, options=options)

# Load the page whose hyperlinks will be scanned for PDF files.
driver.get("https://www.geeksforgeeks.org/how-to-extract-pdf-tables-in-python/")

# Parse the HTML as seen by the browser.
page_html = driver.page_source
soup = BeautifulSoup(page_html, 'html.parser')

# Collect every <a> element present on the page.
links = soup.find_all('a')
# runcmd (below) executes system commands; it is used here to run wget.
# Selenium is used instead of requests because many websites return an error when accessed with a plain HTTP request.
# However, Selenium's page_source is a str, not bytes, and bytes are needed to write a PDF file; Selenium cannot provide the raw bytes of a response.
# As far as I can tell, handing the download to wget is the only workaround.
def runcmd(cmd, verbose=False, *args, **kwargs):
    """Run an external command, wait for it to finish, and report its status.

    Args:
        cmd: The command to execute. A string is interpreted by the system
            shell. A list or tuple of arguments is executed directly,
            without a shell, so its items need no quoting.
        verbose: When true, print the captured standard output (stripped)
            followed by the captured standard error.
        *args: Accepted for backward compatibility; ignored.
        **kwargs: Accepted for backward compatibility; ignored.

    Returns:
        The exit status of the command, so callers can detect failures
        (0 conventionally means success).
    """
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        # Only plain strings need shell parsing; argument lists run as-is.
        shell=isinstance(cmd, str),
    )
    if verbose:
        print(completed.stdout.strip(), completed.stderr)
    return completed.returncode
# Check every link for a PDF target and, if present, download the file.
try:
    for link in links:
        href = link.get('href')
        # Anchors without an href cannot point at a PDF.
        if not href or '.pdf' not in href:
            continue
        i += 1
        print("Downloading file: ", i)
        # Resolve relative links against the loaded page so wget always
        # receives an absolute URL.
        pdf_url = urljoin(driver.current_url, href)
        # The URL comes from the scraped page and runcmd passes the string
        # to a shell, so quote it to keep it a single, inert argument.
        runcmd('wget ' + shlex.quote(pdf_url))
    print("All PDF files downloaded")
finally:
    # quit() ends the whole WebDriver session, not just the current window,
    # and the finally clause ensures this also happens when a download fails.
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment