Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save RelativisticMechanic/602cfd8c971efe2700318a7d62dd4a0f to your computer and use it in GitHub Desktop.
Save RelativisticMechanic/602cfd8c971efe2700318a7d62dd4a0f to your computer and use it in GitHub Desktop.
A Web Crawler in Python That Takes Screenshots and Saves Them (Uses Selenium and BeautifulSoup 4)
# implement a web crawler in python
import os
from urllib.parse import urljoin

import bs4
import requests
import selenium.webdriver
# URL
source_url = 'https://bhagvad-gita.github.io/'
# Which folder to save screenshots in
folder = './screenshots/'
# How many levels to traverse?
levels = 1
selenium_driver = selenium.webdriver.Edge()
def crawl(source_url, drv, folder, max_level, currentlevel=0):
if(max_level < currentlevel):
return
urls = get_all_secure_hyperlinks(source_url)
take_screenshot_bulk(drv, folder, urls)
for url in urls:
crawl(url, drv, folder, max_level, currentlevel + 1)
def get_all_secure_hyperlinks(source_url):
reqs = requests.get(source_url)
soup = bs4.BeautifulSoup(reqs.text, 'html.parser')
urls = []
for link in soup.find_all('a'):
if(str(link.get('href')).startswith('https')):
urls.append(str(link.get('href')))
elif(str(link.get('href')).startswith('./')):
urls.append(source_url + str(link.get('href')[2:]))
return urls
def take_screenshot_bulk(drv, folder, urls):
for url in urls:
take_screenshot_of_page(drv, folder, url)
def take_screenshot_of_page(drv, folder, url):
drv.get(url)
drv.save_screenshot(f'./{folder}/{url.replace(":", "").replace("/", "")}_scrnshot.png')
crawl(source_url, selenium_driver, folder, levels)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment