Created
December 29, 2022 20:13
-
-
Save RelativisticMechanic/602cfd8c971efe2700318a7d62dd4a0f to your computer and use it in GitHub Desktop.
A Web Crawler in Python That Takes Screenshots and Saves Them (Uses Selenium and BS4/BeautifulSoup)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# implement a web crawler in python | |
import os
from urllib.parse import urljoin

import bs4
import requests
import selenium.webdriver
# URL | |
source_url = 'https://bhagvad-gita.github.io/' | |
# Which folder to save screenshots in | |
folder = './screenshots/' | |
# How many levels to traverse? | |
levels = 1 | |
selenium_driver = selenium.webdriver.Edge() | |
def crawl(source_url, drv, folder, max_level, currentlevel=0): | |
if(max_level < currentlevel): | |
return | |
urls = get_all_secure_hyperlinks(source_url) | |
take_screenshot_bulk(drv, folder, urls) | |
for url in urls: | |
crawl(url, drv, folder, max_level, currentlevel + 1) | |
def get_all_secure_hyperlinks(source_url): | |
reqs = requests.get(source_url) | |
soup = bs4.BeautifulSoup(reqs.text, 'html.parser') | |
urls = [] | |
for link in soup.find_all('a'): | |
if(str(link.get('href')).startswith('https')): | |
urls.append(str(link.get('href'))) | |
elif(str(link.get('href')).startswith('./')): | |
urls.append(source_url + str(link.get('href')[2:])) | |
return urls | |
def take_screenshot_bulk(drv, folder, urls): | |
for url in urls: | |
take_screenshot_of_page(drv, folder, url) | |
def take_screenshot_of_page(drv, folder, url): | |
drv.get(url) | |
drv.save_screenshot(f'./{folder}/{url.replace(":", "").replace("/", "")}_scrnshot.png') | |
crawl(source_url, selenium_driver, folder, levels) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment