Last active
February 4, 2022 08:03
-
-
Save iam-mhaseeb/d2dfb5eb20db187db59d4fa15aa57439 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard library imports.
import logging
import re
from collections import deque
from urllib.parse import urlparse, urlsplit

# Third-party imports.
import requests
from bs4 import BeautifulSoup

# Module-level logger, configured by the embedding application.
LOGGER = logging.getLogger(__name__)
class BaseScraper:
    """Base class for queue-driven scrapers.

    Subclasses must implement :meth:`parse` and may declare a class-level
    ``start_urls`` list, used when no explicit list is passed to ``__init__``.
    """

    def __init__(self, start_urls=None):
        """Seed the crawl queue.

        :param start_urls: iterable of URLs to crawl; when omitted, falls
            back to the class attribute ``start_urls`` (backward-compatible
            default so ``CustomScraper()``-style construction works).
        :raises Exception: if no URLs are available from either source.
        """
        urls = start_urls or getattr(self, 'start_urls', None)
        if not urls:
            raise Exception('start_urls are required to start crawling...')
        self.start_urls = deque(urls)
        # URLs whose download raised; always present as an instance attribute
        # (the original referenced a bare, undefined ``broken_urls`` name).
        self.broken_urls = set()

    def crawl(self):
        """Drain the queue, fetching each URL and delegating to parse()."""
        # Process urls one by one until we exhaust the queue.
        while self.start_urls:
            url = self.start_urls.popleft()
            LOGGER.info('Processing url: {}'.format(url))
            try:
                response = requests.get(url)
            except Exception as e:
                LOGGER.error('Failed to process url: {} with following error: {}'.format(url, e))
                # Fix: was the undefined bare name ``broken_urls`` (NameError).
                self.broken_urls.add(url)
                continue
            self.parse(response)

    def parse(self, response):
        """Handle one HTTP response; subclasses must override this."""
        raise NotImplementedError('Implementation of parse function is required...')

    def run(self):
        """Entry point: log start/finish around a full crawl."""
        LOGGER.info('Running scraper...')
        self.crawl()
        LOGGER.info('Finished running scraper!')
class CustomScraper(BaseScraper):
    """Example scraper that sorts discovered links into local/foreign sets."""

    # Default seed URLs, consumed by BaseScraper.__init__.
    start_urls = ['https://scrapethissite.com']
    # Bookkeeping sets kept at class level (as in the original design, they
    # are shared across instances).
    processed_urls = set()
    local_urls = set()
    foreign_urls = set()
    broken_urls = set()

    def parse(self, response):
        """Extract every anchor href and classify it as local or foreign.

        :param response: a ``requests`` response whose body is parsed with
            BeautifulSoup (lxml parser).
        """
        soup = BeautifulSoup(response.text, "lxml")
        for link in soup.find_all('a'):
            # Anchors without an href contribute an empty string.
            extracted_url = link.attrs["href"] if "href" in link.attrs else ''
            # Fix: the original added the undefined name ``url`` to the sets
            # below (NameError on the very first link).
            if response.url in extracted_url:
                self.local_urls.add(extracted_url)
            else:
                self.foreign_urls.add(extracted_url)
            self.processed_urls.add(extracted_url)
if __name__ == "__main__":
    # Fix: the original compared __name__ against "main", so the guard never
    # matched and the scraper never ran; it also called CustomScraper() while
    # BaseScraper.__init__ requires start_urls — pass the class-level seed
    # list explicitly.
    scraper = CustomScraper(CustomScraper.start_urls)
    scraper.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment