Skip to content

Instantly share code, notes, and snippets.

@LewisGet
Last active December 3, 2024 18:20
Show Gist options
  • Save LewisGet/44f874b019a5b02c27116f4e55fb64b6 to your computer and use it in GitHub Desktop.
Save LewisGet/44f874b019a5b02c27116f4e55fb64b6 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from time import sleep
class WebCrawler:
    """Crawl a chain of pages in Chrome, accumulating each page's text.

    Starting from the URL given to the constructor, the crawler reads the
    ``innerText`` of the element located by ``content_selector``, clicks the
    link located by ``href_selector`` and repeats until that link has no
    usable ``href`` or any step raises.

    Attributes:
        driver: The Selenium Chrome WebDriver used for all page access.
        content_selector: ``(by, value)`` locator of the text container.
        href_selector: ``(by, value)`` locator of the "next page" link.
        content: Text accumulated from every page crawled so far.
        page_load_delay: Seconds to sleep after each navigation.
    """

    def __init__(self, default_url, page_load_delay=5):
        """Open Chrome on *default_url* and wait for the page to settle.

        Args:
            default_url: URL of the first page to crawl.
            page_load_delay: Seconds to sleep after the initial load and
                after every "next" click. Defaults to 5.
        """
        self.page_load_delay = page_load_delay
        self.driver = webdriver.Chrome()
        self.driver.get(default_url)
        self.content_selector = (By.ID, "content")
        self.href_selector = (By.CSS_SELECTOR, ".next a")
        self.content = ""
        sleep(self.page_load_delay)

    def get_next_dom(self):
        """Return the element for the "next page" link on the current page."""
        # Selectors are stored as (by, value) tuples, while find_element
        # takes them as two separate arguments, so the tuple is unpacked.
        return self.driver.find_element(*self.href_selector)

    def get_next_href(self):
        """Return the ``href`` attribute of the "next page" link."""
        return self.get_next_dom().get_attribute('href')

    def get_content(self):
        """Return the ``innerText`` of the current page's content element."""
        return self.driver.find_element(*self.content_selector).get_attribute('innerText')

    def crawl_page(self):
        """Collect the current page's text and move on to the next page.

        Returns:
            True if the text was collected and the "next" link was clicked;
            False if the link has no usable ``href`` or any step raised
            (the error is printed, not propagated).
        """
        try:
            self.content += self.get_content()
            if not self.get_next_href():
                # A link without a target means there is nowhere left to go.
                print("not found")
                return False
            self.get_next_dom().click()
            sleep(self.page_load_delay)
        except Exception as e:
            # Deliberately best-effort: any failure (for example a missing
            # element on the last page) ends the crawl while keeping the
            # text gathered so far.
            print(e)
            return False
        return True

    def execute(self):
        """Crawl page after page until one fails; return all collected text."""
        while self.crawl_page():
            pass
        return self.content

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()
# Crawl from the start URL and save all of the collected text to disk.
m = WebCrawler("https://127.0.0.1")
try:
    content = m.execute()
finally:
    # Always shut the browser down, even if crawling raised, so that no
    # Chrome process is left running.
    m.driver.quit()
path = "output.txt"
# The context manager flushes and closes the file once the text is written.
with open(path, 'w', encoding='UTF-8') as f:
    f.write(content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment