Created
October 19, 2021 11:17
-
-
Save bepcyc/c5a4f381d88024912b9c0aed1aba625b to your computer and use it in GitHub Desktop.
Crawl a dynamic website in headless mode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### based on great SO answers: https://stackoverflow.com/a/50593885/918211 and https://stackoverflow.com/a/46768243/918211
## Debian/Ubuntu specific
# sudo apt install -y firefox-geckodriver
# python3 -m venv venv
# cd venv
# source bin/activate
# pip install selenium beautifulsoup4
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Crawl a JavaScript-rendered page with headless Firefox: load the URL, wait
# for a dynamically inserted element to appear, then parse the rendered HTML
# with BeautifulSoup.

options = Options()
# Pass the headless flag as a browser argument; the `options.headless`
# setter is deprecated/removed in newer Selenium 4 releases.
options.add_argument("-headless")
driver = webdriver.Firefox(options=options)

try:
    driver.get("https://yourdynamicwebsite.org/dgfdgdgd")
    try:
        # Waits up to 10 seconds until the element is located. Other wait
        # conditions such as visibility_of_element_located or
        # text_to_be_present_in_element can be used instead.
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "myDynamicElement"))
        )
    except TimeoutException:
        # The wait gave up before the element showed up in the DOM.
        print("Couldnt locate element")
    else:
        # Only read the page once the dynamic content is known to be present.
        html = driver.page_source
        # NOTE: the "lxml" parser needs the separate `lxml` package
        # (`pip install lxml`); it is not pulled in by beautifulsoup4.
        soup = bs(html, "lxml")
        # Filter by tag and, optionally, by other attributes.
        dynamic_text = soup.find_all("p", {"class": "class_name"})
finally:
    # Always shut the browser down so no Firefox/geckodriver process is
    # left running, whether or not the element was found.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment