Created
March 31, 2017 12:45
-
-
Save ugnb/c747df047d03edabd278ef03878c7d8f to your computer and use it in GitHub Desktop.
A faster way to extract all URLs from a page using Selenium WebDriver in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Print the href of every <a> element on a page using one WebDriver script call.

The hrefs are collected inside the browser by a single ``execute_script``
call that returns them as one JSON string, which is then decoded in Python.
"""
import json
from json import JSONDecodeError
from typing import List

from selenium import webdriver
from selenium.common.exceptions import WebDriverException

# Connect to a remote WebDriver hub and request a Chrome session.
driver = webdriver.Remote(
    command_executor='{}/wd/hub'.format('http://localhost:4444'),
    desired_capabilities=webdriver.ChromeOptions().to_capabilities(),
    keep_alive=True)

try:
    # Navigation errors are deliberately not caught here: they propagate to
    # the caller, but the ``finally`` below still releases the session.
    driver.get('http://google.com/')
    try:
        # The script maps every anchor element to its href in the browser and
        # hands the whole list back as a single JSON-encoded string.
        links_json: str = driver.execute_script(
            "return JSON.stringify([].slice.call(document.getElementsByTagName('a')).map(a => a.href))")
        urls: List[str] = json.loads(links_json)
        print(urls)
    except WebDriverException as ex:
        print("Failed to get links on page: {}".format(ex))
    except JSONDecodeError:
        # Only json.loads raises this, so links_json is always bound here.
        print("Failed to decode links JSON array: {}".format(links_json))
finally:
    # Always end the session, even when navigation or an unexpected error
    # aborts the script; otherwise the remote browser is left running.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment