Last active
March 28, 2018 12:36
-
-
Save jaklinger/cf395a1e06228470bcef28f56de9ca48 to your computer and use it in GitHub Desktop.
Example of scraping in JS-redirected iframe
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
def wait_and_find(driver, element_id, load_time):
    """Wait for the element with ID ``element_id`` to exist and return it.

    Args:
        driver: Selenium WebDriver, already pointed at the page (or frame)
            that is expected to contain the element.
        element_id: Value of the ``id`` attribute to look for.
        load_time: Time budget in seconds. A third of it is spent sleeping
            unconditionally, then it is used as the explicit-wait timeout.

    Returns:
        The located WebElement.

    Raises:
        selenium.common.exceptions.TimeoutException: If the element is not
            present in the DOM within ``load_time`` seconds.
    """
    # Unconditional pause before polling; presumably this lets the
    # JS redirect settle so we do not match an element from the old page.
    time.sleep(load_time / 3)
    locator = (By.ID, element_id)
    condition = expected_conditions.presence_of_element_located(locator)
    # until() already returns the element found by the condition, so there is
    # no need for a second lookup (find_element_by_id is also removed in
    # Selenium 4.3+, and a second lookup could race with a page update).
    return WebDriverWait(driver, load_time).until(condition)
def get_grant_info_div(driver, url, iframe_id,
                       tab_to_click, div_id,
                       keyword, load_time=1.5):
    """Load ``url``, enter its iframe, click a tab and return a div's HTML.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Page to open.
        iframe_id: ID of the iframe that holds the content.
        tab_to_click: ID of the element inside the iframe to click.
        div_id: ID of the div (inside the iframe) whose HTML is wanted.
        keyword: Text that must appear in the div's HTML; used as a check
            that the content has actually finished loading.
        load_time: Per-element time budget in seconds, forwarded to
            ``wait_and_find``.

    Returns:
        The ``innerHTML`` of the div, as a string.

    Raises:
        AssertionError: If ``keyword`` is not in the div's HTML.
        Exception: Whatever ``wait_and_find`` / Selenium raise when an
            element does not appear in time.
    """
    driver.get(url)
    iframe = wait_and_find(driver, iframe_id, load_time)
    # switch_to.frame replaces the deprecated (and, in Selenium 4, removed)
    # driver.switch_to_frame().
    driver.switch_to.frame(iframe)
    wait_and_find(driver, tab_to_click, load_time).click()
    div = wait_and_find(driver, div_id, load_time)
    html = div.get_attribute("innerHTML")
    # Raise explicitly instead of using `assert`, which is stripped under
    # `python -O` and would let a half-loaded page pass as a success.
    # AssertionError is kept so existing callers see the same exception type.
    if keyword not in html:
        raise AssertionError("'{}' not found in iframe".format(keyword))
    return html
if __name__ == "__main__":
    # Put the current directory first on PATH so that Selenium picks up the
    # local chromedriver. os.getcwd() is used rather than $PWD because PWD is
    # not set in every environment (e.g. Windows).
    os.environ["PATH"] = (os.getcwd() + os.pathsep
                          + os.environ.get("PATH", ""))

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")

    def _new_driver():
        """Start a fresh headless Chrome session."""
        # NOTE(review): newer Selenium releases expect `options=` (and a
        # Service object) here; kept as-is for the Selenium version this
        # script was written against.
        return webdriver.Chrome("chromedriver",
                                chrome_options=chrome_options)

    # Page-specific element IDs and the text that marks a fully loaded page
    iframe_id = "embeddedIframe"
    tab_to_click = "synopsisDetailsTab"
    div_id = "synopsisDetailsContent"
    keyword = "Funding Opportunity Number"
    max_load_time = 10
    min_load_time = 1
    urls = ["https://www.grants.gov/view-opportunity.html?oppId=39707"]*10

    driver = _new_driver()
    try:
        # Iterate through URLs and get the iframe source
        for url in urls:
            load_time = min_load_time
            html = None
            while html is None:
                try:
                    html = get_grant_info_div(driver, url, iframe_id,
                                              tab_to_click, div_id,
                                              keyword, load_time=load_time)
                except Exception:
                    print("ERROR ({})".format(url))
                    # Retry with a longer time budget and a clean browser
                    load_time += 1
                    if load_time > max_load_time:
                        # Explicit raise: an `assert` would vanish under
                        # `python -O` and leave this loop running forever.
                        raise RuntimeError("Maximum retries exceeded")
                    driver.quit()
                    driver = _new_driver()
            # Do whatever you want with the HTML source
    finally:
        # Cleanly exit, even if an unexpected error escaped the loop
        driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment