Last active
February 5, 2019 18:23
-
-
Save cheevahagadog/bb60240edd16499d3e368879f2f7d07a to your computer and use it in GitHub Desktop.
A brief example of using Requestium in a cloud environment while setting the download directory and waiting for files to download.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Note: this script is designed to run on Python 3.6.
# -- Prerequisites:
# 1. Install Python (I like using Miniconda, version 3.6)
# 2. Install Git
# 3. Install a chromedriver
# 4. Install Chrome
# 5. Install the Python dependencies
from pyvirtualdisplay import Display # For headless browsing | |
from selenium import webdriver | |
import requestium | |
import time | |
import glob | |
import os | |
import logging | |
from logging.handlers import RotatingFileHandler | |
# Module-level logger: INFO-and-above records are appended to "log.txt",
# which rolls over at 1 MiB and keeps up to 10 backup files.
file_handler = RotatingFileHandler(
    "log.txt",
    mode='a',
    maxBytes=1024 * 1024,
    backupCount=10,
)
file_handler.setLevel(logging.INFO)
# Each record carries timestamp, level, message and the emitting file/line.
log_formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]')
file_handler.setFormatter(log_formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(file_handler)
class Browser:
    """Headless Chrome session (driven through Requestium) that downloads files to a directory.

    Instance attributes set by ``__init__``:
        display: the ``pyvirtualdisplay.Display`` the browser renders into.
        chromedriver: path to the chromedriver executable.
        webdriver_options: options dict handed to ``requestium.Session``.
        directory: directory polled by ``wait_for_download_to_complete``.
        session: the ``requestium.Session`` that owns the Chrome driver.
    """

    def __init__(self, link, directory=None):
        """Set up the virtual display and the Chrome session, then visit ``link``.

        Args:
            link: str, URL to the page you want to visit
            directory: str, full path to an existing directory where downloaded files will land.
                Any other value (``None``, a non-string, a path that is not a directory)
                leaves Chrome's download preference unset and polls ``'.'`` instead.
        """
        # Stored on the instance: both start() below and wrap_up() need it.
        self.display = Display(visible=0, size=(1000, 1000))
        self.chromedriver = "/usr/local/bin/chromedriver"

        if isinstance(directory, str) and os.path.isdir(directory):
            self.webdriver_options = {'prefs': {'download.default_directory': directory}}
            self.directory = directory
        else:
            # An empty dict (not None) so the session always receives a mapping.
            self.webdriver_options = {}
            # NOTE(review): without the download preference Chrome presumably saves to
            # its own default folder, which may not be '.' -- confirm before relying on it.
            self.directory = '.'

        self.session = requestium.Session(
            webdriver_path=self.chromedriver,
            browser='chrome',
            default_timeout=15,
            webdriver_options=self.webdriver_options)

        # The display is started before the first access to session.driver.
        self.display.start()
        self.session.driver.get(link)
        logger.info("(setup) Started display, session and visited %s.", link)

    def _click_by_id(self, element_id):
        """Wait for the element with ``element_id`` and click it."""
        self.session.driver.ensure_element_by_id(element_id).click()

    def select_and_download_data(self, file_name="*.csv"):
        """Example for interacting with the NCES college website.

        Filters the search to public New York schools with bachelor's and graduate
        programs, exports the results as CSV, then waits for the download.

        Args:
            file_name: str, name (or glob pattern) of the exported file to wait for.
                NOTE(review): the exact name the site gives the export is not known
                here, so the default matches any ``.csv`` in the download directory,
                including one that was already there -- pass the real name if known.

        Returns:
            bool: True if the exported file showed up in the download directory, else False

        Raises:
            ValueError: propagated from ``wait_for_download_to_complete``.
        """
        # Let's just select the schools in New York (because I love New York!)
        states = self.session.driver.ensure_element_by_id(
            'ctl00_cphCollegeNavBody_ucSearchMain_ucMapMain_lstState')
        state_select = requestium.Select(states)
        state_select.deselect_all()
        state_select.select_by_value("NY")

        # Only selecting schools with undergrad and grad options
        self._click_by_id("ctl00_cphCollegeNavBody_ucSearchMain_chkGrad")
        self._click_by_id("ctl00_cphCollegeNavBody_ucSearchMain_chkBach")
        # Public schools
        self._click_by_id("ctl00_cphCollegeNavBody_ucSearchMain_chkControlPublic")
        # Show Results
        self._click_by_id("ctl00_cphCollegeNavBody_ucSearchMain_btnSearch")
        # Click on "Export Results"
        self._click_by_id("ctl00_cphCollegeNavBody_ucFavoritesTop_divExport")
        # Click the CSV option output
        self._click_by_id("ctl00_cphCollegeNavBody_ucFavoritesTop_rdbCSV")
        # Click the final export button --> this will download the file to our specified directory
        self._click_by_id("ctl00_cphCollegeNavBody_ucFavoritesTop_aExportData")

        return self.wait_for_download_to_complete(file_name=file_name)

    def wait_for_download_to_complete(self, file_name, delay=2, tries_max=10):
        """Waits for a file to download before continuing execution.

        Args:
            file_name: str, the name of the file to be downloaded including the extension.
                It is matched with ``glob``, so a pattern such as ``"*.csv"`` also works.
            delay: int, how many seconds to wait before checking the file again
            tries_max: int, how many checks may find no download in progress and no
                finished file before giving up. Checks made while a download is in
                progress do not count against this limit.

        Returns:
            bool: True if the file successfully downloaded, else False

        Raises:
            ValueError: if a download was seen in progress and then disappeared
                without the expected file appearing.
        """
        downloading_file = os.path.join(self.directory, "Unconfirmed*.crdownload")
        finished_file = os.path.join(self.directory, file_name)
        n_tries = 0
        download_started = False
        # Bound up front so the return is defined even when the loop never runs.
        file_is_downloaded = False

        while n_tries < tries_max:
            currently_downloading = bool(glob.glob(downloading_file))
            file_is_downloaded = bool(glob.glob(finished_file))

            if file_is_downloaded:
                break
            if currently_downloading:
                # A file is downloading, but our expected file isn't there yet
                download_started = True
            elif download_started:
                # The partial file is gone and the expected one never appeared.
                raise ValueError(f"File downloaded but was perhaps misnamed. No {finished_file} file found!")
            else:
                # We wait for a file to show up as downloading
                n_tries += 1
            time.sleep(delay)

        return file_is_downloaded

    def wrap_up(self, session=None):
        """Close the browser session and the display.

        Args:
            session: the session whose driver should be quit; defaults to ``self.session``.
        """
        if session is None:
            session = self.session
        try:
            session.driver.quit()  # Stops the Chrome session
        finally:
            # stop() is the public Display API; run it even if quitting Chrome failed.
            self.display.stop()
        logger.info('(wrap_up) Closed display and chrome browser')

    def main(self):
        """Run the download example and always tear the browser and display down.

        Returns:
            bool: the result of ``select_and_download_data``.
        """
        try:
            return self.select_and_download_data()
        finally:
            self.wrap_up()
# Script entry point: open the NCES College Navigator in a headless browser and
# run the example download into the configured directory.
if __name__ == '__main__':
    start_url = "https://nces.ed.gov/collegenavigator/"
    download_dir = '/my/download/location/dir'
    browser = Browser(link=start_url, directory=download_dir)
    browser.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment