Last active
November 25, 2019 02:45
-
-
Save datadavev/8ec2f478b981fc23ee6f81fcceb0bc87 to your computer and use it in GitHub Desktop.
Headless partial render of a javascript app page
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Returns a partially rendered DataONE view, without the additional javascript
being executed.

This script is proof of concept only, and will only handle a single
request at a time since the single Chrome driver instance is reused.
Could recreate the driver for each request or use a pool of drivers.

Given a URL that starts with "https://search.", load the page using
Selenium and allow javascript to execute until the desired content
is rendered, then return the HTML of the page at that point. It will
include the basic text etc about the data package, and also includes
the JSON-LD in the header.

To run:

1. Install python flask, http://flask.pocoo.org/
2. Install Selenium with the chrome driver, https://www.seleniumhq.org/
3. Install python selenium, https://selenium-python.readthedocs.io/

Navigate to wherever this file is located then:

  $ FLASK_APP=amirror.py flask run

Example call:

  http://localhost:5000/S/https://search.dataone.org/view/https://pasta.lternet.edu/package/metadata/eml/edi/198/5

If running from a laptop, you can use ngrok to expose the service to the
Google structured data test tool.

e.g.: $ ngrok 5000

Then the service will be available at something like:

  http://27a31f08.ngrok.io/S/https://search.dataone.org/view/https://pasta.lternet.edu/package/metadata/eml/edi/198/5
""" | |
import logging | |
import time | |
from flask import Flask | |
from flask import abort, stream_with_context, request, Response | |
import selenium.common.exceptions | |
from selenium import webdriver | |
import selenium.webdriver.support.ui as ui | |
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | |
from selenium.webdriver.chrome.options import Options | |
from multiprocessing import Pool, cpu_count | |
# Process pool created at import time with the default worker count.
# NOTE(review): nothing in this listing uses it -- confirm whether it is
# still needed before removing.
selenium_pool = Pool()

# Flask application object that the route decorators below register with.
app = Flask("amirror")

# A single headless Chrome session is created at import time and shared by
# every request, which is why only one request can be handled at a time.
caps = DesiredCapabilities.CHROME
chrome_options = Options()
chrome_options.add_argument("--headless")
DRIVER = webdriver.Chrome(
    "chromedriver", desired_capabilities=caps, chrome_options=chrome_options
)

# Rendered page source keyed by the requested URL. Entries are never evicted,
# so a cached page is served unchanged for the lifetime of the process.
CACHE = {}
def shutdown_server():
    """Quit the shared browser session and stop the Werkzeug server.

    Must be called while a request is being handled, because the shutdown
    hook is read from the current request's WSGI environment.

    Raises:
        RuntimeError: if the environment has no ``werkzeug.server.shutdown``
            entry, i.e. the app is not being served by the Werkzeug server.
    """
    func = request.environ.get('werkzeug.server.shutdown')
    if func is None:
        raise RuntimeError('Not running with the Werkzeug Server')
    # quit() ends the WebDriver session and terminates the chromedriver
    # process. close() only closes the current window, which would leave the
    # driver process running after the server has exited.
    DRIVER.quit()
    func()
@app.route('/shutdown', methods=['POST'])
def shutdown():
    """Stop the service when a POST request is received.

    Example:
        curl -X POST http://localhost:5000/shutdown
    """
    shutdown_server()
    message = 'Server shutting down...'
    return message
def pageHasLoaded(driver):
    """Tell whether the document in ``driver`` has finished loading.

    The driver's current URL is logged at debug level, then the browser is
    asked for ``document.readyState``.

    Returns:
        True when the reported state equals "complete", otherwise False.
    """
    logging.debug("Checking if {} page is loaded.".format(driver.current_url))
    return driver.execute_script("return document.readyState;") == "complete"
def ajaxNotActive(driver):
    """Tell whether jQuery reports no ajax requests in flight.

    Evaluates ``$.active == 0`` in the page and returns the result. If the
    script cannot be executed (a ``WebDriverException`` is raised), the
    answer is False.
    """
    try:
        return driver.execute_script("return $.active == 0")
    except selenium.common.exceptions.WebDriverException:
        return False
@app.route("/S/<path:URL>")
def minimalMirror(URL):
    """Return the partially rendered page for ``URL``.

    Javascript is run until the element with id="Metadata" is present in the
    DOM and outstanding ajax requests are completed (or a timeout expires).
    The resulting page source is cached per URL, so later requests for the
    same URL are answered from the cache without touching the browser.

    Args:
        URL: the address to render; must start with "https://search.".

    Returns:
        A Response holding the cached page source, or a streaming Response
        that renders the page and yields its source once rendering is done.

    Raises:
        A 404 abort when ``URL`` does not start with "https://search.".
        WebDriverWait.until raises a Selenium TimeoutException if the
        document or the "Metadata" element is not ready within the timeout;
        this happens while the streamed response body is being generated.
    """
    if not URL.startswith("https://search."):
        abort(404)

    cached = CACHE.get(URL)
    if cached is not None:
        return Response(cached)

    # Seconds allowed for each wait condition and for the ajax poll.
    timeout = 6

    def generate(url):
        DRIVER.get(url)
        wait = ui.WebDriverWait(DRIVER, timeout)
        wait.until(pageHasLoaded)
        wait.until(lambda driver: driver.find_element_by_id("Metadata"))
        # Hide the loading overlay so it is not part of the returned markup.
        element = DRIVER.find_element_by_id("loading-app")
        DRIVER.execute_script("arguments[0].hidden = true", element)
        # Poll until jQuery reports no pending requests. The poll is bounded
        # because ajaxNotActive() returns False when its script cannot run
        # (e.g. jQuery is absent), which would otherwise loop forever.
        deadline = time.monotonic() + timeout
        while not ajaxNotActive(DRIVER) and time.monotonic() < deadline:
            time.sleep(0.1)
        CACHE[url] = DRIVER.page_source
        yield CACHE[url]

    return Response(stream_with_context(generate(URL)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment