Created
August 20, 2020 10:43
-
-
Save ivanistheone/ed654d3ccff7ef9f52891e056beaaf4e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def retrieve_flex_book_lesson(item):
    """Download a CK-12 flexbook lesson and package it as an HTML5 app node.

    Returns an HTML5AppNode wrapping a predictable zip of the processed
    lesson page, or None when processing rejects the lesson (too link-heavy,
    or it contains a non-freely-licensed image).
    """
    lesson_url = "/flx/get/perma/modality/lesson/{lesson_id}/{domain_id}?format=html".format(
        lesson_id=item["handle"],
        domain_id=item["domain"]["encodedID"],
    )
    workdir = tempfile.mkdtemp()
    try:
        download_ck12_file(
            lesson_url,
            workdir,
            filename="index.html",
            middleware_callbacks=[process_flex_book],
        )
    except (TooManyLinksToBotherIncluding, ContainsNonfreeImage):
        # The processing middleware rejected this lesson; skip it.
        return None
    shutil.copy("resources/flex-styles.css", workdir)
    download_mathjax(workdir)
    # preview_in_browser(workdir)
    zippath = create_predictable_zip(workdir)
    return HTML5AppNode(
        files=[HTMLZipFile(zippath)],
        **shared_node_attributes(item, title_suffix=" (Flexbook)")
    )
def process_flex_book(content, baseurl, destpath, **kwargs):
    """Middleware callback that cleans a downloaded flexbook lesson page.

    Localizes images, strips iframes/ids/empty paragraphs/links, removes
    trailing "Review (Answers)"/"Resources" sections, and injects the shared
    head/body resource blocks. Returns the processed HTML as ASCII bytes
    (output of BeautifulSoup.prettify).

    Raises:
        TooManyLinksToBotherIncluding: if the page is mostly link text.
        ContainsNonfreeImage: if an image is marked with a non-open license.
    """
    html = BeautifulSoup(content, "html.parser")

    # Some pages are basically just a list of links; since we strip links
    # out, there's no use keeping those. Guard the ratio computation against
    # an entirely empty page (the previous version divided by zero there).
    total_text_len = len(html.text)
    link_text_len = len(" ".join([a.text for a in html.find_all("a")]))
    if total_text_len and link_text_len / total_text_len > 0.1:
        raise TooManyLinksToBotherIncluding()

    # Download all the images in the page, and replace the img srcs to make
    # them local.
    for img in html.find_all("img"):
        if img.get("data-flx-url"):
            filename = urlparse(img.get("data-flx-url")).path.split("/")[-1]
            del img["data-flx-url"]
        else:
            filename = hashlib.md5(img["src"].encode()).hexdigest() + ".png"

        # Check up to 8 nodes before and after the image for an "@@license"
        # comment. Stop if we walk off either end of the document (the
        # previous version would raise TypeError on `in None` there).
        # Renamed from `license`/`next` to avoid shadowing builtins.
        img_license = ""
        node_before = img.previous
        for _ in range(8):
            if node_before is None:
                break
            if "@@license" in node_before:
                img_license = node_before.split('"')[1].lower()
                break
            node_before = node_before.previous
        node_after = img.next
        for _ in range(8):
            if node_after is None:
                break
            if "@@license" in node_after:
                img_license = node_after.split('"')[1].lower()
                break
            node_after = node_after.next

        # Check whether the image is marked as under a non-open license,
        # and abort if it is.
        nonfree_markers = ("shutterstock", "under license", "permission", "getty", "all rights reserved")
        if any(marker in img_license for marker in nonfree_markers):
            # TODO: determine when an image is critical to the flow of the
            # text, and don't skip the entire flexbook if possible
            raise ContainsNonfreeImage()
        img["src"], _ = download_ck12_file(img["src"], destpath, filename=filename)

    # Drop iframes entirely; we can't support embedded external content.
    for iframe in html.find_all("iframe"):
        iframe.extract()

    # Remove all id attributes.
    for tag in html.find_all():
        if tag.get("id"):
            del tag["id"]

    # Remove empty paragraph tags.
    for p in html.find_all("p"):
        # if it has attributes, it might be needed for something
        if p.attrs:
            continue
        # if it has non-empty text nodes, don't remove
        if any(child.strip() for child in p.children if isinstance(child, str)):
            continue
        # if it has non-text nodes, don't remove
        if any(not isinstance(child, str) for child in p.children):
            continue
        # if we got this far, it's just a boring old empty p tag
        p.extract()

    # Remove links, keeping their text content (targets won't exist offline).
    for a in html.find_all("a"):
        a.unwrap()

    # Remove extra sections at end that we don't want.
    for h3 in html.find_all("h3"):
        if h3.text in ["Review (Answers)", "Resources"]:
            node = h3
            while True:
                # Read the sibling first, then extract the current node.
                node, _ = node.nextSibling, node.extract()
                if not node or node.name == "h3" or str(node).strip().startswith("End inserted XHTML"):
                    break

    # Insert the shared head and body resources into the html.
    insert_codeblock_into_soup_node("resources/flex-headblock.html", html.head)
    insert_codeblock_into_soup_node("resources/flex-bodyblock.html", html.body)
    return html.prettify(encoding="ascii")
def insert_codeblock_into_soup_node(filepath, soupnode, position=-1):
    """Parse the HTML file at `filepath` and insert its top-level nodes into `soupnode`.

    Nodes are inserted at `position` (default -1, i.e. just before the last
    existing child of `soupnode`).
    """
    with open(filepath) as fh:
        fragment = BeautifulSoup(fh.read(), "html.parser")
    # Snapshot the children first: inserting a node into another tree
    # detaches it from `fragment`, which would break live iteration.
    for child in list(fragment.children):
        soupnode.insert(position, child)
def perform_mockjax_api_downloads(filepath, domain_id, domain_handle, question_id, artifact_id):
    """Download and register mockjax mocks for all API endpoints a plix page needs.

    Each call appends a $.mockjax(...) registration to the mock script at
    `filepath`, so the offline page can replay the live API responses.
    """
    # The trial cookie is needed for the assessment endpoints to respond.
    cookies = {"asmt-plix-trial": "1"}
    resp = download_and_mock_api_endpoint_with_mockjax("/assessment/api/get/info/test/plix%20practice/plixID/" + question_id, filepath, cookies=cookies)
    # The test id from the first response feeds the start/tests endpoint below.
    test_id = json.loads(resp)["response"]["test"]["_id"]
    download_and_mock_api_endpoint_with_mockjax("/assessment/api/render/questionInstance?evalData=True&includeConcepts=True&ans=True&qID=" + question_id, filepath, cookies=cookies)
    download_and_mock_api_endpoint_with_mockjax("/assessment/api/start/tests/{test_id}?instanceBundle=true&evalData=true&includePLIX=true".format(test_id=test_id), filepath, cookies=cookies)
    download_and_mock_api_endpoint_with_mockjax("/flx/get/minimal/modalities/{handle}?ownedBy=ck12&modalities=lesson".format(handle=domain_handle), filepath)
    download_and_mock_api_endpoint_with_mockjax("/flx/get/info/artifact/{a_id}".format(a_id=artifact_id), filepath)
    download_and_mock_api_endpoint_with_mockjax("/assessment/api/browse/info/questions/geometry-interactive?pageNum=1&pageSize=6&filters=encodedIDs," + domain_id, filepath)
    download_and_mock_api_endpoint_with_mockjax("/taxonomy/get/info/concept/" + domain_id, filepath)
    # Raw string: the pattern is a JS regex; r"..." keeps the same runtime
    # value while avoiding Python's invalid "\." escape warning.
    download_and_mock_api_endpoint_with_mockjax("/assessment/tools/geometry-tool/challengeMeTemp.html", filepath, mocked_url_pattern=r".*challengeMeTemp\.html", wrap_in_string=True)
def download_and_mock_api_endpoint_with_mockjax(url, xhr_mock_path, mocked_url_pattern=None, middleware_callbacks=None, middleware_kwargs=None, cookies=None, wrap_in_string=False):
    """Fetch `url` and register its response as a $.mockjax mock in the script at `xhr_mock_path`.

    The mock script (a "define" AMD module) is created if it does not exist,
    and each call splices one more $.mockjax registration in at the divider
    comment. Returns the (possibly middleware-processed) response content.

    Args:
        url: relative API url to download.
        xhr_mock_path: path of the mock JS file to create/extend.
        mocked_url_pattern: JS regex source matched against request URLs to
            decide when to mock; defaults to a pattern built from the url path.
        middleware_callbacks: callable or list of callables applied to the
            downloaded content before it is embedded.
        middleware_kwargs: extra keyword args passed to each callback.
        cookies: cookies dict for the request.
        wrap_in_string: wrap non-JSON content (e.g. HTML) as a JS string literal.
    """
    # Raw string r"\/": same runtime bytes as before, but no invalid-escape
    # warning. The backslash is for the JS regex literal we emit below.
    mocked_url_pattern = mocked_url_pattern or (".*" + urlparse(url).path.replace("/", r"\/") + ".*")
    # make the request to the URL
    content = make_request(make_fully_qualified_url(url), cookies=cookies).content.decode()
    # if there are any middleware callbacks, apply them to the content
    if middleware_callbacks:
        if not isinstance(middleware_callbacks, list):
            middleware_callbacks = [middleware_callbacks]
        # Bug fix: callback(content, **None) raised TypeError whenever
        # middleware_callbacks was given without middleware_kwargs.
        middleware_kwargs = middleware_kwargs or {}
        for callback in middleware_callbacks:
            content = callback(content, **middleware_kwargs)
    divider = "// insertion point for more lines"
    if os.path.exists(xhr_mock_path):
        with open(xhr_mock_path, "r") as f:
            xhr_mock_text = f.read()
    else:
        # Fresh AMD module skeleton with the insertion-point divider.
        xhr_mock_text = """
define(["mockjax"], function(mockjax) {{
{divider}
}});
""".format(divider=divider)
    if wrap_in_string:
        # repr() produces a quoted/escaped literal that is valid JS too.
        content = repr(content)
    with open(xhr_mock_path, "w") as f:
        mock_line = """
$.mockjax({{url: /{url}/i, responseText: {content}}});
""".format(url=mocked_url_pattern, content=content.replace("\n", " ").replace("\r", " "))
        # Splice the new registration in just before the divider so later
        # calls keep appending at the same point.
        xhr_mock_text = xhr_mock_text.replace(divider, mock_line + divider)
        f.write(xhr_mock_text)
    return content
def add_xhr_mocking_to_index(content, destpath, **kwargs):
    """Middleware callback that wires XHR mocking into a downloaded index page.

    Copies the static mocking scripts next to the page, generates the
    downloaded-API mock file, and appends <script> tags for all of them to
    the page head. Returns the modified HTML as a string.
    """
    soup = BeautifulSoup(content, "html.parser")
    # copy over the static resources that are needed
    xhr_deps = ["xhr.mock.js", "plix-xhr.mock.js"]
    for dep in xhr_deps:
        shutil.copy(os.path.join("resources", dep), destpath)
    # download necessary API endpoints
    downloaded_mock_filename = "xhr.mock.apidownloads.js"
    download_and_mock_api_endpoint_with_xhr("http://google.com/", destpath, downloaded_mock_filename, mocked_url="./build/././modalityAssign/js/templates/modal.info.html")
    # insert script tags into page
    for script_src in ("./" + name for name in xhr_deps + [downloaded_mock_filename]):
        soup.head.append(soup.new_tag("script", src=script_src))
    return str(soup)
def download_and_mock_api_endpoint_with_xhr(url, destpath, filename, mocked_url=None, middleware_callbacks=None, middleware_kwargs=None):
    """Fetch `url` and append an xhr_mock registration for it to destpath/filename.

    Args:
        url: relative API url to download.
        destpath: directory containing the mock file.
        filename: name of the mock JS file (appended to, created if missing).
        mocked_url: the url to match against requests to decide when to mock;
            if not set, use the actual url.
        middleware_callbacks: callable or list of callables applied to the
            downloaded content before it is embedded.
        middleware_kwargs: extra keyword args passed to each callback.

    Returns the (possibly middleware-processed) response content.
    """
    mocked_url = mocked_url or url
    # make the request to the URL
    content = make_request(make_fully_qualified_url(url)).content.decode()
    # if there are any middleware callbacks, apply them to the content
    if middleware_callbacks:
        if not isinstance(middleware_callbacks, list):
            middleware_callbacks = [middleware_callbacks]
        # Bug fix: callback(content, **None) raised TypeError whenever
        # middleware_callbacks was given without middleware_kwargs.
        middleware_kwargs = middleware_kwargs or {}
        for callback in middleware_callbacks:
            content = callback(content, **middleware_kwargs)
    xhr_mock_path = os.path.join(destpath, filename)
    # Append-mode: each call adds one more mock registration line.
    with open(xhr_mock_path, "a") as f:
        mock_line = """
xhr_mock.mock_conditionally({{url: "{url}"}}, {content});
""".format(url=mocked_url, content=json.dumps(content))
        f.write(mock_line)
    return content
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment