import requests
import tempfile

from bs4 import BeautifulSoup

from ricecooker.classes.nodes import ChannelNode, HTML5AppNode, TopicNode
from ricecooker.classes.files import HTMLZipFile, ThumbnailFile
from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter, InvalidatingCacheControlAdapter
from ricecooker.utils.html import download_file
from ricecooker.utils.zip import create_predictable_zip
from ricecooker.classes import licenses

sess = requests.Session()
cache = FileCache('.webcache')
basic_adapter = CacheControlAdapter(cache=cache)
forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache)

sess.mount('http://', forever_adapter)
sess.mount('https://', forever_adapter)
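
# With the adapters above mounted, every request made through `sess` should be
# cached on disk in .webcache and treated as fresh forever, so re-running the
# script won't re-fetch pages it has already downloaded.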

def make_fully_qualified_url(url):
    if url.startswith("//"):
        return "https:" + url
    if url.startswith("/"):
        return "https://en.wikipedia.org" + url
    assert url.startswith("http"), "Bad URL (relative to unknown location): " + url
    return url
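
# For example:
#   make_fully_qualified_url("//upload.wikimedia.org/img.png")  -> "https://upload.wikimedia.org/img.png"
#   make_fully_qualified_url("/wiki/Lime")                       -> "https://en.wikipedia.org/wiki/Lime"
#   make_fully_qualified_url("https://example.org/page")         -> returned unchanged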

def make_request(url, *args, **kwargs):
    response = sess.get(url, *args, **kwargs)
    if response.status_code != 200:
        print("NOT FOUND:", url)
    elif not response.from_cache:
        print("NOT CACHED:", url)
    return response

def get_parsed_html_from_url(url, *args, **kwargs):
    html = make_request(url, *args, **kwargs).content
    return BeautifulSoup(html, "html.parser")
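
# Typical usage (as in add_subpages_from_wikipedia_list below):
#   page = get_parsed_html_from_url("https://en.wikipedia.org/wiki/List_of_citrus_fruits")
#   table = page.find("table")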

def construct_channel(*args, **kwargs):

    channel = ChannelNode(
        source_domain="jamiealexandre.com",
        source_id="ja-html-test-1",
        title="JA HTML Test",
        thumbnail="https://lh3.googleusercontent.com/zwwddqxgFlP14DlucvBV52RUMA-cV3vRvmjf-iWqxuVhYVmB-l8XN9NDirb0687DSw=w300",
    )

    citrus_topic = TopicNode(source_id="List_of_citrus_fruits", title="Citrus!")
    channel.add_child(citrus_topic)
    add_subpages_from_wikipedia_list(citrus_topic, "https://en.wikipedia.org/wiki/List_of_citrus_fruits")

    potato_topic = TopicNode(source_id="List_of_potato_cultivars", title="Potatoes!")
    channel.add_child(potato_topic)
    add_subpages_from_wikipedia_list(potato_topic, "https://en.wikipedia.org/wiki/List_of_potato_cultivars")

    return channel
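
# The resulting channel tree looks roughly like:
#
#   JA HTML Test
#   ├── Citrus!      -> one HTML5 app node per fruit linked from the list page
#   └── Potatoes!    -> one HTML5 app node per cultivar linked from the list page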

def add_subpages_from_wikipedia_list(topic, list_url):

    # to understand how the following parsing works, look at:
    #   1. the source of the page (e.g. https://en.wikipedia.org/wiki/List_of_citrus_fruits), or inspect it in Chrome dev tools
    #   2. the documentation for BeautifulSoup version 4: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

    # parse the page into BeautifulSoup format, so we can loop through and manipulate it
    page = get_parsed_html_from_url(list_url)

    # extract the main table from the page
    table = page.find("table")

    # loop through all the rows in the table
    for row in table.find_all("tr"):

        # extract the columns (cells, really) within the current row
        columns = row.find_all("td")

        # some rows are empty, so just skip them
        if not columns:
            continue

        # get the link to the subpage
        link = columns[0].find("a")

        # some rows don't have a link, so skip them too
        if not link:
            continue

        # extract the URL and title for the subpage
        url = make_fully_qualified_url(link["href"])
        title = link.text

        # attempt to extract a thumbnail for the subpage, from the second column in the table
        image = columns[1].find("img")
        thumbnail_url = make_fully_qualified_url(image["src"]) if image else None
        if thumbnail_url and not (thumbnail_url.endswith("jpg") or thumbnail_url.endswith("png")):
            thumbnail_url = None

        # download the wikipedia page into an HTML5 app node
        html5app = download_wikipedia_page(url, thumbnail=thumbnail_url, title=title)

        # add the downloaded HTML5 app node into the topic
        topic.add_child(html5app)

def download_wikipedia_page(url, thumbnail, title):

    # create a temp directory to house our downloaded files
    destpath = tempfile.mkdtemp()

    # download the main wikipedia page, apply a middleware processor, and call it index.html
    localref, _ = download_file(
        url,
        destpath,
        filename="index.html",
        middleware_callbacks=process_wikipedia_page,
        request_fn=make_request,
    )

    # turn the temp folder into a zip file
    zippath = create_predictable_zip(destpath)

    # create an HTML5 app node
    html5app = HTML5AppNode(
        files=[HTMLZipFile(zippath)],
        title=title,
        thumbnail=thumbnail,
        source_id=url.split("/")[-1],
        license=licenses.PublicDomainLicense(),
    )

    return html5app
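
# create_predictable_zip is used (rather than an ordinary zip) so the archive is
# built deterministically; the same page content should produce an identical zip
# across runs, which keeps the file hash stable between chef runs.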

def process_wikipedia_page(content, baseurl, destpath, **kwargs):
    page = BeautifulSoup(content, "html.parser")
    for image in page.find_all("img"):
        relpath, _ = download_file(make_fully_qualified_url(image["src"]), destpath, request_fn=make_request)
        image["src"] = relpath
    return str(page)
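
# This middleware runs on the page HTML before it's saved as index.html: each
# <img> is downloaded into destpath and its src is rewritten to the returned
# local relative path, so the page renders offline from inside the zip.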