Created March 4, 2019 02:12
A one-off script for converting a specific dump of the SDL wiki into a Dash/Zeal docset... in case it's useful to someone
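Usage (assuming the script is saved as, say, make_sdl2_docset.py; the name is not fixed by the gist): run it under Python 2 from a scratch directory. It downloads and unpacks SDL-wiki.zip if needed, prunes the pages listed in the configuration block, and produces SDL_2.docset plus a gzip-compressed SDL_2.tar for sharing.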
#!/usr/bin/env python
"""Quick script to generate a Dash/Zeal docset from the SDL 2 wiki.
Requirements:
- Python 2.x (the urllib and urlparse imports are Python 2 only, so 3.x
  will not work without porting)
- LXML (for parsing the non-XML HTML used in the pages)
- Pillow (for converting favicon.ico into icon.png)
- http://www.libsdl.org/tmp/SDL-wiki.zip
"""
from __future__ import print_function

__author__ = "Stephan Sokolow (deitarion/SSokolow)"
__license__ = "MIT"

# ---=== Configuration Begins ===---

# Pages which would be undesirably excluded by categorical rules below
WHITELISTED_PAGES = [
    'FrontPage.html',
    'SDL_SaveDollarTemplate.html'
]
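# Note on the lists below: entries may be shell-style glob patterns (they are
# expanded with glob.glob() in the main program), and the parenthesised hex
# codes in some names appear to be how the wiki dump escapes characters in
# page names, e.g. '(20)' for a space and '(2d)' for a hyphen.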
UNWANTED_PAGES = [
    # Home pages missing CategoryHomepage
    'Sam(20)Lantinga.html',
    'Spikerocks101.html',
    # Useless in offline docs
    'APIContributionStyleGuide.html',
    'Contributing.html',
    'error.log',
    'SDL(2d)gsoc*.html',  # Scratch pages
    'SDL*Template.html',  # Page Templates (Note: see whitelist)
    'SG*.html',           # Style Guides
    'Roadmap.html',
    'Test.html',
    'ToDo.html',
]

# Unwanted pages which also contain lists of links to unwanted pages
# (This throws out the system pages and most of the contributor home pages)
UNWANTED_GROUPS = [
    'AdminGroup.html',
    'AutoAdminGroup.html',
    'CategoryHomepage.html',
    'ContributorGroup.html',
    'EditorGroup.html',
    'SystemPages*Group.html',
    'Wiki(20)Help.html',
]
# Pages which shouldn't trigger a missing-page warning if not found
# (e.g. stuff linked from retained pages which we intentionally stripped)
EXPECTED_DEADLINKS = [
    # Stuff we generate our own more helpful replacements for
    'WordIndex.html',
    'TitleIndex.html',
    'CategoryCategory.html',
]
# Metadata
DOCSET_ID = 'sdl2'
DOCSET_NAME = 'SDL 2'
START_PAGE = 'index.html'
BASE_URL = 'https://wiki.libsdl.org/'

SRC_DIR = 'SDL-wiki'
ICO_URL = 'https://www.libsdl.org/favicon.ico'
ZIP_URL = 'http://www.libsdl.org/tmp/SDL-wiki.zip'

ICO_FILE = "sdl2.ico"
ZIP_FILE = "%s.zip" % SRC_DIR
TAR_FILE = "%s.tar" % DOCSET_NAME.replace(' ', '_')

PLIST_TMPL = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"
    "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>CFBundleIdentifier</key>
    <string>{id}</string>
    <key>CFBundleName</key>
    <string>{name}</string>
    <key>DocSetPlatformFamily</key>
    <string>{id}</string>
    <key>isDashDocset</key>
    <true/>
    <key>dashIndexFilePath</key>
    <string>{start_page}</string>
    <key>DashDocSetFallbackURL</key>
    <string>{base_url}</string>
</dict>
</plist>"""
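# isDashDocset, dashIndexFilePath, and DashDocSetFallbackURL are Dash-specific
# additions to the standard docset Info.plist; https://kapeli.com/docsets
# documents what each key controls.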
# ---=== Code Begins ===---

import glob, os, shutil, sqlite3, tarfile

from urllib import url2pathname, urlretrieve
from urlparse import urlparse

# External dependencies
from lxml.html import parse  # Needed for parsing non-XML HTML
from PIL import Image
def prep_path(url):
    """Resolve a URL to a local path, enforcing the deletion whitelist."""
    parts = urlparse(url)
    if parts.scheme or parts.netloc:
        return
    path = os.path.normcase(os.path.normpath(url2pathname(parts.path)))
    if path in WHITELISTED_PAGES:
        return
    return path
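# Note: prep_path() returns None both for external URLs and for whitelisted
# pages; callers treat a None result as "leave this link/file alone".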
unwanted_files = []

def add_unwanted(page_fname):
    """Resolve a URL to a local path and mark it for deletion"""
    path = prep_path(page_fname)
    if not path:
        return
    # Queue the resolved local path (not the raw href) so URL-escaped links
    # are handled consistently
    if path not in unwanted_files and os.path.exists(path):
        unwanted_files.append(path)
def delete_unwanted():
    """Apply and empty the deletion queue"""
    cwd = os.getcwd()
    for fname in unwanted_files:
        fpath = os.path.abspath(fname)
        if not fpath.startswith(cwd):
            print("Skipping for safety: %s" % fpath)
            continue
        if os.path.isdir(fpath):
            shutil.rmtree(fpath)
        elif os.path.isfile(fpath):
            os.remove(fpath)
    unwanted_files[:] = []
# ---=== Main Program Begins ===---

# Unpack SDL-wiki.zip if not already done
if not os.path.exists(SRC_DIR):
    if not os.path.exists(ZIP_FILE):
        print("Downloading %s..." % ZIP_URL)
        urlretrieve(ZIP_URL, ZIP_FILE)

    print("Unpacking %s..." % ZIP_FILE)
    import zipfile
    with zipfile.ZipFile(ZIP_FILE, 'r') as zobj:
        zobj.extractall()
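# Everything from here until the docset directory is assembled runs inside
# the unpacked SDL-wiki/ tree.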
os.chdir(SRC_DIR)

print("Removing unwanted pages...")
# Mark all unwanted boilerplate files for deletion
for glob_pat in UNWANTED_PAGES:
    for page_fname in glob.glob(glob_pat):
        add_unwanted(page_fname)
for glob_pat in UNWANTED_GROUPS:
    for group_fname in glob.glob(glob_pat):
        add_unwanted(group_fname)
        for node in parse(group_fname).findall('.//a'):
            add_unwanted(node.get('href', ''))
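# Nothing has actually been deleted yet; pages are only queued here, and the
# whole queue is applied in one pass by delete_unwanted() below.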
print("Searching for and removing mis-tagged CategoryHomepage pages...") | |
# Mark all home pages where CategoryHomepage was mis-applied somehow | |
# (Where they link to it, but it doesn't link to them) | |
for page in glob.glob('*.html'): | |
for node in parse(page).findall('.//a'): | |
if 'CategoryHomepage.html' in node.get('href', ''): | |
add_unwanted(page) | |
# Delete all marked files | |
delete_unwanted() | |
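# The dump keeps each page's attachments under attachments/<PageName>/, so any
# attachment directory left without a matching <PageName>.html after the
# deletions above is orphaned.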
print("Deleting orphaned attachments...") | |
remaining = os.listdir('.') | |
for fname in os.listdir('attachments'): | |
if fname + '.html' not in remaining: | |
add_unwanted(os.path.join('attachments', fname)) | |
delete_unwanted() | |
print("Removing dead links and missing images...") | |
for page in glob.glob('*.html'): | |
changed, root = False, parse(page) | |
for node in root.findall('.//a'): | |
link_url = prep_path(node.get('href', '')) | |
if not link_url: | |
continue | |
if os.path.exists(link_url): | |
continue | |
elif link_url in EXPECTED_DEADLINKS or link_url in UNWANTED_PAGES: | |
node.tag = 'span' | |
changed = True | |
else: | |
print("WARNING: Missing page: %s" % link_url) | |
# Remove dead <img> tags | |
for node in root.findall('.//img'): | |
if not os.path.exists(prep_path(node.get('src', ''))): | |
node.getparent().remove(node) | |
if changed: | |
root.write(page) | |
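# Dash and Zeal both expect the on-disk layout
# <Name>.docset/Contents/Resources/Documents/, with the HTML tree under
# Documents/ and the Info.plist directly under Contents/.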
print("Setting up docset directory structure...") | |
os.chdir(os.pardir) | |
dsdir = "%s.docset" % DOCSET_NAME.replace(' ', '_') | |
cntdir = os.path.join(dsdir, "Contents") | |
resdir = os.path.join(cntdir, "Resources") | |
os.makedirs(resdir) | |
docdir = os.path.join(resdir, "Documents") | |
os.rename(SRC_DIR, docdir) | |
print("Generating Info.plist...") | |
with open(os.path.join(cntdir, "Info.plist"), 'w') as fobj: | |
fobj.write(PLIST_TMPL.format( | |
id=DOCSET_ID, | |
name=DOCSET_NAME, | |
start_page=START_PAGE, | |
base_url=BASE_URL, | |
)) | |
print("Generating index...") | |
conn = sqlite3.connect(os.path.join(resdir, "docSet.dsidx")) | |
conn.executescript(""" | |
CREATE TABLE searchIndex( | |
id INTEGER PRIMARY KEY, | |
name TEXT, | |
type TEXT, | |
path TEXT); | |
CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path); | |
""") | |
# Populate the index
for fname in os.listdir(docdir):
    fpath = os.path.join(docdir, fname)

    # Skip non-HTML files
    if not (os.path.isfile(fpath) and fname.endswith('.html')):
        continue

    # Parse the HTML and extract the title
    root = parse(fpath)
    entry_name = root.find('.//h1')
    if entry_name is not None:
        entry_name = entry_name.text
    else:  # Fail-safe for "replaced with..." pages
        entry_name = root.find('.//title').text

    # Infer a default type as well as we can
    if entry_name.startswith('SDL_'):
        entry_type = 'Function'
    else:
        entry_type = 'Guide'
    cats = ','.join(x.text or '' for x in root.findall('.//a'))
    for typename in ('Define', 'Enum', 'Struct'):
        if 'Category%s' % typename in cats:
            entry_type = typename
            break

    conn.execute("INSERT INTO searchIndex(name, type, path) "
                 "VALUES (?, ?, ?)", [entry_name, entry_type, fname])
conn.commit()
# Download favicon.ico if not already done
if not os.path.exists(ICO_FILE):
    print("Downloading %s..." % ICO_URL)
    urlretrieve(ICO_URL, ICO_FILE)

print("Converting %s to icon.png..." % ICO_FILE)
Image.open(ICO_FILE).save(os.path.join(dsdir, 'icon.png'))
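# Dash/Zeal pick up icon.png (and, optionally, a larger icon@2x.png) from the
# root of the .docset bundle.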
# TODO: https://kapeli.com/docsets#tableofcontents
# (Manually define what to traverse at the top levels so that the by-category
# traversal claims API pages first, then ignore links to pages that have
# already been visited in order to turn the directed graph into a tree.)

print("Archiving docset as %s for sharing..." % TAR_FILE)
with tarfile.open(TAR_FILE, 'w:gz') as tobj:
    tobj.add(dsdir, filter=lambda x:
             None if x.name.split('/')[-1] == '.DS_Store' else x)
print("Done.")