Last active
February 18, 2017 16:37
-
-
Save voroninman/fdd91e936722450617a215c8c927e8fd to your computer and use it in GitHub Desktop.
Download and save PostgreSQL documentation as a single HTML-page replacing URLs with HTML anchors
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Download and save PostgreSQL documentation as a single | |
HTML-page replacing URLs with HTML anchors. | |
To convert the resulting HTML file to an e-book use | |
appropriate online services. | |
Be carful opennig it in a browser. The resulting HTML | |
could be heavy. | |
""" | |
import re | |
import requests | |
# Documentation release to fetch, the URL prefix its pages are read from,
# and the page file that ends the crawl once it shows up as a "Next" target.
version = '9.6'
base_url = 'https://www.postgresql.org/docs/{0}/static/'.format(version)
stop_at_page_file = 'client-interfaces.html'

# A single requests session shared by every page download.
_session = requests.Session()
def extract_page_html(html):
    """Return the content part of a documentation page.

    The content is everything from the first ``<h1`` up to, but not
    including, the last ``<div class="NAVFOOTER">`` found in *html*.

    Raises:
        ValueError: if *html* has no ``<h1`` followed by a NAVFOOTER div,
            i.e. the page does not have the expected layout.
    """
    # The capture group stops right before the footer tag, so the tag does
    # not have to be sliced off by a hard-coded length afterwards. ``.+`` is
    # greedy and DOTALL lets it span lines, so the *last* footer div wins.
    match = re.search(
        r'(<h1.+)<div class=\"NAVFOOTER\">',
        html,
        flags=re.DOTALL)
    if match is None:
        raise ValueError(
            'page content not found: expected "<h1" followed by '
            '\'<div class="NAVFOOTER">\'')
    return match.group(1)
def search_next_page_file(html):
    """Return the target of the page's "Next" navigation link.

    Searches *html* for a quoted value that is followed by an
    ``accesskey="N"`` attribute and the link text ``Next``, and returns
    that value. Returns ``None`` when no such link is present.
    """
    found = re.search(
        r'([^\"]+)\"\s+accesskey=\s*\"N\">Next</a>',
        html)
    return found.group(1) if found else None
def replace_links(html):
    """Turn links to sibling pages into in-document anchor links.

    Each ``href="<page>.html"``, with or without a trailing ``#fragment``,
    becomes ``href="#<page>.html"``; the fragment is dropped. Text that
    does not match this shape is returned unchanged.
    """
    page_link = re.compile(r'href=\s*"([\w\-]+\.html)(?:#[\w\-]+)?"')
    return page_link.sub(r'href="#\1"', html)
# Crawl the documentation page by page, starting at index.html and following
# each page's "Next" link, appending every page body to one output file.
# Targets Python 3: the file is opened in text mode with an explicit UTF-8
# encoding and is handed str, not encoded bytes.
with open('postgresql-{0}-docs.html'.format(version), 'w',
          encoding='utf-8') as f:
    f.write('<!doctype html><html><body>')
    next_page_file = 'index.html'
    while True:
        page_file = next_page_file
        url = base_url + page_file
        response = _session.get(url)
        # Fail loudly on an HTTP error instead of parsing an error page.
        response.raise_for_status()
        html = response.text
        page_html = extract_page_html(html)
        page_html = replace_links(page_html)
        # Anchor named after the page file; it is the target of the
        # href="#<page>.html" links produced by replace_links().
        f.write('<a name="{0}"></a>'.format(page_file))
        f.write(page_html)
        print(url)
        next_page_file = search_next_page_file(html)
        # Stop when there is no "Next" link or the configured stop page is
        # reached; the stop page itself is not downloaded.
        if not next_page_file or next_page_file == stop_at_page_file:
            break
    # Close the elements opened before the loop so the document is complete.
    f.write('</body></html>')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment