Skip to content

Instantly share code, notes, and snippets.

@samrat
Last active March 5, 2016 07:25
Show Gist options
  • Save samrat/d3c57e1da3b5358eb383 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
from urllib.request import urlopen
import subprocess
import re
import os
import sys
import pickle
import shutil
"""
NOTE:
- Use shlex to split:
wkhtmltopdf --javascript-delay 1000 --run-script "workbook.hide_nav()" http://computationstructures.org/notes/isas/notes.html isas.pdf
- 1 sec delay is for MathJax
"""
# Index page whose links point at every chapter's notes.html.
index_url = "http://computationstructures.org/notes/top_level/notes.html"
# Site root; joined with the relative /notes/... paths scraped from the index.
base_url = "http://computationstructures.org"
def download_chapter(url, current_version):
    """Render one chapter to PDF with wkhtmltopdf, skipping unchanged ones.

    Parameters
    ----------
    url : str
        Full URL of the chapter's ``notes.html`` page.
    current_version : str or None
        "Last revised" timestamp recorded on a previous run, or ``None``
        when the chapter has never been downloaded.

    Returns
    -------
    tuple[str, str]
        ``(chapter_name, last_revised)`` for the version-cache.

    Raises
    ------
    AttributeError
        If either regex fails to match (``re.search`` returns ``None``);
        the caller treats any exception as a failed download.
    """
    # Fetch the page only to scrape the last-revised date. Decode the bytes
    # instead of str()-ing them, so the regex runs over real text rather
    # than a "b'...'" repr.
    resp = urlopen(url).read().decode('utf-8', errors='replace')
    last_revised = re.search(
        r'(Last revised )(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})',
        resp).groups()[1]
    chapter_name = re.search(r'(.*notes/)(\w+)(/notes.html)', url).groups()[1]
    out_file = chapter_name + ".pdf"

    # Skip re-rendering when the PDF already exists and the site's
    # last-revised timestamp matches what we recorded last time.
    if (last_revised == current_version) and os.path.isfile(out_file):
        print("Skipping", chapter_name, "Last revised=", last_revised)
        return (chapter_name, last_revised)

    print("Downloading", chapter_name, "(Last revised:", last_revised, ")")
    # --javascript-delay 1000 gives MathJax a second to typeset;
    # workbook.hide_nav() strips the site navigation from the render.
    subprocess.run(['wkhtmltopdf', '--javascript-delay', '1000',
                    '--run-script', 'workbook.hide_nav()', url, out_file])
    return (chapter_name, last_revised)
def download_all_chapters():
    """Scrape the course index and download every chapter PDF.

    Maintains a pickle file (``chapter_versions``) mapping chapter name to
    its "Last revised" timestamp so unchanged chapters are skipped on
    later runs. Links that fail for any reason are collected and reported
    at the end rather than aborting the whole run.
    """
    resp = urlopen(index_url).read()
    soup = BeautifulSoup(resp, "html.parser")

    # Load the version cache from the previous run; start fresh if the
    # file is missing or unreadable.
    try:
        with open("chapter_versions", 'rb') as versions_file:
            chapter_versions = pickle.load(versions_file)
    except Exception:
        chapter_versions = {}

    links = soup.find_all(href=re.compile(r".*/notes/.*"))
    failed_downloads = []
    for link in links:
        try:
            href = link.attrs['href']
            # groups()[1] is the "/notes/<chapter>/notes.html" suffix.
            path = re.search(r'(.*)(/notes.*/notes\.html)', href).groups()[1]
            full_url = base_url + path
            chapter_name = re.search(r'(.*notes/)(\w+)(/notes.html)',
                                     full_url).groups()[1]
            # None when the chapter has never been downloaded before.
            current_version = chapter_versions.get(chapter_name)
            (chapter_name, version) = download_chapter(full_url,
                                                       current_version)
            chapter_versions[chapter_name] = version
        except Exception:
            # Best-effort: remember the broken link and keep going.
            failed_downloads.append(link)

    # Persist the updated version cache for the next run.
    with open("chapter_versions", 'wb') as versions_file:
        pickle.dump(chapter_versions, versions_file)
    print("Failed links:", failed_downloads)
if __name__ == '__main__':
    # Bail out early with a clear message if the external renderer that
    # download_chapter() shells out to is not installed.
    if shutil.which("wkhtmltopdf") is None:
        print("Could not find wkhtmltopdf")
        sys.exit(1)
    download_all_chapters()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment