from bs4 import BeautifulSoup
from urllib.request import urlopen

import subprocess
import re
import os
import sys
import pickle
import shutil
""" | |
NOTE: | |
- Use shlex to split: | |
wkhtmltopdf --javascript-delay 1000 --run-script "workbook.hide_nav()" http://computationstructures.org/notes/isas/notes.html isas.pdf | |
- 1 sec delay is for MathJax | |
""" | |
index_url = "http://computationstructures.org/notes/top_level/notes.html"
base_url = "http://computationstructures.org"
def download_chapter(url, current_version):
    # Fetch the page once, only to extract its last-revised timestamp.
    resp = urlopen(url).read().decode("utf-8", errors="replace")
    last_revised = re.search(r'(Last revised )(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', resp).groups()[1]
    chapter_name = re.search(r'(.*notes/)(\w+)(/notes\.html)', url).groups()[1]
    out_file = chapter_name + ".pdf"

    # Skip the download if we already have a PDF for this revision.
    if (last_revised == current_version) and os.path.isfile(out_file):
        print("Skipping", chapter_name, "Last revised=", last_revised)
        return (chapter_name, last_revised)

    print("Downloading", chapter_name, "(Last revised:", last_revised, ")")
    subprocess.run(['wkhtmltopdf', '--javascript-delay', '1000',
                    '--run-script', 'workbook.hide_nav()', url, out_file])
    return (chapter_name, last_revised)
def download_all_chapters():
    resp = urlopen(index_url).read()
    soup = BeautifulSoup(resp, "html.parser")

    # Load the cached revision dates from the previous run, if any.
    try:
        with open("chapter_versions", 'rb') as versions_file:
            chapter_versions = pickle.load(versions_file)
    except Exception:
        chapter_versions = {}

    links = soup.find_all(href=re.compile(r".*/notes/.*"))
    failed_downloads = []
    for l in links:
        try:
            href = l.attrs['href']
            path = re.search(r'(.*)(/notes.*/notes\.html)', href).groups()[1]
            full_url = base_url + path
            chapter_name = re.search(r'(.*notes/)(\w+)(/notes\.html)', full_url).groups()[1]
            current_version = chapter_versions.get(chapter_name)
            (chapter_name, version) = download_chapter(full_url, current_version)
            chapter_versions[chapter_name] = version
        except Exception:
            failed_downloads.append(l)

    # Persist the revision dates so unchanged chapters are skipped next time.
    with open("chapter_versions", 'wb') as versions_file:
        pickle.dump(chapter_versions, versions_file)
    print("Failed links:", failed_downloads)
if __name__ == '__main__':
    if shutil.which("wkhtmltopdf") is None:
        print("Could not find wkhtmltopdf")
        sys.exit(1)
    download_all_chapters()