from bs4 import BeautifulSoup
from urllib.request import urlopen

import subprocess
import re
import os
import sys
import pickle
import shutil
""" | |
NOTE: | |
- Use shlex to split: | |
wkhtmltopdf --javascript-delay 1000 --run-script "workbook.hide_nav()" http://computationstructures.org/notes/isas/notes.html isas.pdf | |
- 1 sec delay is for MathJax | |
""" | |
index_url = "http://computationstructures.org/notes/top_level/notes.html"
base_url = "http://computationstructures.org"
def download_chapter(url, current_version):
    # Fetch the page once, only to extract its last-revised timestamp.
    resp = urlopen(url).read().decode("utf-8", errors="replace")
    last_revised = re.search(r'(Last revised )(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', resp).groups()[1]
    chapter_name = re.search(r'(.*notes/)(\w+)(/notes\.html)', url).groups()[1]
    out_file = chapter_name + ".pdf"

    # Skip the download if we already have a PDF for this revision.
    if (last_revised == current_version) and os.path.isfile(out_file):
        print("Skipping", chapter_name, "Last revised=", last_revised)
        return (chapter_name, last_revised)

    print("Downloading", chapter_name, "(Last revised:", last_revised, ")")
    subprocess.run(['wkhtmltopdf', '--javascript-delay', '1000',
                    '--run-script', 'workbook.hide_nav()', url, out_file])
    return (chapter_name, last_revised)
def download_all_chapters():
    resp = urlopen(index_url).read()
    soup = BeautifulSoup(resp, "html.parser")

    # Load the cached revision dates from the previous run, if any.
    try:
        with open("chapter_versions", 'rb') as versions_file:
            chapter_versions = pickle.load(versions_file)
    except Exception:
        chapter_versions = {}

    links = soup.find_all(href=re.compile(r".*/notes/.*"))
    failed_downloads = []
    for l in links:
        try:
            href = l.attrs['href']
            path = re.search(r'(.*)(/notes.*/notes\.html)', href).groups()[1]
            full_url = base_url + path
            chapter_name = re.search(r'(.*notes/)(\w+)(/notes\.html)', full_url).groups()[1]
            current_version = chapter_versions.get(chapter_name)
            (chapter_name, version) = download_chapter(full_url, current_version)
            chapter_versions[chapter_name] = version
        except Exception:
            failed_downloads.append(l)

    # Persist the revision dates so unchanged chapters are skipped next time.
    with open("chapter_versions", 'wb') as versions_file:
        pickle.dump(chapter_versions, versions_file)
    print("Failed links:", failed_downloads)
if __name__ == '__main__':
    if shutil.which("wkhtmltopdf") is None:
        print("Could not find wkhtmltopdf")
        sys.exit(1)
    download_all_chapters()