Skip to content

Instantly share code, notes, and snippets.

@kalda341
Created March 22, 2019 03:10
Show Gist options
  • Save kalda341/ead91c24d9f277b1f9740080c57bc104 to your computer and use it in GitHub Desktop.
Save kalda341/ead91c24d9f277b1f9740080c57bc104 to your computer and use it in GitHub Desktop.
import re
from pathlib import Path
from urllib.parse import urljoin

import requests
from lxml import html
# Local root directory; mine_site passes it down as the base path under
# which one sub-directory per course is created.
DATA_DIR = Path('data')
# Site root that request paths are appended to.
BASE_URL = 'https://ocw.mit.edu'
def json_get(path, timeout=30):
    """Fetch ``BASE_URL + path`` and return the decoded JSON body.

    Args:
        path: Site-relative path, including its leading slash.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a stalled connection would
            block the whole crawl indefinitely.

    Returns:
        The parsed JSON document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures and timeouts.
        ValueError: If the response body is not valid JSON.
    """
    r = requests.get(BASE_URL + path, timeout=timeout)
    r.raise_for_status()
    return r.json()
def mine_site():
    """Crawl the whole site: fetch the topic index and mine each topic.

    Every entry of ``/courses/find-by-topic/topics.json`` is handed to
    ``mine_topic`` with ``DATA_DIR`` as the output root.
    """
    for entry in json_get('/courses/find-by-topic/topics.json'):
        mine_topic(DATA_DIR, entry)
def mine_topic(base_path, topic):
    """Mine every course listed for a single topic.

    ``topic['file']`` names the JSON course listing under
    ``/courses/find-by-topic/``; each course in that listing is passed to
    ``try_mine_course`` together with ``base_path``.
    """
    listing_path = '/courses/find-by-topic/{}'.format(topic['file'])
    for entry in json_get(listing_path):
        try_mine_course(base_path, entry)
def try_mine_course(base_path, course):
    """Mine one course's lecture notes, reporting failures instead of raising.

    Args:
        base_path: ``Path`` of the directory that holds all course folders.
        course: Course record; its ``'mcn'``, ``'sem'`` and ``'title'``
            entries are joined with ``' - '`` to form the folder name.

    Any exception raised while mining is printed and swallowed so that one
    broken course does not stop the rest of the crawl.
    """
    dir_name = ' - '.join([course['mcn'], course['sem'], course['title']])
    # A path separator inside a title would split the name into nested
    # directories that do not exist, making every download for the course
    # fail; flatten separators into a dash instead.
    dir_name = re.sub(r'[\\/]', '-', dir_name)
    course_path = base_path / dir_name
    try:
        mine_lecture_notes(course_path, course)
    except Exception as e:
        # Deliberate best-effort boundary: report and move on to the next course.
        print('Failed to mine course {}'.format(course_path))
        print(e)
def mine_lecture_notes(course_path, course):
    """Download every PDF linked from a course's lecture-notes page.

    Args:
        course_path: Directory the PDFs are saved into.
        course: Course record; ``course['href']`` is the course's
            site-relative URL.

    Raises:
        requests.HTTPError: If the lecture-notes page returns a 4xx/5xx status.
        requests.RequestException: If the page cannot be fetched at all.

    A request failure on an individual PDF is printed and skipped so the
    remaining links of the course are still downloaded.
    """
    # Strip surrounding slashes so the joined URL never contains '//'.
    page_url = '{}/{}/lecture-notes/'.format(BASE_URL, course['href'].strip('/'))
    r = requests.get(page_url, timeout=30)
    r.raise_for_status()
    tree = html.fromstring(r.content)
    # XPath 1.0 has no ends-with(); compare the trailing substring instead.
    pdf_links = tree.xpath('//a[substring(@href,string-length(@href) -string-length(\'.pdf\') +1) = \'.pdf\']')
    for link in pdf_links:
        href = link.get('href')
        # text_content() also covers links whose label sits in a child
        # element, where link.text would be None.
        filename = _pdf_filename(link.text_content(), href)
        # urljoin resolves root-relative, page-relative and absolute hrefs
        # alike, relative to the page's final (post-redirect) URL.
        pdf_url = urljoin(r.url, href)
        try:
            download_pdf(pdf_url, course_path, filename)
        except requests.RequestException as e:
            print('Failed to download {}'.format(pdf_url))
            print(e)


def _pdf_filename(link_text, href):
    """Return a local file name for a PDF link, built from its label or href."""
    fallback = href.split('/')[-1]
    # Collapse newlines and runs of whitespace so the regex below can match.
    label = ' '.join((link_text or '').split())
    # Links look like Lecture 0: Introduction (PDF - 2.7MB), which is gross.
    # Remove the final brackets.
    match = re.match(r'(.+?)( \(.*\))?$', label)
    if match is None:
        return fallback
    filename = match[1] + '.pdf'
    # Handle generic link names
    if filename.lower().startswith('pdf'):
        return fallback
    # A separator in the label would point into a non-existent directory.
    return re.sub(r'[\\/]', '-', filename)
def download_pdf(url, path, filename, timeout=60):
    """Download ``url`` and save the response body as ``path / filename``.

    Args:
        url: Absolute URL of the file to fetch.
        path: ``Path`` of the destination directory; created if missing.
        filename: Name of the file to write inside ``path``.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures and timeouts.
        OSError: If the directory or file cannot be created.
    """
    r = requests.get(url, allow_redirects=True, timeout=timeout)
    # Fail loudly rather than saving an HTML error page under a .pdf name.
    r.raise_for_status()
    # parents=True: the data root itself may not exist yet on a first run.
    path.mkdir(parents=True, exist_ok=True)
    with open(path / filename, 'wb') as f:
        f.write(r.content)
# Start the full crawl only when run as a script, not when imported.
if __name__ == '__main__':
    mine_site()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment