kalda341 · March 22, 2019 03:10
diff --git a/main.py b/main.py
 import re
 from pathlib import Path
 from lxml import html
 import requests


 # Root directory under which one folder per course is created.
 DATA_DIR = Path('data')
 # Scheme and host prepended to every site-relative path that is requested.
 BASE_URL = 'https://ocw.mit.edu'


 def json_get(path, timeout=30):
    """Fetch ``BASE_URL + path`` and return the decoded JSON body.

    Args:
        path: URL path appended verbatim to ``BASE_URL``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the crawl forever.

    Returns:
        Whatever ``Response.json()`` decodes from the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.get(BASE_URL + path, timeout=timeout)
    r.raise_for_status()
    return r.json()

 def mine_site():
    """Mine every topic listed in the site's topic index.

    Reads the topic list from ``/courses/find-by-topic/topics.json`` and
    hands each entry to ``mine_topic`` with ``DATA_DIR`` as the output root.
    """
    topic_index = json_get('/courses/find-by-topic/topics.json')
    for entry in topic_index:
        mine_topic(DATA_DIR, entry)

 def mine_topic(base_path, topic):
    """Mine every course listed for one topic.

    Args:
        base_path: Directory under which the course folders are placed.
        topic: Topic record; its ``'file'`` value names the course listing
            fetched from ``/courses/find-by-topic/``.
    """
    listing_path = '/courses/find-by-topic/{}'.format(topic['file'])
    for entry in json_get(listing_path):
        try_mine_course(base_path, entry)

 def try_mine_course(base_path, course):
    """Mine one course's lecture notes, reporting failures instead of raising.

    Args:
        base_path: Directory under which the course folder is placed.
        course: Course record providing ``'mcn'``, ``'sem'`` and ``'title'``
            (joined into the folder name); it is passed on to
            ``mine_lecture_notes`` unchanged.
    """
    folder_name = ' - '.join([course['mcn'], course['sem'], course['title']])
    # A path separator in any field would split the name into nested
    # directories that are never created, so keep it a single folder name.
    folder_name = re.sub(r'[\\/]', '_', folder_name)
    course_path = base_path / folder_name
    try:
        mine_lecture_notes(course_path, course)
    except Exception as e:
        # Best-effort crawl: one broken course must not stop the others.
        print('Failed to mine course {}'.format(course_path))
        print(e)

 def mine_lecture_notes(course_path, course, timeout=30):
    """Download every PDF linked from a course's ``lecture-notes/`` page.

    Args:
        course_path: Directory the PDFs are saved into.
        course: Course record; ``course['href']`` is the course's path
            below ``BASE_URL``.
        timeout: Seconds to wait for the lecture-notes page.

    Raises:
        requests.HTTPError: If the lecture-notes page returns an error
            status.
    """
    from urllib.parse import urljoin

    page_url = BASE_URL + '/' + course['href'] + '/' + 'lecture-notes/'
    r = requests.get(page_url, timeout=timeout)
    r.raise_for_status()
    tree = html.fromstring(r.content)

    # XPath 1.0 has no ends-with(); compare the tail of @href with '.pdf'.
    pdf_links = tree.xpath('//a[substring(@href,string-length(@href) -string-length(\'.pdf\') +1) = \'.pdf\']')
    for link in pdf_links:
        href = link.get('href')
        # link.text is None when the anchor starts with a child element and
        # may span several lines; collapse it to one clean line first.
        label = ' '.join((link.text or '').split())
        # Links look like "Lecture 0: Introduction (PDF - 2.7MB)".
        # Remove the final brackets.
        match = re.match(r'(.+?)( \(.*\))?$', label)
        filename = match[1] + '.pdf' if match else ''
        # Handle generic or missing link names
        if not filename or filename.lower().startswith('pdf'):
            filename = href.split('/')[-1]
        # A '/' in the label would otherwise be read as a directory.
        filename = filename.replace('/', '_')
        try:
            # urljoin copes with root-relative, page-relative and absolute
            # hrefs alike.
            download_pdf(urljoin(page_url, href), course_path, filename)
        except (requests.RequestException, OSError) as e:
            # One bad file should not cost the rest of the course.
            print('Failed to download {}'.format(href))
            print(e)

 def download_pdf(url, path, filename, timeout=60):
    """Download ``url`` and save it as ``path / filename``.

    Args:
        url: Address of the file to fetch; redirects are followed.
        path: ``pathlib.Path`` of the destination directory. It is created,
            together with any missing parents, if it does not exist.
        filename: Name of the file to write inside ``path``.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved under a ``.pdf`` name.
    """
    r = requests.get(url, allow_redirects=True, timeout=timeout)
    r.raise_for_status()

    # parents=True: the data root may not exist yet on the first run.
    path.mkdir(parents=True, exist_ok=True)
    with open(path / filename, 'wb') as f:
        f.write(r.content)


 # Start the full crawl only when run as a script, not when imported.
 if __name__ == '__main__':
    mine_site()
	import re
	from pathlib import Path
	from lxml import html
	import requests


	# Root directory under which one folder per course is created.
	DATA_DIR = Path('data')
	# Scheme and host prepended to every site-relative path that is requested.
	BASE_URL = 'https://ocw.mit.edu'


	def json_get(path, timeout=30):
		"""Fetch ``BASE_URL + path`` and return the decoded JSON body.

		Args:
			path: URL path appended verbatim to ``BASE_URL``.
			timeout: Seconds to wait for the server before giving up. Without
				a timeout a stalled connection would block the crawl forever.

		Returns:
			Whatever ``Response.json()`` decodes from the response body.

		Raises:
			requests.HTTPError: If the server answers with an error status.
			requests.Timeout: If the server does not answer within ``timeout``.
		"""
		r = requests.get(BASE_URL + path, timeout=timeout)
		r.raise_for_status()
		return r.json()

	def mine_site():
		"""Mine every topic listed in the site's topic index.

		Reads the topic list from ``/courses/find-by-topic/topics.json`` and
		hands each entry to ``mine_topic`` with ``DATA_DIR`` as the output root.
		"""
		topics = json_get('/courses/find-by-topic/topics.json')
		for topic in topics:
			mine_topic(DATA_DIR, topic)

	def mine_topic(base_path, topic):
		"""Mine every course listed for one topic.

		Args:
			base_path: Directory under which the course folders are placed.
			topic: Topic record; its ``'file'`` value names the course listing
				fetched from ``/courses/find-by-topic/``.
		"""
		courses = json_get('/courses/find-by-topic/{}'.format(topic['file']))
		for course in courses:
			try_mine_course(base_path, course)

	def try_mine_course(base_path, course):
		"""Mine one course's lecture notes, reporting failures instead of raising.

		Args:
			base_path: Directory under which the course folder is placed.
			course: Course record providing ``'mcn'``, ``'sem'`` and ``'title'``
				(joined into the folder name); it is passed on to
				``mine_lecture_notes`` unchanged.
		"""
		folder_name = ' - '.join([course['mcn'], course['sem'], course['title']])
		# A path separator in any field would split the name into nested
		# directories that are never created, so keep it a single folder name.
		folder_name = re.sub(r'[\\/]', '_', folder_name)
		course_path = base_path / folder_name
		try:
			mine_lecture_notes(course_path, course)
		except Exception as e:
			# Best-effort crawl: one broken course must not stop the others.
			print('Failed to mine course {}'.format(course_path))
			print(e)

	def mine_lecture_notes(course_path, course, timeout=30):
		"""Download every PDF linked from a course's ``lecture-notes/`` page.

		Args:
			course_path: Directory the PDFs are saved into.
			course: Course record; ``course['href']`` is the course's path
				below ``BASE_URL``.
			timeout: Seconds to wait for the lecture-notes page.

		Raises:
			requests.HTTPError: If the lecture-notes page returns an error
				status.
		"""
		from urllib.parse import urljoin

		page_url = BASE_URL + '/' + course['href'] + '/' + 'lecture-notes/'
		r = requests.get(page_url, timeout=timeout)
		r.raise_for_status()
		tree = html.fromstring(r.content)

		# XPath 1.0 has no ends-with(); compare the tail of @href with '.pdf'.
		pdf_links = tree.xpath('//a[substring(@href,string-length(@href) -string-length(\'.pdf\') +1) = \'.pdf\']')
		for link in pdf_links:
			href = link.get('href')
			# link.text is None when the anchor starts with a child element and
			# may span several lines; collapse it to one clean line first.
			label = ' '.join((link.text or '').split())
			# Links look like "Lecture 0: Introduction (PDF - 2.7MB)".
			# Remove the final brackets.
			match = re.match(r'(.+?)( \(.*\))?$', label)
			filename = match[1] + '.pdf' if match else ''
			# Handle generic or missing link names
			if not filename or filename.lower().startswith('pdf'):
				filename = href.split('/')[-1]
			# A '/' in the label would otherwise be read as a directory.
			filename = filename.replace('/', '_')
			try:
				# urljoin copes with root-relative, page-relative and absolute
				# hrefs alike.
				download_pdf(urljoin(page_url, href), course_path, filename)
			except (requests.RequestException, OSError) as e:
				# One bad file should not cost the rest of the course.
				print('Failed to download {}'.format(href))
				print(e)

	def download_pdf(url, path, filename, timeout=60):
		"""Download ``url`` and save it as ``path / filename``.

		Args:
			url: Address of the file to fetch; redirects are followed.
			path: ``pathlib.Path`` of the destination directory. It is created,
				together with any missing parents, if it does not exist.
			filename: Name of the file to write inside ``path``.
			timeout: Seconds to wait for the server before giving up.

		Raises:
			requests.HTTPError: If the server answers with an error status, so
				an error page is never saved under a ``.pdf`` name.
		"""
		r = requests.get(url, allow_redirects=True, timeout=timeout)
		r.raise_for_status()

		# parents=True: the data root may not exist yet on the first run.
		path.mkdir(parents=True, exist_ok=True)
		with open(path / filename, 'wb') as f:
			f.write(r.content)


	# Start the full crawl only when run as a script, not when imported.
	if __name__ == '__main__':
		mine_site()