A simple & messy crawler for downloading the resources of MIT's Linear Algebra course (18.06SC) into structured folders
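It needs Python 3 plus requests, beautifulsoup4, html5lib, and tqdm (exactly the packages imported below); assuming pip is available, a one-line install should be enough:

pip install requests beautifulsoup4 html5lib tqdm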
import shutil
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

HOST = 'https://ocw.mit.edu'

future_list = []
def safe_name(name: str) -> str:
    return name.replace(':', ' -')


def get_soup(url: str) -> BeautifulSoup:
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html5lib')


def future(f):
    def g(*args, **kwargs):
        future_list.append((f, args, kwargs))
    return g
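# Note on the pattern above: @future defers work instead of doing it. A wrapped
# function only records (function, args, kwargs) in future_list when called;
# crawl_resources() replays the queue at the end, so the whole folder structure
# is created before any download starts.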
def download_file(path: Path, url: str, force=False) -> None:
    file_path = path / url.split('/')[-1]
    if not force and file_path.exists():
        print(f"[-] File already exists: {file_path}")
        return
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # If the server omits Content-Length, fall back to None so tqdm shows
        # a running byte count instead of crashing on int(None).
        total_length = int(r.headers.get("Content-Length", 0)) or None
        with tqdm.wrapattr(r.raw, "read", total=total_length, unit='iB', unit_scale=True) as raw:
            with open(file_path, 'wb') as output:
                shutil.copyfileobj(raw, output)
@future
def crawl_video(path: Path, url: str) -> None:
    print("[+] Downloading video: ", url)
    soup = get_soup(HOST + url)
    main = soup.find('div', id='main-content')
    video = main.find('video')
    download_link = video.get('data-downloadlink')
    download_file(path, download_link)


@future
def crawl_pdf(path: Path, url: str) -> None:
    print("[+] Downloading pdf: ", url)
    soup = get_soup(HOST + url)
    main = soup.find('div', id='main-content')
    a = main.find('a', class_='download-file')
    download_link = HOST + a.get('href')
    download_file(path, download_link)
def crawl_unit_table(path: Path, table: BeautifulSoup) -> None:
    rows = table.find('tbody').find_all('tr')
    for i, row in enumerate(rows):
        cells = row.find_all('td')
        title_text = cells[0].text.strip()
        # Some rows have no lecture or summary link, so check for the <a>
        # before reading its href (the guards below already expect None).
        lecture_a = cells[1].find('a')
        lecture_url = lecture_a.get('href') if lecture_a else None
        summary_a = cells[2].find('a')
        summary_url = summary_a.get('href') if summary_a else None
        reading_suggest_text = cells[3].text.strip()
        reading_suggest_text = None if reading_suggest_text == 'None' else reading_suggest_text
        problem_solving_urls = [a.get('href') for a in cells[4].find_all('a')]
        problem_sets_urls = [a.get('href') for a in cells[5].find_all('a')]
        if path.name == title_text:
            session_path = path
        elif len(rows) == 1:
            session_path = path / safe_name(title_text)
        else:
            session_path = path / safe_name(f"{i + 1}. {title_text}")
        session_path.mkdir(parents=True, exist_ok=True)
        if lecture_url:
            crawl_video(session_path, lecture_url)
        if summary_url:
            crawl_pdf(session_path, summary_url)
        for problem_solving_url in problem_solving_urls:
            crawl_video(session_path, problem_solving_url)
        for problem_sets_url in problem_sets_urls:
            crawl_pdf(session_path, problem_sets_url)
        if reading_suggest_text:
            with open(session_path / 'Suggested Reading.txt', 'w') as f:
                f.write(reading_suggest_text)
def crawl_exam_table(path: Path, table: BeautifulSoup) -> None:
    rows = table.find('tbody').find_all('tr')
    for i, row in enumerate(rows):
        cells = row.find_all('td')
        title_text = cells[0].text.strip()
        exam_files_urls = [a.get('href') for a in cells[1].find_all('a')]
        if path.name == title_text:
            exam_path = path
        elif len(rows) == 1:
            exam_path = path / safe_name(title_text)
        else:
            exam_path = path / safe_name(f"{i + 1}. {title_text}")
        exam_path.mkdir(parents=True, exist_ok=True)
        for exam_files_url in exam_files_urls:
            crawl_pdf(exam_path, exam_files_url)
def crawl_resources(path: Path, url: str) -> None:
    soup = get_soup(url)
    main = soup.find('main', id='course-content-section')
    titles = main.find_all('h2', recursive=False)
    tables = main.find_all('table', recursive=False)
    assert len(titles) == len(tables)
    print("[+] Creating Structures...")
    for title, table in zip(titles, tables):
        title_text = title.text.strip()
        if 'Exam' in title_text:
            crawl_exam_table(path / safe_name(title_text), table)
        else:
            crawl_unit_table(path / safe_name(title_text), table)
    # Phase two: run every download queued by the @future decorator.
    for f, args, kwargs in future_list:
        f(*args, **kwargs)
if __name__ == '__main__':
    course_root = 'Linear Algebra'
    resources_url = "https://ocw.mit.edu/courses/18-06sc-linear-algebra-fall-2011/pages/resource-index/"
    crawl_resources(Path(course_root), resources_url)
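To use it, save the file (any name works; crawler.py is just an example), install the dependencies listed above, and run python crawler.py. The script first builds the Linear Algebra/<section>/<session> folder tree under the working directory, then downloads every lecture video, summary PDF, problem-solving video, and problem-set PDF into the matching folder, skipping any file that already exists.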