Skip to content

Instantly share code, notes, and snippets.

@itsamirhn
Last active July 23, 2022 06:57
Show Gist options
  • Save itsamirhn/de9b435c7b597751c8714d50503ba751 to your computer and use it in GitHub Desktop.
Save itsamirhn/de9b435c7b597751c8714d50503ba751 to your computer and use it in GitHub Desktop.
A simple (and messy) crawler that downloads the resources of the MIT Linear Algebra (18.06SC) course into structured folders.
import functools
import shutil
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Base URL of MIT OpenCourseWare; crawled page paths in the tables are relative to it.
HOST = 'https://ocw.mit.edu'
# Deferred download jobs as (function, args, kwargs) tuples, queued by the
# @future decorator and executed at the end of crawl_resources once the
# whole folder tree exists.
future_list = []
def safe_name(name: str) -> str:
    """Return *name* made safe for use as a folder name (':' becomes ' -')."""
    return ' -'.join(name.split(':'))
def get_soup(url: str) -> BeautifulSoup:
    """Fetch *url* over HTTP and return its body parsed with html5lib."""
    page = requests.get(url)
    parsed = BeautifulSoup(page.text, 'html5lib')
    return parsed
def future(f):
    """Decorator that defers calls to *f*.

    Instead of running immediately, a call to the decorated function
    appends ``(f, args, kwargs)`` to the module-level ``future_list``;
    the queued calls are drained later by ``crawl_resources``. The
    wrapper returns ``None``.

    Uses ``functools.wraps`` so the decorated function keeps its
    original ``__name__``/``__doc__`` (the original decorator lost them).
    """
    @functools.wraps(f)
    def g(*args, **kwargs):
        future_list.append((f, args, kwargs))
    return g
def download_file(path: Path, url: str, force=False) -> None:
    """Stream *url* into ``path/<last URL segment>`` with a tqdm progress bar.

    If the target file already exists the download is skipped unless
    *force* is true. Raises ``requests.HTTPError`` on non-2xx responses.
    """
    file_path = path / url.split('/')[-1]
    if not force and file_path.exists():
        print(f"[-] File already exists: {file_path}")
        return
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # Content-Length may be absent (e.g. chunked transfer); the original
        # int(None) raised TypeError. tqdm accepts total=None and shows an
        # indeterminate bar instead.
        length = r.headers.get("Content-Length")
        total_length = int(length) if length is not None else None
        with tqdm.wrapattr(r.raw, "read", total=total_length,
                           unit='iB', unit_scale=True) as raw:
            with open(file_path, 'wb') as output:
                shutil.copyfileobj(raw, output)
@future
def crawl_video(path: Path, url: str) -> None:
    """Find the lecture <video> on HOST+url and download its media file into *path*.

    Deferred via @future: calling this only queues the work.
    """
    print("[+] Downloading video: ", url)
    page = get_soup(HOST + url)
    content = page.find('div', id='main-content')
    link = content.find('video').get('data-downloadlink')
    download_file(path, link)
@future
def crawl_pdf(path: Path, url: str) -> None:
    """Find the download link on HOST+url and fetch the PDF into *path*.

    Deferred via @future: calling this only queues the work.
    """
    print("[+] Downloading pdf: ", url)
    page = get_soup(HOST + url)
    content = page.find('div', id='main-content')
    anchor = content.find('a', class_='download-file')
    download_file(path, HOST + anchor.get('href'))
def crawl_unit_table(path: Path, table: BeautifulSoup) -> None:
    """Walk one unit table and queue its downloads under *path*.

    Each row has six cells: title, lecture video link, summary PDF link,
    suggested-reading text, problem-solving video links, problem-set PDF
    links. One session folder is created per row; videos/PDFs are queued
    via the @future-decorated crawlers, and the suggested-reading text is
    written to a plain .txt file immediately.
    """
    rows = table.find('tbody').find_all('tr')
    for i, row in enumerate(rows):
        cells = row.find_all('td')
        title_text = cells[0].text.strip()
        # A cell may contain no link at all: find() then returns None and
        # the original `.find('a').get('href')` raised AttributeError.
        lecture_a = cells[1].find('a')
        lecture_url = lecture_a.get('href') if lecture_a else None
        summary_a = cells[2].find('a')
        summary_url = summary_a.get('href') if summary_a else None
        reading_suggest_text = cells[3].text.strip()
        # The page literally prints "None" for rows without a suggestion.
        reading_suggest_text = None if reading_suggest_text == 'None' else reading_suggest_text
        problem_solving_urls = [a.get('href') for a in cells[4].find_all('a')]
        problem_sets_urls = [a.get('href') for a in cells[5].find_all('a')]
        # Folder naming: reuse *path* when the row title equals the section
        # title, drop the numeric prefix for single-row sections.
        if path.name == title_text:
            session_path = path
        elif len(rows) == 1:
            session_path = path / safe_name(title_text)
        else:
            session_path = path / safe_name(f"{i + 1}. {title_text}")
        session_path.mkdir(parents=True, exist_ok=True)
        if lecture_url:
            crawl_video(session_path, lecture_url)
        if summary_url:
            crawl_pdf(session_path, summary_url)
        for problem_solving_url in problem_solving_urls:
            crawl_video(session_path, problem_solving_url)
        for problem_sets_url in problem_sets_urls:
            crawl_pdf(session_path, problem_sets_url)
        if reading_suggest_text:
            with open(session_path / 'Suggested Reading.txt', 'w') as f:
                f.write(reading_suggest_text)
def crawl_exam_table(path: Path, table: BeautifulSoup) -> None:
    """Queue PDF downloads for every exam row of *table* under *path*.

    Folder naming mirrors crawl_unit_table: reuse *path* when the row
    title matches it, drop the numeric prefix when there is one row.
    """
    rows = table.find('tbody').find_all('tr')
    only_row = len(rows) == 1
    for idx, row in enumerate(rows, start=1):
        cells = row.find_all('td')
        title = cells[0].text.strip()
        file_urls = [a.get('href') for a in cells[1].find_all('a')]
        if path.name == title:
            exam_path = path
        elif only_row:
            exam_path = path / safe_name(title)
        else:
            exam_path = path / safe_name(f"{idx}. {title}")
        exam_path.mkdir(parents=True, exist_ok=True)
        for file_url in file_urls:
            crawl_pdf(exam_path, file_url)
def crawl_resources(path: Path, url: str) -> None:
    """Crawl the course resource-index page at *url* into folder *path*.

    Pairs each <h2> section heading with its following <table>, builds
    the folder structure (queueing downloads via @future), then drains
    ``future_list`` to perform the actual downloads.

    Raises ValueError when the page layout is not the expected alternation
    of headings and tables.
    """
    soup = get_soup(url)
    main = soup.find('main', id='course-content-section')
    titles = main.find_all('h2', recursive=False)
    tables = main.find_all('table', recursive=False)
    # Validate the scraped layout explicitly: the original `assert` would
    # be silently stripped under `python -O`.
    if len(titles) != len(tables):
        raise ValueError(
            f"Unexpected page layout: {len(titles)} headings vs {len(tables)} tables"
        )
    print("[+] Creating Structures...")
    for title, table in zip(titles, tables):
        title_text = title.text.strip()
        if 'Exam' in title_text:
            crawl_exam_table(path / safe_name(title_text), table)
        else:
            crawl_unit_table(path / safe_name(title_text), table)
    # The full folder tree now exists; run the deferred downloads.
    for f, args, kwargs in future_list:
        f(*args, **kwargs)
if __name__ == '__main__':
    # Entry point: mirror the whole 18.06SC resource index into a local
    # "Linear Algebra" folder next to the script.
    root = Path('Linear Algebra')
    index_url = "https://ocw.mit.edu/courses/18-06sc-linear-algebra-fall-2011/pages/resource-index/"
    crawl_resources(root, index_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment