A simple & messy crawler for downloading the resources of MIT's Linear Algebra course (18.06SC) into structured folders
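It needs Python 3 plus requests, beautifulsoup4, html5lib, and tqdm (exactly the packages imported below); assuming pip is available, a one-line install should be enough:

pip install requests beautifulsoup4 html5lib tqdm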
import shutil
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

HOST = 'https://ocw.mit.edu'

future_list = []
def safe_name(name: str) -> str:
    return name.replace(':', ' -')


def get_soup(url: str) -> BeautifulSoup:
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html5lib')


def future(f):
    def g(*args, **kwargs):
        future_list.append((f, args, kwargs))
    return g
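# Note on the pattern above: @future defers work instead of doing it. A wrapped
# function only records (function, args, kwargs) in future_list when called;
# crawl_resources() replays the queue at the end, so the whole folder structure
# is created before any download starts.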
def download_file(path: Path, url: str, force=False) -> None:
    file_path = path / url.split('/')[-1]
    if not force and file_path.exists():
        print(f"[-] File already exists: {file_path}")
        return
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # If the server omits Content-Length, fall back to None so tqdm shows
        # a running byte count instead of crashing on int(None).
        total_length = int(r.headers.get("Content-Length", 0)) or None
        with tqdm.wrapattr(r.raw, "read", total=total_length, unit='iB', unit_scale=True) as raw:
            with open(file_path, 'wb') as output:
                shutil.copyfileobj(raw, output)
@future
def crawl_video(path: Path, url: str) -> None:
    print("[+] Downloading video: ", url)
    soup = get_soup(HOST + url)
    main = soup.find('div', id='main-content')
    video = main.find('video')
    download_link = video.get('data-downloadlink')
    download_file(path, download_link)


@future
def crawl_pdf(path: Path, url: str) -> None:
    print("[+] Downloading pdf: ", url)
    soup = get_soup(HOST + url)
    main = soup.find('div', id='main-content')
    a = main.find('a', class_='download-file')
    download_link = HOST + a.get('href')
    download_file(path, download_link)
def crawl_unit_table(path: Path, table: BeautifulSoup) -> None:
    rows = table.find('tbody').find_all('tr')
    for i, row in enumerate(rows):
        cells = row.find_all('td')
        title_text = cells[0].text.strip()
        # Some rows have no lecture or summary link, so check for the <a>
        # before reading its href (the guards below already expect None).
        lecture_a = cells[1].find('a')
        lecture_url = lecture_a.get('href') if lecture_a else None
        summary_a = cells[2].find('a')
        summary_url = summary_a.get('href') if summary_a else None
        reading_suggest_text = cells[3].text.strip()
        reading_suggest_text = None if reading_suggest_text == 'None' else reading_suggest_text
        problem_solving_urls = [a.get('href') for a in cells[4].find_all('a')]
        problem_sets_urls = [a.get('href') for a in cells[5].find_all('a')]
        if path.name == title_text:
            session_path = path
        elif len(rows) == 1:
            session_path = path / safe_name(title_text)
        else:
            session_path = path / safe_name(f"{i + 1}. {title_text}")
        session_path.mkdir(parents=True, exist_ok=True)
        if lecture_url:
            crawl_video(session_path, lecture_url)
        if summary_url:
            crawl_pdf(session_path, summary_url)
        for problem_solving_url in problem_solving_urls:
            crawl_video(session_path, problem_solving_url)
        for problem_sets_url in problem_sets_urls:
            crawl_pdf(session_path, problem_sets_url)
        if reading_suggest_text:
            with open(session_path / 'Suggested Reading.txt', 'w') as f:
                f.write(reading_suggest_text)
def crawl_exam_table(path: Path, table: BeautifulSoup) -> None:
    rows = table.find('tbody').find_all('tr')
    for i, row in enumerate(rows):
        cells = row.find_all('td')
        title_text = cells[0].text.strip()
        exam_files_urls = [a.get('href') for a in cells[1].find_all('a')]
        if path.name == title_text:
            exam_path = path
        elif len(rows) == 1:
            exam_path = path / safe_name(title_text)
        else:
            exam_path = path / safe_name(f"{i + 1}. {title_text}")
        exam_path.mkdir(parents=True, exist_ok=True)
        for exam_files_url in exam_files_urls:
            crawl_pdf(exam_path, exam_files_url)
def crawl_resources(path: Path, url: str) -> None:
    soup = get_soup(url)
    main = soup.find('main', id='course-content-section')
    titles = main.find_all('h2', recursive=False)
    tables = main.find_all('table', recursive=False)
    assert len(titles) == len(tables)
    print("[+] Creating Structures...")
    for title, table in zip(titles, tables):
        title_text = title.text.strip()
        if 'Exam' in title_text:
            crawl_exam_table(path / safe_name(title_text), table)
        else:
            crawl_unit_table(path / safe_name(title_text), table)
    # Phase two: run every download queued by the @future decorator.
    for f, args, kwargs in future_list:
        f(*args, **kwargs)
if __name__ == '__main__':
    course_root = 'Linear Algebra'
    resources_url = "https://ocw.mit.edu/courses/18-06sc-linear-algebra-fall-2011/pages/resource-index/"
    crawl_resources(Path(course_root), resources_url)
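To use it, save the file (any name works; crawler.py is just an example), install the dependencies listed above, and run python crawler.py. The script first builds the Linear Algebra/<section>/<session> folder tree under the working directory, then downloads every lecture video, summary PDF, problem-solving video, and problem-set PDF into the matching folder, skipping any file that already exists.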