Created
April 13, 2019 12:55
-
-
Save Koenvh1/80a5181c62d1d7c156c835263df58c12 to your computer and use it in GitHub Desktop.
Scrape your Blackboard website and download all content to a folder. Tested with Blackboard Learn 3100.0.6.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import os.path | |
import shutil | |
from urllib.parse import unquote | |
import pathvalidate | |
import requests_html | |
class Scraper:
    """Crawl a Blackboard site and mirror a course's content pages to disk.

    Constructing the object posts the credentials to the login form using a
    ``requests_html`` session. :meth:`get_course` then walks a course's
    ``listContent.jsp`` pages, stores each page as ``index.html`` and
    downloads every ``bbcswebdav`` link found on it.
    """

    def __init__(self, base_url: str, username: str, password: str, output_folder: str):
        """Store the settings and post the login form.

        The login response is not inspected, so a rejected login is not
        reported here.

        Args:
            base_url: Root URL of the Blackboard site; trailing slashes are dropped.
            username: Value posted as ``user_id``.
            password: Value posted as ``password``.
            output_folder: Folder that receives one sub-folder per course.
        """
        self.BASE_URL = base_url.rstrip("/")
        self.USERNAME = username
        self.PASSWORD = password
        # Normalised to end in exactly one "/" because paths are built by concatenation.
        self.OUTPUT_FOLDER = output_folder.rstrip("/") + "/"
        self.s = requests_html.HTMLSession()
        self.s.post(self.BASE_URL + "/webapps/login/", {
            "user_id": self.USERNAME,
            "password": self.PASSWORD,
            "login": "Login",
            "action": "login",
            "new_loc": "",
        })

    def get_course(self, course_id):
        """Save every content page of one course together with its linked files.

        Starts at the course's ``listContent.jsp`` page and follows every link
        containing ``listContent.jsp?course_id=<course_id>``. Each visited page
        is written to ``<output>/<course_id>/<page title>/index.html``; every
        link containing ``bbcswebdav`` is downloaded into that same folder
        (once per course, into the folder of the first page that links it).

        Args:
            course_id: Course identifier as it appears in the page URL,
                e.g. ``_xxxxx_1``.
        """
        path = self.OUTPUT_FOLDER + course_id + "/"
        downloaded_files = set()
        to_visit_links = {self.BASE_URL + "/webapps/blackboard/content/listContent.jsp?course_id=" + course_id}
        visited_links = set()

        while to_visit_links:
            new_link = to_visit_links.pop()
            visited_links.add(new_link)
            new_link = self.expand_link(new_link)
            print(f"GET {new_link} (to visit {len(to_visit_links)}, visited {len(visited_links)})")
            page = self.s.get(new_link)
            links = page.html.links

            # A page without a <title> element must not abort the whole crawl;
            # it is treated as having an empty title.
            title_element = page.html.find("title", first=True)
            title = title_element.text if title_element is not None else ""
            title = title.split("–")[0]  # Not a -, but a – (different character)
            local_path = path + pathvalidate.sanitize_filename(title) + "/"
            os.makedirs(local_path, exist_ok=True)

            with open(local_path + "index.html", "w", encoding="utf-8") as f:
                f.write(page.text)

            for link in links:
                expanded_link = self.expand_link(link)
                if expanded_link in visited_links:
                    continue
                if "listContent.jsp?course_id=" + course_id in link:
                    to_visit_links.add(expanded_link)
                # De-duplicate on the expanded URL so the relative and the
                # absolute form of the same file are downloaded only once.
                if "bbcswebdav" in link and expanded_link not in downloaded_files:
                    print("DOWNLOAD " + link)
                    self.download_file(link, local_path)
                    downloaded_files.add(expanded_link)
        print("Done.")

    def download_file(self, link, path):
        """Download *link* into the folder *path*.

        The file name is taken from the final URL after redirects. Responses
        with an HTTP error status are skipped (and reported on stdout) rather
        than being saved as if they were the requested file.

        Args:
            link: Absolute URL, or a site-relative path starting with ``/``.
            path: Destination folder; created when it does not exist.
        """
        os.makedirs(path, exist_ok=True)
        # The context manager releases the streamed connection back to the pool.
        with self.s.get(self.expand_link(link), allow_redirects=True, stream=True) as r:
            if not r.ok:
                print(f"SKIP {link} (HTTP {r.status_code})")
                return
            # The raw stream is not content-decoded by default; without this a
            # gzip/deflate encoded response would be written still compressed.
            r.raw.decode_content = True
            local_filename = self._filename_from_url(r.url)
            with open(os.path.join(path, local_filename), "wb") as f:
                shutil.copyfileobj(r.raw, f)

    @staticmethod
    def _filename_from_url(url):
        """Derive a single, safe file name from a download URL.

        The query string and fragment are discarded, the last path segment is
        percent-decoded, and any path separators that decoding reintroduces
        (``%2F``, ``%5C``) are stripped so the result can never point outside
        the target folder. Falls back to ``"download"`` when no usable name
        remains.
        """
        url_path = url.split("#", 1)[0].split("?", 1)[0]
        name = unquote(url_path.rstrip("/").split("/")[-1])
        name = name.replace("\\", "/").split("/")[-1]
        if name in ("", ".", ".."):
            return "download"
        return name

    def expand_link(self, link):
        """Return *link* as an absolute URL.

        Links starting with ``/`` are prefixed with the base URL; any other
        link is returned unchanged.
        """
        if link.startswith("/"):
            return self.BASE_URL + link
        return link
if __name__ == "__main__":
    # Command-line entry point: log in once, then mirror each requested course.
    cli = argparse.ArgumentParser(description="Download all content from Blackboard")

    # Single-valued positional arguments, in the order they must be given.
    for arg_name, arg_help in (
        ("url", "URL to the Blackboard website, e.g. https://blackboard.utwente.nl"),
        ("username", "Username used to log in to Blackboard"),
        ("password", "Password used to log in to Blackboard"),
        ("output", "Path to the output folder, e.g. output/"),
    ):
        cli.add_argument(arg_name, help=arg_help)

    # One or more course IDs may follow the fixed arguments.
    cli.add_argument(
        "course_id",
        nargs="+",
        help="ID for the course to parse, can be found in the page URL, and looks like _xxxxx_1",
    )

    options = cli.parse_args()
    scraper = Scraper(options.url, options.username, options.password, options.output)
    for course in options.course_id:
        scraper.get_course(course)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment