Script for scraping course data from the supervisor's website (mancal.cz)
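Usage (a minimal sketch; assumes the script is saved as scrape_course.py, and the course URL is illustrative):

python scrape_course.py https://www.mancal.cz/teaching/example-course/

The script writes example-course.json to the working directory and downloads any linked PDFs into ./example-course/.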
import requests
from bs4 import BeautifulSoup
import json, os, argparse
from urllib.parse import urlparse, urljoin

BASE_URL = "https://www.mancal.cz/"
def get_soup(url):
    resp = requests.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
def download_file(url, folder_path, filename):
    """Download a file to the specified folder."""
    os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, filename)
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filename}")
        return True
    except Exception as e:
        print(f"Failed to download {filename}: {e}")
        return False
def crawl(target_url):
    soup = get_soup(target_url)
    parsed_url = urlparse(target_url)
    course_name = parsed_url.path.rstrip('/').split("/")[-1]
    course_data = {
        "courseName": course_name,
        "courseCode": "",
        "courseDescription": "",
        "language": "English",
        "semester": "winter",
        "courseVisible": True,
        "showDates": False,
        "content": []
    }
    # Find all sections (h2 tags)
    for section in soup.find_all("h2")[1:]:  # Skip the first h2, which is usually the page title
        section_title = section.get_text(strip=True)
        section_content = []
        # Locate the parent container of the section
        section_container = section.find_parent("div").find_next_sibling("div")
        while section_container and not section_container.find("h2"):
            # Find all subsections (h3 tags) within the section container
            for subsection in section_container.find_all("h3"):
                subsection_title = subsection.get_text(strip=True)
                subsection_content = []
                # Locate the list of video links (ol or ul) within the subsection
                link_list = subsection.find_next("ol") or subsection.find_next("ul")
                if not link_list:
                    continue
                # Extract video links from the list
                for link_item in link_list.find_all("a", href=True):
                    if "teaching/" in link_item["href"]:
                        video_title = link_item.get_text(strip=True)
                        # urljoin handles both relative and absolute hrefs
                        link_url = urljoin(BASE_URL, link_item["href"])
                        # Fetch the video page and extract the YouTube link from the iframe
                        sub_soup = get_soup(link_url)
                        yt_iframe = sub_soup.find("iframe", src=True)
                        if not yt_iframe:
                            continue
                        iframe_src = yt_iframe["src"]
                        yt_url = None
                        # Convert embed URL to standard YouTube watch URL
                        if "youtube.com/embed/" in iframe_src:
                            video_id = iframe_src.split("/")[-1].split("?")[0]
                            yt_url = f"https://www.youtube.com/watch?v={video_id}"
                        elif "youtu.be/" in iframe_src:
                            video_id = iframe_src.split("/")[-1]
                            yt_url = f"https://www.youtube.com/watch?v={video_id}"
                        subsection_content.append({
                            "title": video_title,
                            "videoUrl": yt_url,
                            "pdfName": "",
                            "visible": True,
                            "label": "",
                            "tags": [],
                            "description": "",
                        })
elif "fileadmin/" in link_item["href"]: | |
# Handle PDF links - download the file | |
pdf_url = link_item["href"] | |
if not pdf_url.startswith("http"): | |
pdf_url = BASE_URL + pdf_url | |
pdf_filename = link_item["href"].split("/")[-1] | |
# Download the file to the course folder | |
if download_file(pdf_url, "./{}".format(course_name), pdf_filename): | |
subsection_content[-1]["pdfName"] = pdf_filename | |
                if subsection_content:
                    section_content.append({
                        "subsectionTitle": subsection_title,
                        "subsectionVisible": True,
                        "subsectionContent": subsection_content
                    })
            section_container = section_container.find_next_sibling("div")
        if section_content:
            course_data["content"].append({
                "sectionTitle": section_title,
                "sectionVisible": True,
                "sectionContent": section_content
            })
    return course_data
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Crawl a webpage for YT videos.")
    parser.add_argument("target_url", help="The URL of the target webpage to crawl.")
    args = parser.parse_args()
    data = crawl(args.target_url)
    # Apply replace() to the course name value, not to the literal key string
    filename = "{}.json".format(data["courseName"].replace(" ", "-"))
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print("Data saved to {}".format(filename))