@MichalPt
Created June 9, 2025 09:59
Script for scraping course data from supervisor's website (mancal.cz)
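Usage (the script filename below is hypothetical; substitute whatever name the file is saved under): running `python scrape_course.py https://www.mancal.cz/teaching/<course-page>` writes `<course-page>.json` to the current directory and downloads any linked PDFs into a `./<course-page>/` folder.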
import requests
from bs4 import BeautifulSoup
import json, re, os, argparse
from urllib.parse import urlparse

BASE_URL = "https://www.mancal.cz/"


def get_soup(url):
    resp = requests.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")

def download_file(url, folder_path, filename):
    """Download a file to the specified folder."""
    os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, filename)
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filename}")
        return True
    except Exception as e:
        print(f"Failed to download {filename}: {e}")
        return False

def crawl(target_url):
    soup = get_soup(target_url)
    parsed_url = urlparse(target_url)
    course_name = parsed_url.path.rstrip('/').split("/")[-1]

    course_data = {
        "courseName": course_name,
        "courseCode": "",
        "courseDescription": "",
        "language": "English",
        "semester": "winter",
        "courseVisible": True,
        "showDates": False,
        "content": []
    }
    # Find all sections (h2 tags)
    for section in soup.find_all("h2")[1:]:  # Skip the first h2, which is usually the page title
        section_title = section.get_text(strip=True)
        section_content = []

        # Locate the parent container of the section
        section_container = section.find_parent("div").find_next_sibling("div")

        while section_container and not section_container.find("h2"):
            # Find all subsections (h3 tags) within the section container
            for subsection in section_container.find_all("h3"):
                subsection_title = subsection.get_text(strip=True)
                subsection_content = []

                # Locate the list of video links (ol or ul) within the subsection
                link_list = subsection.find_next("ol") or subsection.find_next("ul")
                if not link_list:
                    continue

                # Extract video links from the list
                for link_item in link_list.find_all("a", href=True):
                    if "teaching/" in link_item["href"]:
                        video_title = link_item.get_text(strip=True)
                        link_url = link_item["href"]
                        if not link_url.startswith("http"):
                            link_url = BASE_URL + link_url

                        # Fetch the video page and extract the YouTube link from the iframe
                        sub_soup = get_soup(link_url)
                        yt_iframe = sub_soup.find("iframe", src=True)
                        if not yt_iframe:
                            continue

                        yt_url = None
                        iframe_src = yt_iframe["src"]
                        # Convert embed URL to standard YouTube watch URL
                        if "youtube.com/embed/" in iframe_src:
                            video_id = iframe_src.split("/")[-1].split("?")[0]
                            yt_url = f"https://www.youtube.com/watch?v={video_id}"
                        elif "youtu.be/" in iframe_src:
                            video_id = iframe_src.split("/")[-1]
                            yt_url = f"https://www.youtube.com/watch?v={video_id}"

                        subsection_content.append({
                            "title": video_title,
                            "videoUrl": yt_url,
                            "pdfName": "",
                            "visible": True,
                            "label": "",
                            "tags": [],
                            "description": "",
                        })
                    elif "fileadmin/" in link_item["href"]:
                        # Handle PDF links - download the file
                        pdf_url = link_item["href"]
                        if not pdf_url.startswith("http"):
                            pdf_url = BASE_URL + pdf_url
                        pdf_filename = link_item["href"].split("/")[-1]

                        # Download the file to the course folder and attach it to the
                        # most recently added video entry (if there is one)
                        if download_file(pdf_url, "./{}".format(course_name), pdf_filename) and subsection_content:
                            subsection_content[-1]["pdfName"] = pdf_filename

                if subsection_content:
                    section_content.append({
                        "subsectionTitle": subsection_title,
                        "subsectionVisible": True,
                        "subsectionContent": subsection_content
                    })

            # Move on to the next sibling container of this section
            section_container = section_container.find_next_sibling("div")
        if section_content:
            course_data["content"].append({
                "sectionTitle": section_title,
                "sectionVisible": True,
                "sectionContent": section_content
            })

    return course_data

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Crawl a webpage for YouTube videos.")
    parser.add_argument("target_url", help="The URL of the target webpage to crawl.")
    args = parser.parse_args()

    data = crawl(args.target_url)

    filename = "{}.json".format(data["courseName"].replace(" ", "-"))
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print("Data saved to {}".format(filename))