Script for scraping course data from the supervisor's website (mancal.cz)
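Usage (a minimal sketch; assumes the script is saved as scrape_course.py, and the course URL is illustrative):

python scrape_course.py https://www.mancal.cz/teaching/example-course/

The script writes example-course.json to the working directory and downloads any linked PDFs into ./example-course/.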
import requests
from bs4 import BeautifulSoup
import json, os, argparse
from urllib.parse import urlparse, urljoin

BASE_URL = "https://www.mancal.cz/"
def get_soup(url):
    resp = requests.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
def download_file(url, folder_path, filename):
    """Download a file to the specified folder."""
    os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, filename)
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filename}")
        return True
    except Exception as e:
        print(f"Failed to download {filename}: {e}")
        return False
def crawl(target_url):
    soup = get_soup(target_url)
    parsed_url = urlparse(target_url)
    course_name = parsed_url.path.rstrip('/').split("/")[-1]
    course_data = {
        "courseName": course_name,
        "courseCode": "",
        "courseDescription": "",
        "language": "English",
        "semester": "winter",
        "courseVisible": True,
        "showDates": False,
        "content": []
    }
    # Find all sections (h2 tags)
    for section in soup.find_all("h2")[1:]:  # Skip the first h2, which is usually the page title
        section_title = section.get_text(strip=True)
        section_content = []
        # Locate the parent container of the section
        section_container = section.find_parent("div").find_next_sibling("div")
        while section_container and not section_container.find("h2"):
            # Find all subsections (h3 tags) within the section container
            for subsection in section_container.find_all("h3"):
                subsection_title = subsection.get_text(strip=True)
                subsection_content = []
                # Locate the list of video links (ol or ul) within the subsection
                link_list = subsection.find_next("ol") or subsection.find_next("ul")
                if not link_list:
                    continue
                # Extract video links from the list
                for link_item in link_list.find_all("a", href=True):
                    if "teaching/" in link_item["href"]:
                        video_title = link_item.get_text(strip=True)
                        # urljoin handles both relative and absolute hrefs
                        link_url = urljoin(BASE_URL, link_item["href"])
                        # Fetch the video page and extract the YouTube link from the iframe
                        sub_soup = get_soup(link_url)
                        yt_iframe = sub_soup.find("iframe", src=True)
                        if not yt_iframe:
                            continue
                        iframe_src = yt_iframe["src"]
                        yt_url = None
                        # Convert embed URL to standard YouTube watch URL
                        if "youtube.com/embed/" in iframe_src:
                            video_id = iframe_src.split("/")[-1].split("?")[0]
                            yt_url = f"https://www.youtube.com/watch?v={video_id}"
                        elif "youtu.be/" in iframe_src:
                            video_id = iframe_src.split("/")[-1]
                            yt_url = f"https://www.youtube.com/watch?v={video_id}"
                        subsection_content.append({
                            "title": video_title,
                            "videoUrl": yt_url,
                            "pdfName": "",
                            "visible": True,
                            "label": "",
                            "tags": [],
                            "description": "",
                        })
elif "fileadmin/" in link_item["href"]: | |
# Handle PDF links - download the file | |
pdf_url = link_item["href"] | |
if not pdf_url.startswith("http"): | |
pdf_url = BASE_URL + pdf_url | |
pdf_filename = link_item["href"].split("/")[-1] | |
# Download the file to the course folder | |
if download_file(pdf_url, "./{}".format(course_name), pdf_filename): | |
subsection_content[-1]["pdfName"] = pdf_filename | |
                if subsection_content:
                    section_content.append({
                        "subsectionTitle": subsection_title,
                        "subsectionVisible": True,
                        "subsectionContent": subsection_content
                    })
            section_container = section_container.find_next_sibling("div")
        if section_content:
            course_data["content"].append({
                "sectionTitle": section_title,
                "sectionVisible": True,
                "sectionContent": section_content
            })
    return course_data
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Crawl a webpage for YT videos.")
    parser.add_argument("target_url", help="The URL of the target webpage to crawl.")
    args = parser.parse_args()
    data = crawl(args.target_url)
    # Apply replace() to the course name value, not to the literal key string
    filename = "{}.json".format(data["courseName"].replace(" ", "-"))
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print("Data saved to {}".format(filename))