Created July 7, 2025 07:36
Web crawler for transferring content from mancal.cz to JSON
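The gist contains two scripts. The first crawls the outreach page and saves every linked item (YouTube videos and other links, with dates where available) to outreach.json. The second crawls a course page, follows each lecture link to extract the YouTube URL from the embedded iframe, downloads any linked PDFs from fileadmin/, and writes everything to a per-course JSON file. A minimal invocation sketch, assuming the scripts are saved as outreach_crawler.py and course_crawler.py (these file names and URLs are placeholders, not part of the gist):

    python outreach_crawler.py https://www.mancal.cz/outreach/
    python course_crawler.py https://www.mancal.cz/teaching/example-course/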
import requests
from bs4 import BeautifulSoup
import json, re, os, argparse
from urllib.parse import urlparse
from datetime import datetime

BASE_URL = "https://www.mancal.cz/"


def get_soup(url):
    resp = requests.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


def parse_date(date_text):
    """Parse a date from the various formats found in the HTML."""
    if not date_text:
        return ""

    # Remove extra whitespace
    date_text = date_text.strip()

    # Look for date patterns like "17.11. 2021", "03.01. 2022", etc.
    date_match = re.search(r'(\d+\.\s*\d+\.\s*\d+)', date_text)
    if date_match:
        parsed = date_match.group(1).replace(" ", "").split(".")
        if len(parsed) == 3:
            return '-'.join(parsed[::-1])  # Convert to YYYY-MM-DD format

    # If no date was found, return an empty string
    return ""


def crawl_outreach(target_url):
    soup = get_soup(target_url)
    data = []

    # Find all main sections by looking for h2 tags
    h2_tags = soup.find_all("h2")

    for h2_tag in h2_tags:
        section_title = h2_tag.get_text(strip=True)
        if not section_title:
            continue

        section_content = []

        # Find ALL divs that contain ul.list after this h2 (until the next h2)
        current_element = h2_tag.parent
        all_list_containers = []

        while current_element:
            current_element = current_element.find_next_sibling()
            if not current_element:
                break
            # Stop if we hit another h2 (next section)
            if current_element.find("h2"):
                break
            # Collect ALL ul.list elements from this div
            list_containers = current_element.find_all("ul", class_="list")
            all_list_containers.extend(list_containers)

        if not all_list_containers:
            continue

        # Process each list container
        for list_container in all_list_containers:
            # Find all li elements that contain h3 tags (subsections)
            subsection_items = list_container.find_all("li", recursive=False)

            for item in subsection_items:
                h3_tag = item.find("h3")
                if not h3_tag:
                    continue

                subsection_title = h3_tag.get_text(strip=True)
                subsection_content = []

                # Find the ul.list-unstyled that follows this li
                unstyled_list = item.find_next_sibling("ul", class_="list-unstyled")
                if not unstyled_list:
                    continue

                # Extract links from the unstyled list
                for link_item in unstyled_list.find_all("li"):
                    # Skip empty list items
                    item_text = link_item.get_text(strip=True)
                    if not item_text:
                        continue

                    # Find links
                    link = link_item.find("a", href=True)
                    if link:
                        link_url = link["href"]
                        video_title = link.get_text(strip=True)
                    else:
                        link_url = ""
                        video_title = item_text

                    # Extract the date if present
                    date_element = link_item.find("time")
                    if date_element:
                        date_text = parse_date(date_element.get_text())
                    else:
                        date_text = ""

                    # If there is no time element, look for a date in the text
                    if not date_text:
                        date_text = parse_date(item_text)

                    if "youtube.com" in link_url or "youtu.be" in link_url:
                        link_type = "video"
                    else:
                        link_type = "other"

                    subsection_content.append({
                        "title": video_title,
                        "linkUrl": link_url,
                        "date": date_text,
                        "type": link_type,
                        "visible": True,
                        "language": "Czech",
                        "label": "",
                        "tags": [],
                        "description": "",
                    })

                if subsection_content:
                    section_content.append({
                        "subsectionTitle": subsection_title,
                        "subsectionVisible": True,
                        "subsectionContent": subsection_content
                    })

        if section_content:
            data.append({
                "sectionTitle": section_title,
                "sectionVisible": True,
                "sectionContent": section_content
            })

    return data


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Crawl the outreach webpage for YouTube videos.")
    parser.add_argument("target_url", help="The URL of the target webpage to crawl.")
    args = parser.parse_args()

    data = crawl_outreach(args.target_url)

    filename = "outreach.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print("Data saved to {}".format(filename))
import requests
from bs4 import BeautifulSoup
import json, re, os, argparse
from urllib.parse import urlparse

BASE_URL = "https://www.mancal.cz/"


def get_soup(url):
    resp = requests.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


def download_file(url, folder_path, filename):
    """Download a file to the specified folder."""
    os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, filename)
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filename}")
        return True
    except Exception as e:
        print(f"Failed to download {filename}: {e}")
        return False


def crawl(target_url):
    soup = get_soup(target_url)

    parsed_url = urlparse(target_url)
    course_name = parsed_url.path.rstrip('/').split("/")[-1]

    course_data = {
        "courseName": course_name,
        "courseCode": "",
        "courseDescription": "",
        "language": "English",
        "semester": "winter",
        "courseVisible": True,
        "showDates": False,
        "content": []
    }

    # Find all sections (h2 tags)
    for section in soup.find_all("h2")[1:]:  # Skip the first h2, which is usually the page title
        section_title = section.get_text(strip=True)
        section_content = []

        # Locate the parent container of the section
        section_container = section.find_parent("div").find_next_sibling("div")

        while section_container and not section_container.find("h2"):
            # Find all subsections (h3 tags) within the section container
            for subsection in section_container.find_all("h3"):
                subsection_title = subsection.get_text(strip=True)
                subsection_content = []

                # Locate the list of video links (ol or ul) within the subsection
                link_list = subsection.find_next("ol") or subsection.find_next("ul")
                if not link_list:
                    continue

                # Extract video links from the list
                for link_item in link_list.find_all("a", href=True):
                    if "teaching/" in link_item["href"]:
                        video_title = link_item.get_text(strip=True)
                        link_url = link_item["href"]
                        if not link_url.startswith("http"):
                            link_url = BASE_URL + link_url

                        # Fetch the video page and extract the YouTube link from the iframe
                        sub_soup = get_soup(link_url)
                        yt_iframe = sub_soup.find("iframe", src=True)
                        if not yt_iframe:
                            continue

                        yt_url = None
                        iframe_src = yt_iframe["src"]

                        # Convert the embed URL to a standard YouTube watch URL
                        if "youtube.com/embed/" in iframe_src:
                            video_id = iframe_src.split("/")[-1].split("?")[0]
                            yt_url = f"https://www.youtube.com/watch?v={video_id}"
                        elif "youtu.be/" in iframe_src:
                            video_id = iframe_src.split("/")[-1]
                            yt_url = f"https://www.youtube.com/watch?v={video_id}"

                        subsection_content.append({
                            "title": video_title,
                            "videoUrl": yt_url,
                            "pdfName": "",
                            "visible": True,
                            "label": "",
                            "tags": [],
                            "description": "",
                        })
                    elif "fileadmin/" in link_item["href"]:
                        # Handle PDF links - download the file
                        pdf_url = link_item["href"]
                        if not pdf_url.startswith("http"):
                            pdf_url = BASE_URL + pdf_url
                        pdf_filename = link_item["href"].split("/")[-1]

                        # Download the file to the course folder and attach it
                        # to the most recent video entry, if there is one
                        if download_file(pdf_url, "./{}".format(course_name), pdf_filename) and subsection_content:
                            subsection_content[-1]["pdfName"] = pdf_filename

                if subsection_content:
                    section_content.append({
                        "subsectionTitle": subsection_title,
                        "subsectionVisible": True,
                        "subsectionContent": subsection_content
                    })

            section_container = section_container.find_next_sibling("div")

        if section_content:
            course_data["content"].append({
                "sectionTitle": section_title,
                "sectionVisible": True,
                "sectionContent": section_content
            })

    return course_data


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Crawl a webpage for YouTube videos.")
    parser.add_argument("target_url", help="The URL of the target webpage to crawl.")
    args = parser.parse_args()

    data = crawl(args.target_url)

    filename = "{}.json".format(data["courseName"].replace(" ", "-"))
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print("Data saved to {}".format(filename))