Created July 7, 2025 07:36
Web crawler for transferring content from mancal.cz to JSON
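The gist contains two scripts. The first crawls the outreach page and saves every linked item (YouTube videos and other links, with dates where available) to outreach.json. The second crawls a course page, follows each lecture link to extract the YouTube URL from the embedded iframe, downloads any linked PDFs from fileadmin/, and writes everything to a per-course JSON file. A minimal invocation sketch, assuming the scripts are saved as outreach_crawler.py and course_crawler.py (these file names and URLs are placeholders, not part of the gist):

    python outreach_crawler.py https://www.mancal.cz/outreach/
    python course_crawler.py https://www.mancal.cz/teaching/example-course/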
import requests
from bs4 import BeautifulSoup
import json, re, os, argparse
from urllib.parse import urlparse
from datetime import datetime

BASE_URL = "https://www.mancal.cz/"


def get_soup(url):
    resp = requests.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


def parse_date(date_text):
    """Parse a date from the various formats found in the HTML."""
    if not date_text:
        return ""

    # Remove extra whitespace
    date_text = date_text.strip()

    # Look for date patterns like "17.11. 2021", "03.01. 2022", etc.
    date_match = re.search(r'(\d+\.\s*\d+\.\s*\d+)', date_text)
    if date_match:
        parsed = date_match.group(1).replace(" ", "").split(".")
        if len(parsed) == 3:
            return '-'.join(parsed[::-1])  # Convert to YYYY-MM-DD format

    # If no date was found, return an empty string
    return ""


def crawl_outreach(target_url):
    soup = get_soup(target_url)
    data = []

    # Find all main sections by looking for h2 tags
    h2_tags = soup.find_all("h2")

    for h2_tag in h2_tags:
        section_title = h2_tag.get_text(strip=True)
        if not section_title:
            continue

        section_content = []

        # Find ALL divs that contain ul.list after this h2 (until the next h2)
        current_element = h2_tag.parent
        all_list_containers = []

        while current_element:
            current_element = current_element.find_next_sibling()
            if not current_element:
                break
            # Stop if we hit another h2 (next section)
            if current_element.find("h2"):
                break
            # Collect ALL ul.list elements from this div
            list_containers = current_element.find_all("ul", class_="list")
            all_list_containers.extend(list_containers)

        if not all_list_containers:
            continue

        # Process each list container
        for list_container in all_list_containers:
            # Find all li elements that contain h3 tags (subsections)
            subsection_items = list_container.find_all("li", recursive=False)

            for item in subsection_items:
                h3_tag = item.find("h3")
                if not h3_tag:
                    continue

                subsection_title = h3_tag.get_text(strip=True)
                subsection_content = []

                # Find the ul.list-unstyled that follows this li
                unstyled_list = item.find_next_sibling("ul", class_="list-unstyled")
                if not unstyled_list:
                    continue

                # Extract links from the unstyled list
                for link_item in unstyled_list.find_all("li"):
                    # Skip empty list items
                    item_text = link_item.get_text(strip=True)
                    if not item_text:
                        continue

                    # Find links
                    link = link_item.find("a", href=True)
                    if link:
                        link_url = link["href"]
                        video_title = link.get_text(strip=True)
                    else:
                        link_url = ""
                        video_title = item_text

                    # Extract the date if present
                    date_element = link_item.find("time")
                    if date_element:
                        date_text = parse_date(date_element.get_text())
                    else:
                        date_text = ""

                    # If there is no time element, look for a date in the text
                    if not date_text:
                        date_text = parse_date(item_text)

                    if "youtube.com" in link_url or "youtu.be" in link_url:
                        link_type = "video"
                    else:
                        link_type = "other"

                    subsection_content.append({
                        "title": video_title,
                        "linkUrl": link_url,
                        "date": date_text,
                        "type": link_type,
                        "visible": True,
                        "language": "Czech",
                        "label": "",
                        "tags": [],
                        "description": "",
                    })

                if subsection_content:
                    section_content.append({
                        "subsectionTitle": subsection_title,
                        "subsectionVisible": True,
                        "subsectionContent": subsection_content
                    })

        if section_content:
            data.append({
                "sectionTitle": section_title,
                "sectionVisible": True,
                "sectionContent": section_content
            })

    return data


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Crawl the outreach webpage for YouTube videos.")
    parser.add_argument("target_url", help="The URL of the target webpage to crawl.")
    args = parser.parse_args()

    data = crawl_outreach(args.target_url)

    filename = "outreach.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print("Data saved to {}".format(filename))
import requests
from bs4 import BeautifulSoup
import json, re, os, argparse
from urllib.parse import urlparse

BASE_URL = "https://www.mancal.cz/"


def get_soup(url):
    resp = requests.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


def download_file(url, folder_path, filename):
    """Download a file to the specified folder."""
    os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, filename)
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filename}")
        return True
    except Exception as e:
        print(f"Failed to download {filename}: {e}")
        return False


def crawl(target_url):
    soup = get_soup(target_url)

    parsed_url = urlparse(target_url)
    course_name = parsed_url.path.rstrip('/').split("/")[-1]

    course_data = {
        "courseName": course_name,
        "courseCode": "",
        "courseDescription": "",
        "language": "English",
        "semester": "winter",
        "courseVisible": True,
        "showDates": False,
        "content": []
    }

    # Find all sections (h2 tags)
    for section in soup.find_all("h2")[1:]:  # Skip the first h2, which is usually the page title
        section_title = section.get_text(strip=True)
        section_content = []

        # Locate the parent container of the section
        section_container = section.find_parent("div").find_next_sibling("div")

        while section_container and not section_container.find("h2"):
            # Find all subsections (h3 tags) within the section container
            for subsection in section_container.find_all("h3"):
                subsection_title = subsection.get_text(strip=True)
                subsection_content = []

                # Locate the list of video links (ol or ul) within the subsection
                link_list = subsection.find_next("ol") or subsection.find_next("ul")
                if not link_list:
                    continue

                # Extract video links from the list
                for link_item in link_list.find_all("a", href=True):
                    if "teaching/" in link_item["href"]:
                        video_title = link_item.get_text(strip=True)
                        link_url = link_item["href"]
                        if not link_url.startswith("http"):
                            link_url = BASE_URL + link_url

                        # Fetch the video page and extract the YouTube link from the iframe
                        sub_soup = get_soup(link_url)
                        yt_iframe = sub_soup.find("iframe", src=True)
                        if not yt_iframe:
                            continue

                        yt_url = None
                        iframe_src = yt_iframe["src"]

                        # Convert the embed URL to a standard YouTube watch URL
                        if "youtube.com/embed/" in iframe_src:
                            video_id = iframe_src.split("/")[-1].split("?")[0]
                            yt_url = f"https://www.youtube.com/watch?v={video_id}"
                        elif "youtu.be/" in iframe_src:
                            video_id = iframe_src.split("/")[-1]
                            yt_url = f"https://www.youtube.com/watch?v={video_id}"

                        subsection_content.append({
                            "title": video_title,
                            "videoUrl": yt_url,
                            "pdfName": "",
                            "visible": True,
                            "label": "",
                            "tags": [],
                            "description": "",
                        })
                    elif "fileadmin/" in link_item["href"]:
                        # Handle PDF links - download the file
                        pdf_url = link_item["href"]
                        if not pdf_url.startswith("http"):
                            pdf_url = BASE_URL + pdf_url
                        pdf_filename = link_item["href"].split("/")[-1]

                        # Download the file to the course folder and attach it
                        # to the most recent video entry, if there is one
                        if download_file(pdf_url, "./{}".format(course_name), pdf_filename) and subsection_content:
                            subsection_content[-1]["pdfName"] = pdf_filename

                if subsection_content:
                    section_content.append({
                        "subsectionTitle": subsection_title,
                        "subsectionVisible": True,
                        "subsectionContent": subsection_content
                    })

            section_container = section_container.find_next_sibling("div")

        if section_content:
            course_data["content"].append({
                "sectionTitle": section_title,
                "sectionVisible": True,
                "sectionContent": section_content
            })

    return course_data


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Crawl a webpage for YouTube videos.")
    parser.add_argument("target_url", help="The URL of the target webpage to crawl.")
    args = parser.parse_args()

    data = crawl(args.target_url)

    filename = "{}.json".format(data["courseName"].replace(" ", "-"))
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print("Data saved to {}".format(filename))