@MichalPt
Created July 7, 2025 07:36
Web crawler for transferring content from mancal.cz to JSON
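The gist contains two standalone scripts. The first crawls the outreach page and collects video and other links (with dates normalized to YYYY-MM-DD) into outreach.json. The second crawls a single course page, resolves the YouTube watch URLs behind the players embedded on the linked /teaching/ pages, downloads any /fileadmin/ PDFs into a course folder, and writes the course metadata to <course-name>.json.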
import requests
from bs4 import BeautifulSoup
import json, re, os, argparse
from urllib.parse import urlparse
from datetime import datetime

BASE_URL = "https://www.mancal.cz/"


def get_soup(url):
    resp = requests.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


def parse_date(date_text):
    """Parse a date from the various formats found in the HTML."""
    if not date_text:
        return ""
    # Remove extra whitespace
    date_text = date_text.strip()
    # Look for date patterns like "17.11. 2021", "03.01. 2022", etc.
    date_match = re.search(r'(\d+\.\s*\d+\.\s*\d+)', date_text)
    if date_match:
        # Parse only the matched date, not the whole surrounding text
        parsed = date_match.group(1).replace(" ", "").split(".")
        if len(parsed) == 3:
            return '-'.join(parsed[::-1])  # Convert to YYYY-MM-DD format
    # If no date is found, return an empty string
    return ""


def crawl_outreach(target_url):
    soup = get_soup(target_url)
    data = []

    # Find all main sections by looking for h2 tags
    h2_tags = soup.find_all("h2")

    for h2_tag in h2_tags:
        section_title = h2_tag.get_text(strip=True)
        if not section_title:
            continue

        section_content = []

        # Find ALL divs that contain ul.list after this h2 (until the next h2)
        current_element = h2_tag.parent
        all_list_containers = []

        while current_element:
            current_element = current_element.find_next_sibling()
            if not current_element:
                break
            # Stop if we hit another h2 (next section)
            if current_element.find("h2"):
                break
            # Collect ALL ul.list elements from this div
            list_containers = current_element.find_all("ul", class_="list")
            all_list_containers.extend(list_containers)

        if not all_list_containers:
            continue

        # Process each list container
        for list_container in all_list_containers:
            # Find all li elements that contain h3 tags (subsections)
            subsection_items = list_container.find_all("li", recursive=False)

            for item in subsection_items:
                h3_tag = item.find("h3")
                if not h3_tag:
                    continue

                subsection_title = h3_tag.get_text(strip=True)
                subsection_content = []

                # Find the ul.list-unstyled that follows this li
                unstyled_list = item.find_next_sibling("ul", class_="list-unstyled")
                if not unstyled_list:
                    continue

                # Extract links from the unstyled list
                for link_item in unstyled_list.find_all("li"):
                    # Skip empty list items
                    item_text = link_item.get_text(strip=True)
                    if not item_text:
                        continue

                    # Find links
                    link = link_item.find("a", href=True)
                    if link:
                        link_url = link["href"]
                        video_title = link.get_text(strip=True)
                    else:
                        link_url = ""
                        video_title = item_text

                    # Extract the date if present
                    date_element = link_item.find("time")
                    if date_element:
                        date_text = parse_date(date_element.get_text())
                    else:
                        date_text = ""

                    # If there is no time element, look for a date in the text
                    if not date_text:
                        date_text = parse_date(item_text)

                    if "youtube.com" in link_url or "youtu.be" in link_url:
                        link_type = "video"
                    else:
                        link_type = "other"

                    subsection_content.append({
                        "title": video_title,
                        "linkUrl": link_url,
                        "date": date_text,
                        "type": link_type,
                        "visible": True,
                        "language": "Czech",
                        "label": "",
                        "tags": [],
                        "description": "",
                    })

                if subsection_content:
                    section_content.append({
                        "subsectionTitle": subsection_title,
                        "subsectionVisible": True,
                        "subsectionContent": subsection_content
                    })

        if section_content:
            data.append({
                "sectionTitle": section_title,
                "sectionVisible": True,
                "sectionContent": section_content
            })

    return data


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Crawl the outreach webpage for YouTube videos.")
    parser.add_argument("target_url", help="The URL of the target webpage to crawl.")
    args = parser.parse_args()

    data = crawl_outreach(args.target_url)

    filename = "outreach.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print("Data saved to {}".format(filename))

import requests
from bs4 import BeautifulSoup
import json, re, os, argparse
from urllib.parse import urlparse

BASE_URL = "https://www.mancal.cz/"


def get_soup(url):
    resp = requests.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


def download_file(url, folder_path, filename):
    """Download a file to the specified folder."""
    os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, filename)
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filename}")
        return True
    except Exception as e:
        print(f"Failed to download {filename}: {e}")
        return False


def crawl(target_url):
    soup = get_soup(target_url)
    parsed_url = urlparse(target_url)
    course_name = parsed_url.path.rstrip('/').split("/")[-1]

    course_data = {
        "courseName": course_name,
        "courseCode": "",
        "courseDescription": "",
        "language": "English",
        "semester": "winter",
        "courseVisible": True,
        "showDates": False,
        "content": []
    }
    # Find all sections (h2 tags)
    for section in soup.find_all("h2")[1:]:  # Skip the first h2, which is usually the page title
        section_title = section.get_text(strip=True)
        section_content = []

        # Locate the parent container of the section
        section_container = section.find_parent("div").find_next_sibling("div")

        while section_container and not section_container.find("h2"):
            # Find all subsections (h3 tags) within the section container
            for subsection in section_container.find_all("h3"):
                subsection_title = subsection.get_text(strip=True)
                subsection_content = []

                # Locate the list of video links (ol or ul) within the subsection
                link_list = subsection.find_next("ol") or subsection.find_next("ul")
                if not link_list:
                    continue

                # Extract video links from the list
                for link_item in link_list.find_all("a", href=True):
                    if "teaching/" in link_item["href"]:
                        video_title = link_item.get_text(strip=True)
                        link_url = link_item["href"]
                        if not link_url.startswith("http"):
                            link_url = BASE_URL + link_url

                        # Fetch the video page and extract the YouTube link from the iframe
                        sub_soup = get_soup(link_url)
                        yt_iframe = sub_soup.find("iframe", src=True)
                        if not yt_iframe:
                            continue

                        yt_url = None
                        iframe_src = yt_iframe["src"]
                        # Convert the embed URL to a standard YouTube watch URL,
                        # e.g. ".../embed/VIDEO_ID?rel=0" -> ".../watch?v=VIDEO_ID"
                        if "youtube.com/embed/" in iframe_src:
                            video_id = iframe_src.split("/")[-1].split("?")[0]
                            yt_url = f"https://www.youtube.com/watch?v={video_id}"
                        elif "youtu.be/" in iframe_src:
                            video_id = iframe_src.split("/")[-1]
                            yt_url = f"https://www.youtube.com/watch?v={video_id}"

                        subsection_content.append({
                            "title": video_title,
                            "videoUrl": yt_url,
                            "pdfName": "",
                            "visible": True,
                            "label": "",
                            "tags": [],
                            "description": "",
                        })
                    elif "fileadmin/" in link_item["href"]:
                        # Handle PDF links - download the file
                        pdf_url = link_item["href"]
                        if not pdf_url.startswith("http"):
                            pdf_url = BASE_URL + pdf_url
                        pdf_filename = link_item["href"].split("/")[-1]

                        # Download the file to the course folder and record it on the
                        # most recently added video entry (guarded in case a PDF link
                        # precedes any video link in the list)
                        if download_file(pdf_url, "./{}".format(course_name), pdf_filename):
                            if subsection_content:
                                subsection_content[-1]["pdfName"] = pdf_filename
                if subsection_content:
                    section_content.append({
                        "subsectionTitle": subsection_title,
                        "subsectionVisible": True,
                        "subsectionContent": subsection_content
                    })

            section_container = section_container.find_next_sibling("div")

        if section_content:
            course_data["content"].append({
                "sectionTitle": section_title,
                "sectionVisible": True,
                "sectionContent": section_content
            })

    return course_data


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Crawl a webpage for YouTube videos.")
    parser.add_argument("target_url", help="The URL of the target webpage to crawl.")
    args = parser.parse_args()

    data = crawl(args.target_url)

    filename = "{}.json".format(data["courseName"].replace(" ", "-"))
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print("Data saved to {}".format(filename))