Skip to content

Instantly share code, notes, and snippets.

@lmBored
Created May 20, 2026 18:27
Show Gist options
  • Select an option

  • Save lmBored/44c3d20ee66df13b3d198340ee321628 to your computer and use it in GitHub Desktop.

Select an option

Save lmBored/44c3d20ee66df13b3d198340ee321628 to your computer and use it in GitHub Desktop.
downloader
#!/usr/bin/env python3
"""ANS Exam Downloader - Downloads exam questions and answers from ANS for offline viewing."""
import contextlib
import hashlib
import html
import os
import re
import sys
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from dotenv import load_dotenv
# Load environment variables (via python-dotenv) before anything else runs.
load_dotenv()
# Origin of the ANS web app; used to build the PDF.js runtime asset URLs.
BASE_URL = "https://ans.app"
# Root output folder, located next to this script; each exam is stored in a
# subfolder named after its assignment ID.
DOWNLOAD_DIR = Path(__file__).resolve().parent / "downloads"
# Subfolder (inside an exam's resources directory) that holds downloaded PDFs.
PDF_SUBDIR = "pdfs"
# PDF.js build files fetched from BASE_URL so the viewer can run offline.
PDFJS_BUILD_FILES = ("pdf.worker.mjs", "pdf.sandbox.mjs")
# Locale whose PDF.js viewer translation file is downloaded.
PDFJS_DEFAULT_LOCALE = "en-US"
# URL components extracted from exam URL (set during download)
URL_COMPONENTS: dict[str, str] = {}
def extract_assignment_id(url: str) -> str:
    """Return the numeric assignment ID embedded in an ANS URL.

    Raises:
        ValueError: If ``url`` contains no ``/assignments/<digits>/`` segment.
    """
    # Expected shape: .../assignments/1492373/grading/...
    found = re.search(r"/assignments/(\d+)/", url)
    if found is None:
        msg = f"Could not extract assignment ID from URL: {url}"
        raise ValueError(msg)
    return found.group(1)
def extract_url_components(url: str) -> dict[str, str]:
    """Split an ANS exam URL into its university, course and assignment IDs.

    Returns:
        A dict with the keys ``university_id``, ``course_id`` and
        ``assignment_id`` (all numeric strings).

    Raises:
        ValueError: If ``url`` does not start with the expected ANS layout.
    """
    # Expected shape:
    # https://ans.app/universities/15/courses/569592/assignments/1492373/...
    parts = re.match(
        r"https://ans\.app/universities/(\d+)/courses/(\d+)/assignments/(\d+)/",
        url,
    )
    if parts is None:
        msg = f"Could not extract URL components from: {url}"
        raise ValueError(msg)
    university_id, course_id, assignment_id = parts.groups()
    return {
        "university_id": university_id,
        "course_id": course_id,
        "assignment_id": assignment_id,
    }
def fetch_page(session: requests.Session, url: str) -> str:
    """Download ``url`` with ``session`` and return the response body as text.

    Raises whatever ``raise_for_status`` raises for an HTTP error status.
    """
    print(f" Fetching: {url}")
    reply = session.get(url, timeout=30)
    reply.raise_for_status()
    return reply.text
def extract_question_links(html: str, exam_url: str) -> list[dict]:
    """Collect one entry per question anchor found in the page.

    Every ``<a data-submission-id=...>`` becomes a dict with the keys
    ``submission_id``, ``question_number`` (text of the inner
    ``span.question-button``, or ``"unknown"`` when absent) and ``url``.

    Raises:
        ValueError: If ``exam_url`` is not a grading view/review URL.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Keep everything up to and including ".../grading/view/" (or "/review/"),
    # e.g. https://ans.app/universities/15/courses/569592/assignments/1492373/grading/view/
    base_match = re.match(
        r"(https://ans\.app/universities/\d+/courses/\d+/assignments/\d+/grading/(?:view|review)/)", exam_url
    )
    if base_match is None:
        msg = f"Could not extract base URL pattern from: {exam_url}"
        raise ValueError(msg)
    view_prefix = base_match.group(1)

    def question_label(anchor) -> str:
        # The visible question number lives in a nested span.
        badge = anchor.find("span", class_="question-button")
        return badge.get_text(strip=True) if badge else "unknown"

    return [
        {
            "submission_id": submission_id,
            "question_number": question_label(anchor),
            # Rebuild the view URL, always with ?nav=result.
            "url": f"{view_prefix}{submission_id}?nav=result",
        }
        for anchor in soup.find_all("a", attrs={"data-submission-id": True})
        if (submission_id := anchor.get("data-submission-id"))
    ]
def download_resource(session: requests.Session, url: str, save_path: Path) -> bool:
    """Fetch ``url`` and write the response body to ``save_path``.

    Parent directories are created as needed. This is best-effort: any failure
    is reported as a warning and signalled by returning ``False``.
    """
    try:
        reply = session.get(url, timeout=30)
        reply.raise_for_status()
        save_path.parent.mkdir(parents=True, exist_ok=True)
        save_path.write_bytes(reply.content)
    except Exception as exc:
        print(f" Warning: Failed to download {url[:80]}...: {exc}")
        return False
    return True
def get_resource_local_path(url: str, resources_dir: Path) -> Path:
    """Map a resource URL to a stable local path below ``resources_dir``.

    Args:
        url: Absolute URL of the resource.
        resources_dir: Directory that holds all downloaded resources.

    Returns:
        ``resources_dir`` joined with the (percent-decoded) URL path. Resources
        from hosts that do not contain ``ans.app`` go into a folder named after
        the host. URLs with a query string get an 8-character digest of the
        query appended to the file stem, and ``.bin`` as extension when the
        path has none.
    """
    parsed = urlparse(url)
    # Drop empty, "." and ".." segments so a crafted or percent-encoded path
    # can never point outside resources_dir.
    segments = [
        segment
        for segment in unquote(parsed.path).split("/")
        if segment not in ("", ".", "..")
    ]
    path = "/".join(segments) or "index"

    # For external domains, prefix a per-domain folder to avoid collisions.
    domain = parsed.netloc
    if domain and "ans.app" not in domain:
        path = f"{domain.replace('.', '_')}/{path}"

    # Cache-busted assets: distinguish variants by a digest of the query.
    # A real digest is used instead of hash(), whose value for str is salted
    # per process and would yield a different filename on every run.
    if parsed.query:
        query_hash = hashlib.sha256(
            parsed.query.encode("utf-8", "surrogatepass")
        ).hexdigest()[:8]
        name, ext = os.path.splitext(path)
        path = f"{name}_{query_hash}{ext or '.bin'}"

    return resources_dir / path
def is_pdf_url(url: str) -> bool:
    """Tell whether ``url`` ends in ``.pdf`` (case-insensitive).

    The extension may be followed by a query string or a fragment.
    """
    return re.search(r"\.pdf($|[?#])", url, flags=re.IGNORECASE) is not None
def extract_pdfjs_file_url(viewer_url: str) -> str | None:
"""Extract the file parameter from a pdfjs viewer URL, if present."""
parsed = urlparse(viewer_url)
path = parsed.path or ""
if "pdfjs" not in path or "viewer" not in path:
return None
file_url = None
params = parse_qs(parsed.query)
if params.get("file"):
file_url = params["file"][0]
if not file_url and parsed.fragment:
fragment = parsed.fragment
fragment_query = fragment.split("?", 1)[1] if "?" in fragment else fragment
frag_params = parse_qs(fragment_query)
if frag_params.get("file"):
file_url = frag_params["file"][0]
if not file_url:
return None
return unquote(html.unescape(file_url))
def get_pdf_local_path(pdf_url: str, resources_dir: Path) -> Path:
    """Map a PDF URL to a stable local path inside the PDF subfolder.

    Args:
        pdf_url: Absolute URL of the PDF.
        resources_dir: Directory that holds all downloaded resources.

    Returns:
        ``resources_dir / PDF_SUBDIR / <name>.pdf``. The name is the URL's
        basename (``document.pdf`` when empty); when the URL has a query or
        fragment, an 8-character digest of both is appended to the stem to
        reduce collisions.
    """
    parsed = urlparse(pdf_url)
    filename = os.path.basename(unquote(parsed.path)) or "document.pdf"
    if not filename.lower().endswith(".pdf"):
        filename = f"{filename}.pdf"

    if parsed.query or parsed.fragment:
        name, ext = os.path.splitext(filename)
        # hash() on str is salted per interpreter process, so it would give a
        # different filename on every run; use a real digest instead.
        discriminator = f"{parsed.query}|{parsed.fragment}"
        hash_suffix = hashlib.sha256(
            discriminator.encode("utf-8", "surrogatepass")
        ).hexdigest()[:8]
        filename = f"{name}_{hash_suffix}{ext}"

    return resources_dir / PDF_SUBDIR / filename
def ensure_pdfjs_runtime_assets(session: requests.Session, resources_dir: Path) -> None:
    """Download any PDF.js build and locale file that is not yet on disk.

    Build files are stored under ``<resources_dir parent>/pdfjs/build`` while
    the locale files are stored under ``<resources_dir>/pdfjs/web/locale``.
    """
    build_dir = resources_dir.parent / "pdfjs" / "build"
    locale_root = resources_dir / "pdfjs" / "web" / "locale"

    # (remote URL, local target) pairs, checked in this order.
    wanted = [
        (f"{BASE_URL}/pdfjs/build/{filename}", build_dir / filename)
        for filename in PDFJS_BUILD_FILES
    ]
    wanted.append(
        (
            f"{BASE_URL}/pdfjs/web/locale/{PDFJS_DEFAULT_LOCALE}/viewer.ftl",
            locale_root / PDFJS_DEFAULT_LOCALE / "viewer.ftl",
        )
    )
    wanted.append((f"{BASE_URL}/pdfjs/web/locale/locale.json", locale_root / "locale.json"))

    for remote_url, target in wanted:
        if not target.exists():
            download_resource(session, remote_url, target)
def patch_pdfjs_viewer_asset_paths(resources_dir: Path) -> None:
    """Point bundled PDF.js viewer scripts at relative worker/sandbox files.

    Every ``pdf-viewer-*.js`` in ``<resources_dir>/assets`` has its quoted
    ``.../pdfjs/build/pdf.worker.mjs`` and ``.../pdfjs/build/pdf.sandbox.mjs``
    references replaced by paths two levels up from the assets folder. Files
    that cannot be read or written are skipped silently.
    """
    assets_dir = resources_dir / "assets"
    if not assets_dir.exists():
        return

    # (pattern, replacement) pairs; "../../" climbs from resources/assets/* up
    # to the assignment root. The quote character is captured and reused.
    rewrites = (
        (
            re.compile(r"([\"'])[^\"']*pdfjs/build/pdf\.worker\.mjs\1"),
            r"\1../../pdfjs/build/pdf.worker.mjs\1",
        ),
        (
            re.compile(r"([\"'])[^\"']*pdfjs/build/pdf\.sandbox\.mjs\1"),
            r"\1../../pdfjs/build/pdf.sandbox.mjs\1",
        ),
    )

    for js_path in assets_dir.glob("pdf-viewer-*.js"):
        try:
            original = js_path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue

        patched = original
        for pattern, replacement in rewrites:
            patched = pattern.sub(replacement, patched)

        if patched == original:
            continue
        with contextlib.suppress(OSError):
            js_path.write_text(patched, encoding="utf-8")
def find_pdf_url_for_viewer(tag: Tag, soup: BeautifulSoup) -> str | None:
    """Locate a PDF URL stored in a ``data-url`` attribute near ``tag``.

    Each ancestor of ``tag`` is searched from the innermost outwards, then the
    whole document. The first HTML-unescaped ``data-url`` value that looks like
    a PDF URL wins; ``None`` is returned when there is none.
    """

    def first_pdf_data_url(scope: Tag | BeautifulSoup) -> str | None:
        for element in scope.find_all(attrs={"data-url": True}):
            value = element.get("data-url")
            if value:
                decoded = html.unescape(str(value))
                if is_pdf_url(decoded):
                    return decoded
        return None

    for ancestor in tag.parents:
        if not hasattr(ancestor, "find_all"):
            continue
        hit = first_pdf_data_url(ancestor)
        if hit:
            return hit

    return first_pdf_data_url(soup)
def disable_ans_pdf_panel_js_for_offline_view(tag: Tag) -> None:
    """Remove ``data-js-pdf-panel`` from the nearest ancestor of ``tag`` carrying it.

    Only the first such ancestor is touched; nothing happens if none has it.
    """
    hook = "data-js-pdf-panel"
    for ancestor in getattr(tag, "parents", []) or []:
        attrs = getattr(ancestor, "attrs", None)
        if attrs and hook in attrs:
            del ancestor.attrs[hook]
            break
def strip_ans_pdf_panel_hooks(soup: BeautifulSoup) -> int:
    """Delete every ``data-js-pdf-panel`` attribute in the document.

    The ANS frontend binds behaviour to ``[data-js-pdf-panel]`` that tries to
    load presigned online URLs and can replace the local PDF.js iframe, so the
    hook is removed to keep offline rendering stable.

    Returns:
        The number of elements the attribute was removed from.
    """
    hook = "data-js-pdf-panel"
    hooked = [
        element
        for element in soup.select("[data-js-pdf-panel]")
        if hook in getattr(element, "attrs", {})
    ]
    for element in hooked:
        del element.attrs[hook]
    return len(hooked)
def process_pdf_embeds(
    session: requests.Session,
    soup: BeautifulSoup,
    resources_dir: Path,
    page_url: str,
    save_path: Path,
) -> int:
    """Download embedded PDFs and point their tags at the local copies.

    Handles both PDF.js viewer iframes and direct ``iframe``/``embed``/``object``
    PDF embeds. For viewer embeds the viewer page itself is saved locally too
    and the tag is rewritten to ``<viewer>?file=<pdf relative to viewer>``.

    Returns:
        The number of attributes that were rewritten.
    """
    candidates = [
        ("iframe", "src"),
        ("iframe", "data-src"),
        ("embed", "src"),
        ("embed", "data-src"),
        ("object", "data"),
        ("object", "data-src"),
    ]

    def as_text(value: object) -> str:
        # BeautifulSoup may hand back a list for multi-valued attributes.
        if isinstance(value, str):
            return value
        if isinstance(value, list):
            return " ".join(value)
        return str(value)

    rewritten = 0
    for tag_name, attr in candidates:
        for tag in soup.find_all(tag_name):
            raw_url = tag.get(attr)
            if not raw_url:
                continue
            url = as_text(raw_url)
            if url.startswith(("data:", "#", "javascript:")):
                continue

            absolute_url = urljoin(page_url, html.unescape(url))
            url_path = urlparse(absolute_url).path or ""
            looks_like_viewer = "pdfjs" in url_path and "viewer" in url_path

            # Work out which PDF this tag shows and whether it goes through
            # the PDF.js viewer.
            pdf_url = extract_pdfjs_file_url(absolute_url)
            if pdf_url:
                is_viewer = True
            elif looks_like_viewer:
                pdf_url = find_pdf_url_for_viewer(tag, soup)
                if not pdf_url:
                    continue
                is_viewer = True
            elif is_pdf_url(absolute_url):
                pdf_url = absolute_url
                is_viewer = False
            else:
                continue

            absolute_pdf_url = urljoin(absolute_url, pdf_url) if is_viewer else absolute_url
            local_pdf_path = get_pdf_local_path(absolute_pdf_url, resources_dir)
            if not download_resource(session, absolute_pdf_url, local_pdf_path):
                continue

            if not is_viewer:
                # Direct embed: simply point at the downloaded file.
                tag[attr] = os.path.relpath(local_pdf_path, save_path.parent)
                rewritten += 1
                continue

            ensure_pdfjs_runtime_assets(session, resources_dir)
            viewer_local_path = get_resource_local_path(absolute_url, resources_dir)
            if viewer_local_path.suffix == ".bin":
                viewer_local_path = viewer_local_path.with_suffix(".html")
            if not viewer_local_path.exists():
                process_and_save_html(
                    session,
                    fetch_page(session, absolute_url),
                    viewer_local_path,
                    resources_dir,
                    absolute_url,
                )

            viewer_relative = os.path.relpath(viewer_local_path, save_path.parent)
            pdf_relative_to_viewer = os.path.relpath(local_pdf_path, viewer_local_path.parent)
            tag[attr] = f"{viewer_relative}?file={quote(pdf_relative_to_viewer)}"
            # Prevent ANS' frontend JS from overriding the local iframe with
            # presigned URLs (which fails offline and shows the "Retry" button).
            disable_ans_pdf_panel_js_for_offline_view(tag)
            rewritten += 1

    return rewritten
def process_css_file(
    session: requests.Session,
    css_path: Path,
    resources_dir: Path,
    css_url: str,
) -> None:
    """Download the resources a CSS file references and rewrite its url()s.

    Args:
        session: The requests session for downloading resources.
        css_path: Local path to the downloaded CSS file (rewritten in place).
        resources_dir: Directory where resources should be saved.
        css_url: The original URL of the CSS file (for resolving relative paths).

    Failures to read or write the CSS file are reported as warnings; the
    function never raises for them.
    """
    try:
        css_content = css_path.read_text(encoding="utf-8")
    except (OSError, UnicodeError) as e:
        print(f" Warning: Could not read CSS file {css_path}: {e}")
        return

    # Matches: url(/path), url("/path"), url('/path'), url(../path), etc.
    url_pattern = re.compile(r'url\(["\']?([^)"\']+)["\']?\)')
    urls_found = url_pattern.findall(css_content)
    if not urls_found:
        return
    print(f" Processing CSS: found {len(urls_found)} url() references")

    # Maps the raw captured reference to its new local relative path.
    replacements: dict[str, str] = {}
    for url in urls_found:
        if url in replacements:
            continue
        # Skip data URLs and already processed relative paths.
        if url.startswith(("data:", "./", "../")):
            continue
        url_clean = html.unescape(url.strip())
        # Fragment-only references (url(#clip)) and CSS variables are not
        # downloadable files; resolving them would point at the CSS itself.
        if not url_clean or url_clean.startswith(("data:", "#")) or "var(" in url_clean:
            continue

        # Root-relative paths like /assets/... are served from the asset host;
        # everything else is resolved against the CSS file's own URL.
        if url_clean.startswith("/"):
            absolute_url = f"https://assets.ans.app{url_clean}"
        else:
            absolute_url = urljoin(css_url, url_clean)

        local_path = get_resource_local_path(absolute_url, resources_dir)
        if not local_path.exists() and download_resource(session, absolute_url, local_path):
            print(f" Downloaded: {url_clean[:60]}...")

        if local_path.exists():
            # CSS treats "\" as an escape character, so always emit "/".
            relative_path = os.path.relpath(local_path, css_path.parent)
            replacements[url] = relative_path.replace(os.sep, "/")

    def rewrite(match: re.Match[str]) -> str:
        new_path = replacements.get(match.group(1))
        return match.group(0) if new_path is None else f'url("{new_path}")'

    # Single pass over the original text, so one replacement can never be
    # re-matched by another.
    modified_css = url_pattern.sub(rewrite, css_content)

    try:
        css_path.write_text(modified_css, encoding="utf-8")
        print(f" Updated CSS with {len(replacements)} local paths")
    except (OSError, UnicodeError) as e:
        print(f" Warning: Could not save modified CSS {css_path}: {e}")
def process_and_save_html(
    session: requests.Session,
    html_content: str,
    save_path: Path,
    resources_dir: Path,
    page_url: str,
) -> None:
    """Make a page viewable offline and write it to ``save_path``.

    Downloads embedded PDFs, stylesheets, scripts, media and inline-style
    images into ``resources_dir``, rewrites the references to relative local
    paths, and saves the resulting HTML as UTF-8.

    Args:
        session: Session used for all downloads.
        html_content: Raw HTML of the page.
        save_path: Destination file for the processed HTML.
        resources_dir: Directory where resources should be saved.
        page_url: Original URL of the page (for resolving relative URLs).
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Disable Turbo/remote interception for question navigation links so local
    # files load normally.
    for link in soup.find_all("a", attrs={"data-submission-id": True}):
        link.attrs.pop("data-remote", None)
        link["data-turbo"] = "false"
        href = link.get("href")
        if href and "question_" in href:
            link["href"] = href.split("?", 1)[0]

    # Download and rewrite embedded PDFs (pdfjs viewer or direct embeds).
    process_pdf_embeds(session, soup, resources_dir, page_url, save_path)
    # Even if a particular viewer iframe could not be rewritten (e.g. when
    # reprocessing already-downloaded HTML), strip ANS' hook so the page does
    # not show "Retry".
    strip_ans_pdf_panel_hooks(soup)

    resource_attrs = [
        ("img", "src"),
        ("img", "data-src"),  # Lazy-loaded images
        ("script", "src"),
        ("source", "src"),
        ("source", "data-src"),
        ("video", "src"),
        ("video", "data-src"),
        ("audio", "src"),
    ]
    # Domains to skip (external CDNs that provide standard libraries).
    skip_domains = {"gstatic.com", "cloudflare.com", "google.com", "googleapis.com"}
    # References that are not downloadable resources.
    non_resource_prefixes = ("data:", "#", "javascript:")

    def localize(raw_url: str) -> tuple[str, Path, str] | None:
        """Download ``raw_url``; return (relative path, local path, absolute URL)."""
        # Unescape HTML entities in the URL (e.g. &amp; -> &) before resolving.
        absolute_url = urljoin(page_url, html.unescape(raw_url))
        if any(skip in urlparse(absolute_url).netloc for skip in skip_domains):
            return None
        local_path = get_resource_local_path(absolute_url, resources_dir)
        if not download_resource(session, absolute_url, local_path):
            return None
        return os.path.relpath(local_path, save_path.parent), local_path, absolute_url

    # <link> tags are handled separately so stylesheets can be post-processed.
    for tag in soup.find_all("link"):
        url = tag.get("href")
        if not url or url.startswith(non_resource_prefixes):
            continue
        localized = localize(url)
        if localized is None:
            continue
        relative_path, local_path, absolute_url = localized
        tag["href"] = relative_path
        # Stylesheets may pull in fonts and images of their own.
        rel = tag.get("rel", [])
        is_stylesheet = "stylesheet" in rel if isinstance(rel, list) else rel == "stylesheet"
        if is_stylesheet and local_path.suffix == ".css":
            process_css_file(session, local_path, resources_dir, absolute_url)

    for tag_name, attr in resource_attrs:
        for tag in soup.find_all(tag_name):
            url = tag.get(attr)
            if not url or url.startswith(non_resource_prefixes):
                continue
            localized = localize(url)
            if localized is not None:
                tag[attr] = localized[0]

    # Inline styles with url(): accumulate the replacements so that a style
    # with several url() references keeps all of them, not only the last one.
    for tag in soup.find_all(style=True):
        style = tag["style"]
        updated_style = style
        for url in re.findall(r'url\(["\']?([^)"\']+)["\']?\)', style):
            if url.startswith(("data:", "#")):
                continue
            localized = localize(url)
            if localized is not None:
                updated_style = updated_style.replace(url, localized[0])
        if updated_style != style:
            tag["style"] = updated_style

    # Patch the bundled PDF.js viewer runtime to use relative worker/sandbox
    # paths, so it works even when the assignment is served from a subdirectory.
    patch_pdfjs_viewer_asset_paths(resources_dir)

    save_path.parent.mkdir(parents=True, exist_ok=True)
    save_path.write_text(str(soup), encoding="utf-8")
# =============================================================================
# Navigation Fixing Functions
# =============================================================================
def build_submission_mapping(overview_path: Path) -> dict[str, str]:
    """Build the mapping from submission ID to question number.

    Parses the saved overview HTML and reads every
    ``<a data-submission-id=...>`` whose inner ``span.question-button`` has
    non-empty text. Anchors lacking an ID, the span, or the text are ignored.

    Args:
        overview_path: Path of the saved overview page.

    Returns:
        Dict mapping each submission ID to its question number text.
    """
    # The page was saved as UTF-8, so it must not be read with the platform's
    # locale encoding.
    content = overview_path.read_text(encoding="utf-8")
    soup = BeautifulSoup(content, "html.parser")

    mapping: dict[str, str] = {}
    for link in soup.find_all("a", attrs={"data-submission-id": True}):
        submission_id = link.get("data-submission-id")
        if not submission_id:
            continue
        question_span = link.find("span", class_="question-button")
        if not question_span:
            continue
        question_num = question_span.get_text(strip=True)
        if question_num:
            mapping[submission_id] = question_num
    return mapping
def fix_go_to_links(html_file: Path, url_components: dict[str, str], mapping: dict[str, str]) -> int:
    """Rewrite ANS navigation URLs in a saved page to local question files.

    Handles ``/grading/go_to/<id>``, ``/grading/view/<id>`` and
    ``/grading/review/<id>`` URLs, both root-relative and prefixed with
    ``https://ans.app``. Anything after the submission ID (such as a query
    string) is left in place.

    Args:
        html_file: Saved HTML file, read and written as UTF-8.
        url_components: Dict with ``university_id``, ``course_id`` and
            ``assignment_id``.
        mapping: Submission ID -> question number.

    Returns:
        The number of URL occurrences that were replaced.
    """
    # Saved pages are UTF-8; never rely on the platform's locale encoding.
    original = html_file.read_text(encoding="utf-8")
    content = original

    grading_prefix = (
        f"/universities/{url_components['university_id']}"
        f"/courses/{url_components['course_id']}"
        f"/assignments/{url_components['assignment_id']}/grading/"
    )

    fixes = 0
    for submission_id, question_num in mapping.items():
        new_href = f"question_{question_num}.html"
        pattern = re.compile(
            r"(?:https://ans\.app)?"
            + re.escape(grading_prefix)
            + r"(?:go_to|view|review)/"
            + re.escape(submission_id)
            # An ID must not match as the prefix of a longer ID
            # (e.g. 12 inside 123).
            + r"(?!\d)"
        )
        # A callable replacement keeps new_href literal (no backslash escapes).
        content, replaced = pattern.subn(lambda _match, href=new_href: href, content)
        fixes += replaced

    if content != original:
        html_file.write_text(content, encoding="utf-8")
    return fixes
def sanitize_offline_navigation(html_file: Path) -> int:
    """Strip remote/Turbo attributes from local question links.

    Only anchors whose ``href`` starts with ``question_`` are touched: a query
    string on the href is dropped, the remote-navigation ``data-*`` attributes
    are deleted and ``data-turbo`` (when present with another value) is set to
    ``"false"``. The file is rewritten only when something changed.

    Args:
        html_file: Saved HTML file, read and written as UTF-8.

    Returns:
        The number of attributes removed (0 if the file could not be parsed).
    """
    try:
        # Saved pages are UTF-8; do not depend on the locale encoding.
        soup = BeautifulSoup(html_file.read_text(encoding="utf-8"), "html.parser")
    except Exception:
        # Best effort: an unreadable page is simply left as it is.
        return 0

    remote_attrs = (
        "data-remote",
        "data-js-pagination-item",
        "data-js-indicator",
        "data-disable-with",
        "data-submission-id",
    )
    removed = 0
    changed = False
    for link in soup.find_all("a", href=True):
        href = link.get("href", "")
        if not href.startswith("question_"):
            continue
        if "?" in href:
            link["href"] = href.split("?", 1)[0]
            changed = True
        for attr in remote_attrs:
            if attr in link.attrs:
                del link[attr]
                removed += 1
                changed = True
        # Explicitly disable turbo if present
        if link.get("data-turbo") not in (None, "false"):
            link["data-turbo"] = "false"
            changed = True

    if changed:
        html_file.write_text(str(soup), encoding="utf-8")
    return removed
def fix_navigation(output_dir: Path, url_components: dict[str, str]) -> None:
    """Repair navigation in every ``*.html`` file directly in ``output_dir``.

    Uses the submission mapping built from ``overview.html`` to turn ANS
    navigation URLs into local question file references, then strips the
    remote-navigation attributes. Does nothing when the overview is missing.
    """
    print("\n[4/5] Fixing navigation links...")
    overview = output_dir / "overview.html"
    if not overview.exists():
        print(" Warning: overview.html not found, skipping navigation fix")
        return

    mapping = build_submission_mapping(overview)
    if mapping:
        print(f" Built mapping for {len(mapping)} questions")
    else:
        print(" Warning: Could not build submission mapping")

    total_fixes = 0
    # Snapshot the file list first: the files are rewritten while looping.
    for html_file in list(output_dir.glob("*.html")):
        if mapping:
            fixes = fix_go_to_links(html_file, url_components, mapping)
            if fixes > 0:
                print(f" Fixed {fixes} links in {html_file.name}")
                total_fixes += fixes
        removed = sanitize_offline_navigation(html_file)
        if removed > 0:
            print(f" Removed {removed} remote nav attributes in {html_file.name}")
    print(f" Total: Fixed {total_fixes} navigation links")
# =============================================================================
# Missing Assets Detection
# =============================================================================
# Matches a KaTeX font name, with or without a font-file extension.
KATEX_FONT_PATTERN = re.compile(r"KaTeX_[A-Za-z0-9_-]+(?:\.(?:woff2|woff|ttf|otf|eot))?")


def detect_katex_fonts_in_asset(asset_path: Path) -> set[str]:
    """Collect the distinct KaTeX font names mentioned in a text asset.

    Returns an empty set when the file cannot be read as UTF-8 text.
    """
    try:
        text = asset_path.read_text(encoding="utf-8")
    except Exception:
        return set()
    return {found.group(0) for found in KATEX_FONT_PATTERN.finditer(text)}
def auto_download_katex_fonts(resources_dir: Path) -> tuple[int, list[tuple[str, Path]]]:
    """Fetch KaTeX fonts named in downloaded CSS/JS into ``assets/fonts``.

    All ``*.css`` and ``*.js`` files below ``resources_dir`` are scanned. Names
    without a ``-`` are ignored, names without an extension get ``.woff2``, and
    fonts already on disk are skipped.

    Returns:
        ``(downloaded_count, missing)`` where ``missing`` lists the
        ``(filename, expected_path)`` pairs that could not be downloaded.
    """
    asset_files = [*resources_dir.rglob("*.css"), *resources_dir.rglob("*.js")]
    if not asset_files:
        return 0, []

    font_names: set[str] = set()
    for asset_file in asset_files:
        font_names |= detect_katex_fonts_in_asset(asset_file)
    if not font_names:
        return 0, []

    fonts_dir = resources_dir / "assets" / "fonts"
    downloaded = 0
    missing: list[tuple[str, Path]] = []
    for name in sorted(font_names):
        if "-" not in name:
            continue
        filename = name if "." in name else f"{name}.woff2"
        local_path = fonts_dir / filename
        if local_path.exists():
            continue
        if try_auto_download_font(filename, local_path):
            downloaded += 1
        else:
            missing.append((filename, local_path))
    return downloaded, missing
def detect_missing_fonts_in_css(css_path: Path, resources_dir: Path) -> list[tuple[str, Path]]:
    """List the local url() references in a CSS file whose target is absent.

    Data URLs, fragments, ``javascript:`` URLs, CSS variables and anything
    with a scheme or host are ignored. Root-relative paths are looked up below
    ``resources_dir``; other paths are resolved against the CSS file's folder.

    Returns:
        ``(original_url, expected_local_path)`` tuples, or an empty list when
        the CSS file cannot be read.
    """
    try:
        css_content = css_path.read_text(encoding="utf-8")
    except Exception:
        return []

    missing: list[tuple[str, Path]] = []
    for raw in re.findall(r'url\(["\']?([^)"\']+)["\']?\)', css_content):
        reference = html.unescape(raw.strip())
        if (
            not reference
            or reference.startswith(("data:", "#", "javascript:"))
            or "var(" in reference
        ):
            continue

        parsed = urlparse(reference)
        # External and protocol-relative URLs are not expected on disk.
        if parsed.scheme in ("http", "https") or parsed.netloc:
            continue
        url_path = parsed.path
        if not url_path:
            continue

        if url_path.startswith("/"):
            candidate = resources_dir / url_path.lstrip("/")
        else:
            candidate = (css_path.parent / url_path).resolve()
        if not candidate.exists():
            missing.append((reference, candidate))
    return missing
# Known CDN patterns and their base URLs for auto-downloading.
# Key: substring looked for in the font filename; value: URL prefix to which
# the filename is appended.
KNOWN_FONT_CDNS = {
    "KaTeX_": "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/fonts/",
    "material-icons": "https://fonts.gstatic.com/s/materialiconsoutlined/v109/",
}


def try_auto_download_font(
    url: str,
    local_path: Path,
    session: requests.Session | None = None,
) -> bool:
    """Attempt to download a font from one of the known CDNs.

    Args:
        url: Original reference of the font (not used for the lookup; the CDN
            is chosen from ``local_path``'s filename).
        local_path: Where the font should be stored.
        session: Session to download with. When omitted, a temporary session
            is created and closed again before returning.

    Returns:
        True if the font was downloaded and saved, False when no CDN is known
        for the filename or the download failed.
    """
    filename = local_path.name

    # First CDN whose marker occurs in the filename.
    base_url = next(
        (base for marker, base in KNOWN_FONT_CDNS.items() if marker in filename),
        None,
    )
    if base_url is None:
        return False
    cdn_url = f"{base_url}{filename}"

    owns_session = session is None
    if owns_session:
        session = requests.Session()
    try:
        print(f" Auto-downloading: {filename}")
        response = session.get(cdn_url, timeout=30)
        response.raise_for_status()
        local_path.parent.mkdir(parents=True, exist_ok=True)
        local_path.write_bytes(response.content)
        return True
    except Exception as e:
        # Best effort: report and let the caller list the font as missing.
        print(f" Warning: Could not auto-download {filename}: {e}")
        return False
    finally:
        # Only close a session created here; a caller's session stays open.
        if owns_session:
            session.close()
def check_and_report_missing_assets(output_dir: Path) -> list[tuple[str, Path]]:
    """Find assets the downloaded CSS still lacks, try to fetch them, report.

    Scans every CSS file under ``<output_dir>/resources``, auto-downloads KaTeX
    fonts and other fonts from known CDNs, and prints a summary of whatever is
    still missing.

    Returns:
        The remaining missing assets as ``(url, expected_path)`` tuples.
    """
    resources_dir = output_dir / "resources"

    # First reference wins for each expected path (dicts keep insertion order).
    first_url_by_path: dict[Path, str] = {}
    for css_file in list(resources_dir.rglob("*.css")):
        for url, path in detect_missing_fonts_in_css(css_file, resources_dir):
            first_url_by_path.setdefault(path, url)
    unique_missing = [(url, path) for path, url in first_url_by_path.items()]

    katex_downloaded, katex_missing = auto_download_katex_fonts(resources_dir)
    if not unique_missing and katex_downloaded == 0 and not katex_missing:
        return []

    print("\n[5/5] Checking for missing assets...")
    still_missing: list[tuple[str, Path]] = []
    auto_downloaded = 0
    for url, path in unique_missing:
        if try_auto_download_font(url, path):
            auto_downloaded += 1
        else:
            still_missing.append((url, path))

    if auto_downloaded > 0:
        print(f" Auto-downloaded {auto_downloaded} fonts from CDN")
    if katex_downloaded > 0:
        print(f" Auto-downloaded {katex_downloaded} KaTeX fonts from CDN")

    if katex_missing:
        # Add KaTeX fonts that are not already listed under the same path.
        known_paths = {path for _, path in still_missing}
        for name, path in katex_missing:
            if path in known_paths:
                continue
            still_missing.append((name, path))
            known_paths.add(path)

    if still_missing:
        banner = "=" * 50
        print("\n" + banner)
        print("MISSING ASSETS (could not auto-download)")
        print(banner)
        print(f"Found {len(still_missing)} missing font/asset files:")
        for url, path in still_missing[:10]:  # Show first 10
            print(f" - {url}")
            print(f" Expected at: {path}")
        if len(still_missing) > 10:
            print(f" ... and {len(still_missing) - 10} more")
        print("\nTo download manually, place the files in:")
        print(f" {resources_dir.absolute()}")
        print(banner)

    return still_missing
def download_exam(exam_url: str, session: requests.Session) -> None:
    """Download a complete exam from ANS for offline viewing.

    Saves every question page as ``question_<n>.html`` plus ``overview.html``
    in ``DOWNLOAD_DIR/<assignment_id>``, then fixes navigation and reports
    missing assets. Exits the process with status 1 when the page lists no
    questions.

    Args:
        exam_url: The URL of any question page in the exam (contains all question links).
        session: Authenticated requests.Session object.
    """
    global URL_COMPONENTS
    print(f"Starting download of exam: {exam_url}")

    assignment_id = extract_assignment_id(exam_url)
    URL_COMPONENTS = extract_url_components(exam_url)
    print(f"Assignment ID: {assignment_id}")

    output_dir = DOWNLOAD_DIR / assignment_id
    resources_dir = output_dir / "resources"
    for folder in (output_dir, resources_dir):
        folder.mkdir(parents=True, exist_ok=True)

    print("\n[1/5] Fetching initial page to get question list...")
    initial_html = fetch_page(session, exam_url)

    print("\n[2/5] Extracting question links...")
    questions = extract_question_links(initial_html, exam_url)
    if not questions:
        print("Error: No questions found on the page. Ensure you loaded the final exam page.")
        sys.exit(1)
    total = len(questions)
    print(f"Found {total} questions")

    print("\n[3/5] Downloading question pages and resources...")
    for position, question in enumerate(questions, 1):
        q_num = question["question_number"]
        q_url = question["url"]
        print(f"\nQuestion {position}/{total} (Q{q_num}):")
        q_html = fetch_page(session, q_url)
        save_path = output_dir / f"question_{q_num}.html"
        print(f" Saving to: {save_path}")
        process_and_save_html(session, q_html, save_path, resources_dir, q_url)

    # The initial page is kept as well; it may carry overview information.
    print("\nSaving initial overview page...")
    process_and_save_html(session, initial_html, output_dir / "overview.html", resources_dir, exam_url)

    fix_navigation(output_dir, URL_COMPONENTS)
    missing_assets = check_and_report_missing_assets(output_dir)

    print(f"\n{'=' * 50}")
    print("Download complete!")
    print(f"Files saved to: {output_dir.absolute()}")
    print(f"Total questions: {total}")
    if missing_assets:
        print(f"Warning: {len(missing_assets)} assets could not be downloaded")
def main() -> None:
    """Interactive entry point.

    Opens a visible Chromium window through Playwright, waits for the user to
    navigate to an exam page, copies the browser's cookies and user agent into
    a ``requests`` session, and downloads the exam. Repeats until the user
    enters ``q`` or presses Ctrl+C.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print("Error: The 'playwright' package is required.")
        print("Please install it by running:")
        print(" pip install playwright")
        print(" playwright install chromium")
        sys.exit(1)

    exam_url_pattern = r"https://ans\.app/universities/\d+/courses/\d+/assignments/\d+"

    print("\nLaunching browser...")
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        context = browser.new_context(no_viewport=True)
        page = context.new_page()
        page.goto("https://ans.app")

        while True:
            print("\n" + "=" * 60)
            print("ANS EXAM DOWNLOADER")
            print("1. Navigate to the exam page in the opened browser.")
            print("2. Once on the page, press ENTER to start downloading.")
            print("3. Enter 'q' or press Ctrl+C to quit.")
            print("=" * 60 + "\n")
            try:
                cmd = input("Press ENTER to start downloading, or 'q' to quit: ")
            except KeyboardInterrupt:
                break
            if cmd.lower() == "q":
                break

            # Let the Playwright event loop process pending events (like URL
            # changes) that happened while the script was blocked on input().
            page.wait_for_timeout(100)

            # Pick the first open tab whose URL has the exam URL format.
            current_url = None
            active_page = None
            for tab in context.pages:
                print(f" Checking tab: {tab.url}")
                if re.match(exam_url_pattern, tab.url):
                    current_url, active_page = tab.url, tab
                    break

            if not current_url:
                open_urls = [tab.url for tab in context.pages]
                print(f"\nWarning: Could not find an exam page among {len(context.pages)} open tab(s).")
                print("Make sure you have an exam tab open.")
                print(f"Current open tabs: {open_urls}")
                continue

            print("\nSetting up authenticated session...")
            session = requests.Session()
            cookies = context.cookies()
            user_agent = active_page.evaluate("navigator.userAgent")
            for cookie in cookies:
                session.cookies.set(
                    cookie["name"],
                    cookie["value"],
                    domain=cookie.get("domain", ""),
                    path=cookie.get("path", "/"),
                )
            cookies_set = len(cookies)
            session.headers.update(
                {
                    "User-Agent": user_agent,
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Connection": "keep-alive",
                    "Upgrade-Insecure-Requests": "1",
                }
            )
            print(f"Loaded {cookies_set} cookies from the live browser session.")

            try:
                download_exam(current_url, session)
            except Exception as e:
                print(f"Error occurred during download: {e}")

        browser.close()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment