@pj4533
Created June 14, 2025 23:40
Apple Docs Scraper
#!/usr/bin/env python3
"""
Apple Documentation Scraper
Converts Apple documentation sections to markdown files recursively
"""
import requests
import time
import re
from urllib.parse import urlparse
from typing import Set, Dict, List
import argparse


class AppleDocsScraper:
    def __init__(self, base_url: str = "https://developer.apple.com", rate_limit: float = 0.1):
        self.base_url = base_url
        self.api_base = f"{base_url}/tutorials/data"
        self.rate_limit = rate_limit
        self.session = requests.Session()
        self.processed_urls: Set[str] = set()
        self.content_sections: List[Dict] = []
        self.all_references: Dict[str, Dict] = {}  # Store all references across all pages
        self.title_to_anchor: Dict[str, str] = {}  # Map titles to markdown anchors

    def url_to_api_url(self, doc_url: str) -> str:
        """Convert documentation URL to API URL"""
        # Extract path from documentation URL
        parsed = urlparse(doc_url)
        path = parsed.path
        # Convert to API format
        api_url = f"{self.api_base}{path}.json"
        return api_url
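    # Illustrative mapping (the example page URL is assumed, not taken from the gist):
    #   https://developer.apple.com/documentation/swiftui/view
    #   -> https://developer.apple.com/tutorials/data/documentation/swiftui/view.json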

    def fetch_json(self, api_url: str) -> Dict:
        """Fetch JSON from Apple's API with rate limiting"""
        time.sleep(self.rate_limit)
        try:
            response = self.session.get(api_url, timeout=30)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            print(f"Failed to fetch {api_url}: {e}")
            return {}

    def extract_links(self, data: Dict) -> List[str]:
        """Extract all documentation links from JSON data"""
        links = []
        # Check topic sections for sub-pages
        for section in data.get('topicSections', []):
            for identifier in section.get('identifiers', []):
                # Convert doc:// URLs to web URLs
                if identifier.startswith('doc://'):
                    # Extract the path part
                    parts = identifier.split('/')
                    if len(parts) >= 4:  # doc://domain/documentation/framework/...
                        path_parts = parts[3:]  # Skip doc://domain
                        web_path = '/' + '/'.join(path_parts)
                        web_url = f"{self.base_url}{web_path}"
                        links.append(web_url)
        # Check references section
        references = data.get('references', {})
        for ref_id, ref_data in references.items():
            if isinstance(ref_data, dict) and 'url' in ref_data:
                url = ref_data['url']
                if url.startswith('/documentation/'):
                    web_url = f"{self.base_url}{url}"
                    links.append(web_url)
        # Check see also sections
        for section in data.get('seeAlsoSections', []):
            for identifier in section.get('identifiers', []):
                if identifier.startswith('doc://'):
                    parts = identifier.split('/')
                    if len(parts) >= 4:
                        path_parts = parts[3:]
                        web_path = '/' + '/'.join(path_parts)
                        web_url = f"{self.base_url}{web_path}"
                        links.append(web_url)
        return list(set(links))  # Remove duplicates
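    # Illustrative conversion (the doc:// domain shown is an assumption, used only as an example):
    #   doc://com.apple.documentation/documentation/swiftui/view
    #   -> https://developer.apple.com/documentation/swiftui/view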

    def create_markdown_anchor(self, title: str) -> str:
        """Create a markdown anchor from a title (GitHub-style)"""
        # Convert to lowercase, replace spaces and special chars with hyphens
        # This follows GitHub's automatic header anchor generation
        anchor = re.sub(r'[^\w\s-]', '', title.lower())
        anchor = re.sub(r'[-\s]+', '-', anchor)
        return anchor.strip('-')
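    # Example anchors produced by the regexes above (titles are illustrative):
    #   "Drawing and Graphics" -> "drawing-and-graphics"
    #   "init(frame:)"         -> "initframe"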

    def resolve_reference_link(self, identifier: str) -> str:
        """Resolve a doc:// reference to a markdown anchor link"""
        # Check if we have reference data for this identifier
        ref_data = self.all_references.get(identifier, {})
        title = ref_data.get('title', '')
        if title and title in self.title_to_anchor:
            anchor = self.title_to_anchor[title]
            return f"[{title}](#{anchor})"
        elif title:
            # Create anchor from title even if we haven't seen it yet
            anchor = self.create_markdown_anchor(title)
            return f"[{title}](#{anchor})"
        else:
            # Fallback to the last part of the identifier
            fallback_title = identifier.split('/')[-1]
            return f"[{fallback_title}](#)"

    def parse_inline_content(self, content: List[Dict]) -> str:
        """Parse inline content array to markdown text"""
        parts = []
        for item in content:
            if item.get('type') == 'text':
                parts.append(item.get('text', ''))
            elif item.get('type') == 'reference':
                # Convert references to internal markdown links
                identifier = item.get('identifier', '')
                parts.append(self.resolve_reference_link(identifier))
            elif item.get('type') == 'codeVoice':
                text = item.get('code', '')
                parts.append(f"`{text}`")
        return ''.join(parts)

    def parse_content_item(self, item: Dict) -> str:
        """Parse a content item to markdown"""
        item_type = item.get('type', '')
        if item_type == 'heading':
            level = item.get('level', 2)
            text = item.get('text', '')
            return f"\n{'#' * level} {text}\n"
        elif item_type == 'paragraph':
            content = item.get('inlineContent', [])
            return f"{self.parse_inline_content(content)}\n\n"
        elif item_type == 'codeListing':
            code = item.get('code', [])
            language = item.get('syntax', '')
            code_text = '\n'.join(code) if isinstance(code, list) else str(code)
            return f"\n```{language}\n{code_text}\n```\n\n"
        elif item_type == 'unorderedList':
            items = []
            for list_item in item.get('items', []):
                content = list_item.get('content', [])
                for c in content:
                    if c.get('type') == 'paragraph':
                        text = self.parse_inline_content(c.get('inlineContent', []))
                        items.append(f"- {text}")
            return '\n'.join(items) + '\n\n'
        elif item_type == 'orderedList':
            items = []
            for i, list_item in enumerate(item.get('items', []), 1):
                content = list_item.get('content', [])
                for c in content:
                    if c.get('type') == 'paragraph':
                        text = self.parse_inline_content(c.get('inlineContent', []))
                        items.append(f"{i}. {text}")
            return '\n'.join(items) + '\n\n'
        return ''
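    # Illustrative input/output (item shape inferred from the branches above):
    #   {"type": "codeListing", "syntax": "swift", "code": ["let x = 1"]}
    #   -> "\n```swift\nlet x = 1\n```\n\n"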

    def json_to_markdown(self, data: Dict, url: str) -> str:
        """Convert JSON documentation to markdown"""
        lines = []
        # Store all references from this page
        references = data.get('references', {})
        self.all_references.update(references)
        # Title and metadata
        metadata = data.get('metadata', {})
        title = metadata.get('title', 'Untitled')
        role = metadata.get('role', '')
        # Create anchor for this title
        anchor = self.create_markdown_anchor(title)
        self.title_to_anchor[title] = anchor
        lines.append(f"# {title}")
        if role:
            lines.append(f"*{role}*")
        lines.append(f"\n**Source:** {url}\n")
        # Abstract/summary
        abstract = data.get('abstract', [])
        if abstract:
            lines.append("## Summary")
            abstract_text = self.parse_inline_content(abstract)
            lines.append(f"{abstract_text}\n")
        # Main content sections
        for section in data.get('primaryContentSections', []):
            if section.get('kind') == 'content':
                for item in section.get('content', []):
                    lines.append(self.parse_content_item(item))
        # Topic sections (methods, properties, etc.)
        for section in data.get('topicSections', []):
            section_title = section.get('title', 'Topics')
            lines.append(f"\n## {section_title}\n")
            for identifier in section.get('identifiers', []):
                # Get the reference details
                ref_data = references.get(identifier, {})
                ref_title = ref_data.get('title', identifier.split('/')[-1])
                ref_abstract = ref_data.get('abstract', [])
                # Create anchor for this subsection
                sub_anchor = self.create_markdown_anchor(ref_title)
                self.title_to_anchor[ref_title] = sub_anchor
                lines.append(f"### {ref_title}")
                if ref_abstract:
                    abstract_text = self.parse_inline_content(ref_abstract)
                    lines.append(f"{abstract_text}\n")
                else:
                    lines.append("")
        # See also section
        see_also = data.get('seeAlsoSections', [])
        if see_also:
            lines.append("\n## See Also\n")
            for section in see_also:
                section_title = section.get('title', '')
                if section_title:
                    lines.append(f"### {section_title}")
                for identifier in section.get('identifiers', []):
                    ref_data = references.get(identifier, {})
                    ref_title = ref_data.get('title', identifier.split('/')[-1])
                    # Create clickable internal link
                    if ref_title in self.title_to_anchor:
                        anchor = self.title_to_anchor[ref_title]
                        lines.append(f"- [{ref_title}](#{anchor})")
                    else:
                        lines.append(f"- {ref_title}")
        return '\n'.join(lines)
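    # Rough shape of the markdown produced for a single page:
    #   # <title>            plus *<role>* and the **Source:** line
    #   ## Summary           if the page has an abstract
    #   ## <topic section>   with ### <symbol> entries
    #   ## See Also          with internal anchor links where known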

    def scrape_recursive(self, start_url: str, max_depth: int = 3, current_depth: int = 0) -> None:
        """Recursively scrape documentation starting from a URL"""
        if current_depth > max_depth or start_url in self.processed_urls:
            return
        print(f"{' ' * current_depth}Processing: {start_url}")
        self.processed_urls.add(start_url)
        # Convert to API URL and fetch
        api_url = self.url_to_api_url(start_url)
        data = self.fetch_json(api_url)
        if not data:
            print(f"{' ' * current_depth} No data found")
            return
        # Convert to markdown and store
        markdown_content = self.json_to_markdown(data, start_url)
        # Store this section
        self.content_sections.append({
            'url': start_url,
            'title': data.get('metadata', {}).get('title', 'Untitled'),
            'content': markdown_content,
            'depth': current_depth
        })
        # Get linked pages
        links = self.extract_links(data)
        # Filter links to stay within the same section
        filtered_links = []
        start_path_parts = urlparse(start_url).path.split('/')[1:3]  # Get /documentation/framework
        for link in links:
            link_path_parts = urlparse(link).path.split('/')[1:3]
            # Only include links from the same framework/section
            if link_path_parts == start_path_parts:
                filtered_links.append(link)
        print(f"{' ' * current_depth} Found {len(filtered_links)} related links")
        # Recursively process linked pages
        for link in filtered_links[:10]:  # Limit to prevent explosion
            self.scrape_recursive(link, max_depth, current_depth + 1)
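    # Illustrative filter (example URL assumed): for
    #   https://developer.apple.com/documentation/swiftui/view
    # start_path_parts == ['documentation', 'swiftui'], so only links under
    # /documentation/swiftui/... are followed.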

    def save_combined_markdown(self, output_file: str) -> None:
        """Save all scraped content to a single markdown file"""
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("# Apple Documentation Export\n\n")
            f.write(f"Generated from {len(self.content_sections)} pages\n\n")
            f.write("---\n\n")
            # Sort by depth and title
            sorted_sections = sorted(self.content_sections, key=lambda x: (x['depth'], x['title']))
            for section in sorted_sections:
                f.write(section['content'])
                f.write("\n\n---\n\n")
        print(f"Saved combined documentation to: {output_file}")


def main():
    parser = argparse.ArgumentParser(description='Scrape Apple documentation to markdown')
    parser.add_argument('url', help='Apple documentation URL to scrape')
    parser.add_argument('-o', '--output', default='apple_docs.md', help='Output markdown file')
    parser.add_argument('-d', '--depth', type=int, default=2, help='Maximum recursion depth')
    parser.add_argument('-r', '--rate-limit', type=float, default=0.1, help='Rate limit between requests (seconds)')
    args = parser.parse_args()
    # Validate URL
    if not args.url.startswith('https://developer.apple.com/documentation/'):
        print("Error: URL must be an Apple documentation URL")
        return
    scraper = AppleDocsScraper(rate_limit=args.rate_limit)
    print(f"Starting scrape of: {args.url}")
    print(f"Max depth: {args.depth}")
    print(f"Rate limit: {args.rate_limit}s between requests")
    print()
    scraper.scrape_recursive(args.url, max_depth=args.depth)
    print(f"\nScraped {len(scraper.content_sections)} pages")
    scraper.save_combined_markdown(args.output)


if __name__ == "__main__":
    main()
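# Example invocations (illustrative; assumes the file is saved as apple_docs_scraper.py):
#   python3 apple_docs_scraper.py https://developer.apple.com/documentation/swiftui
#   python3 apple_docs_scraper.py https://developer.apple.com/documentation/foundation -o foundation.md -d 1 -r 0.5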