Apple Docs Scraper
#!/usr/bin/env python3
"""
Apple Documentation Scraper
Converts Apple documentation sections to markdown files recursively
"""

import requests
import json
import time
import re
from pathlib import Path
from urllib.parse import urlparse, urljoin
from typing import Set, Dict, List
import argparse


class AppleDocsScraper:
    def __init__(self, base_url: str = "https://developer.apple.com", rate_limit: float = 0.1):
        self.base_url = base_url
        self.api_base = f"{base_url}/tutorials/data"
        self.rate_limit = rate_limit
        self.session = requests.Session()
        self.processed_urls: Set[str] = set()
        self.content_sections: List[Dict] = []
        self.all_references: Dict[str, Dict] = {}  # Store all references across all pages
        self.title_to_anchor: Dict[str, str] = {}  # Map titles to markdown anchors

    def url_to_api_url(self, doc_url: str) -> str:
        """Convert documentation URL to API URL"""
        # Extract path from documentation URL
        parsed = urlparse(doc_url)
        path = parsed.path
        # Convert to API format
        api_url = f"{self.api_base}{path}.json"
        return api_url

    def fetch_json(self, api_url: str) -> Dict:
        """Fetch JSON from Apple's API with rate limiting"""
        time.sleep(self.rate_limit)
        try:
            response = self.session.get(api_url, timeout=30)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            print(f"Failed to fetch {api_url}: {e}")
            return {}

    def extract_links(self, data: Dict) -> List[str]:
        """Extract all documentation links from JSON data"""
        links = []

        # Check topic sections for sub-pages
        for section in data.get('topicSections', []):
            for identifier in section.get('identifiers', []):
                # Convert doc:// URLs to web URLs
                if identifier.startswith('doc://'):
                    # Extract the path part
                    parts = identifier.split('/')
                    if len(parts) >= 4:  # doc://domain/documentation/framework/...
                        path_parts = parts[3:]  # Skip doc://domain
                        web_path = '/' + '/'.join(path_parts)
                        web_url = f"{self.base_url}{web_path}"
                        links.append(web_url)

        # Check references section
        references = data.get('references', {})
        for ref_id, ref_data in references.items():
            if isinstance(ref_data, dict) and 'url' in ref_data:
                url = ref_data['url']
                if url.startswith('/documentation/'):
                    web_url = f"{self.base_url}{url}"
                    links.append(web_url)

        # Check see also sections
        for section in data.get('seeAlsoSections', []):
            for identifier in section.get('identifiers', []):
                if identifier.startswith('doc://'):
                    parts = identifier.split('/')
                    if len(parts) >= 4:
                        path_parts = parts[3:]
                        web_path = '/' + '/'.join(path_parts)
                        web_url = f"{self.base_url}{web_path}"
                        links.append(web_url)

        return list(set(links))  # Remove duplicates

    def create_markdown_anchor(self, title: str) -> str:
        """Create a markdown anchor from a title (GitHub-style)"""
        # Convert to lowercase, replace spaces and special chars with hyphens
        # This follows GitHub's automatic header anchor generation
        anchor = re.sub(r'[^\w\s-]', '', title.lower())
        anchor = re.sub(r'[-\s]+', '-', anchor)
        return anchor.strip('-')

    def resolve_reference_link(self, identifier: str) -> str:
        """Resolve a doc:// reference to a markdown anchor link"""
        # Check if we have reference data for this identifier
        ref_data = self.all_references.get(identifier, {})
        title = ref_data.get('title', '')

        if title and title in self.title_to_anchor:
            anchor = self.title_to_anchor[title]
            return f"[{title}](#{anchor})"
        elif title:
            # Create anchor from title even if we haven't seen it yet
            anchor = self.create_markdown_anchor(title)
            return f"[{title}](#{anchor})"
        else:
            # Fall back to the last part of the identifier
            fallback_title = identifier.split('/')[-1]
            return f"[{fallback_title}](#)"

    def parse_inline_content(self, content: List[Dict]) -> str:
        """Parse an inline content array to markdown text"""
        parts = []
        for item in content:
            if item.get('type') == 'text':
                parts.append(item.get('text', ''))
            elif item.get('type') == 'reference':
                # Convert references to internal markdown links
                identifier = item.get('identifier', '')
                parts.append(self.resolve_reference_link(identifier))
            elif item.get('type') == 'codeVoice':
                text = item.get('code', '')
                parts.append(f"`{text}`")
        return ''.join(parts)

    def parse_content_item(self, item: Dict) -> str:
        """Parse a content item to markdown"""
        item_type = item.get('type', '')

        if item_type == 'heading':
            level = item.get('level', 2)
            text = item.get('text', '')
            return f"\n{'#' * level} {text}\n"

        elif item_type == 'paragraph':
            content = item.get('inlineContent', [])
            return f"{self.parse_inline_content(content)}\n\n"

        elif item_type == 'codeListing':
            code = item.get('code', [])
            language = item.get('syntax', '')
            code_text = '\n'.join(code) if isinstance(code, list) else str(code)
            return f"\n```{language}\n{code_text}\n```\n\n"

        elif item_type == 'unorderedList':
            items = []
            for list_item in item.get('items', []):
                content = list_item.get('content', [])
                for c in content:
                    if c.get('type') == 'paragraph':
                        text = self.parse_inline_content(c.get('inlineContent', []))
                        items.append(f"- {text}")
            return '\n'.join(items) + '\n\n'

        elif item_type == 'orderedList':
            items = []
            for i, list_item in enumerate(item.get('items', []), 1):
                content = list_item.get('content', [])
                for c in content:
                    if c.get('type') == 'paragraph':
                        text = self.parse_inline_content(c.get('inlineContent', []))
                        items.append(f"{i}. {text}")
            return '\n'.join(items) + '\n\n'

        return ''

    def json_to_markdown(self, data: Dict, url: str) -> str:
        """Convert JSON documentation to markdown"""
        lines = []

        # Store all references from this page
        references = data.get('references', {})
        self.all_references.update(references)

        # Title and metadata
        metadata = data.get('metadata', {})
        title = metadata.get('title', 'Untitled')
        role = metadata.get('role', '')

        # Create anchor for this title
        anchor = self.create_markdown_anchor(title)
        self.title_to_anchor[title] = anchor

        lines.append(f"# {title}")
        if role:
            lines.append(f"*{role}*")
        lines.append(f"\n**Source:** {url}\n")

        # Abstract/summary
        abstract = data.get('abstract', [])
        if abstract:
            lines.append("## Summary")
            abstract_text = self.parse_inline_content(abstract)
            lines.append(f"{abstract_text}\n")

        # Main content sections
        for section in data.get('primaryContentSections', []):
            if section.get('kind') == 'content':
                for item in section.get('content', []):
                    lines.append(self.parse_content_item(item))

        # Topic sections (methods, properties, etc.)
        for section in data.get('topicSections', []):
            section_title = section.get('title', 'Topics')
            lines.append(f"\n## {section_title}\n")

            for identifier in section.get('identifiers', []):
                # Get the reference details
                ref_data = references.get(identifier, {})
                ref_title = ref_data.get('title', identifier.split('/')[-1])
                ref_abstract = ref_data.get('abstract', [])

                # Create anchor for this subsection
                sub_anchor = self.create_markdown_anchor(ref_title)
                self.title_to_anchor[ref_title] = sub_anchor

                lines.append(f"### {ref_title}")
                if ref_abstract:
                    abstract_text = self.parse_inline_content(ref_abstract)
                    lines.append(f"{abstract_text}\n")
                else:
                    lines.append("")

        # See also section
        see_also = data.get('seeAlsoSections', [])
        if see_also:
            lines.append("\n## See Also\n")
            for section in see_also:
                section_title = section.get('title', '')
                if section_title:
                    lines.append(f"### {section_title}")
                for identifier in section.get('identifiers', []):
                    ref_data = references.get(identifier, {})
                    ref_title = ref_data.get('title', identifier.split('/')[-1])
                    # Create clickable internal link
                    if ref_title in self.title_to_anchor:
                        anchor = self.title_to_anchor[ref_title]
                        lines.append(f"- [{ref_title}](#{anchor})")
                    else:
                        lines.append(f"- {ref_title}")

        return '\n'.join(lines)

    def scrape_recursive(self, start_url: str, max_depth: int = 3, current_depth: int = 0) -> None:
        """Recursively scrape documentation starting from a URL"""
        if current_depth > max_depth or start_url in self.processed_urls:
            return

        print(f"{' ' * current_depth}Processing: {start_url}")
        self.processed_urls.add(start_url)

        # Convert to API URL and fetch
        api_url = self.url_to_api_url(start_url)
        data = self.fetch_json(api_url)

        if not data:
            print(f"{' ' * current_depth} No data found")
            return

        # Convert to markdown and store
        markdown_content = self.json_to_markdown(data, start_url)

        # Store this section
        self.content_sections.append({
            'url': start_url,
            'title': data.get('metadata', {}).get('title', 'Untitled'),
            'content': markdown_content,
            'depth': current_depth
        })

        # Get linked pages
        links = self.extract_links(data)

        # Filter links to stay within the same section
        filtered_links = []
        start_path_parts = urlparse(start_url).path.split('/')[1:3]  # Get /documentation/framework

        for link in links:
            link_path_parts = urlparse(link).path.split('/')[1:3]
            # Only include links from the same framework/section
            if link_path_parts == start_path_parts:
                filtered_links.append(link)

        print(f"{' ' * current_depth} Found {len(filtered_links)} related links")

        # Recursively process linked pages
        for link in filtered_links[:10]:  # Limit to prevent explosion
            self.scrape_recursive(link, max_depth, current_depth + 1)

    def save_combined_markdown(self, output_file: str) -> None:
        """Save all scraped content to a single markdown file"""
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("# Apple Documentation Export\n\n")
            f.write(f"Generated from {len(self.content_sections)} pages\n\n")
            f.write("---\n\n")

            # Sort by depth and title
            sorted_sections = sorted(self.content_sections, key=lambda x: (x['depth'], x['title']))

            for section in sorted_sections:
                f.write(section['content'])
                f.write("\n\n---\n\n")

        print(f"Saved combined documentation to: {output_file}")


def main():
    parser = argparse.ArgumentParser(description='Scrape Apple documentation to markdown')
    parser.add_argument('url', help='Apple documentation URL to scrape')
    parser.add_argument('-o', '--output', default='apple_docs.md', help='Output markdown file')
    parser.add_argument('-d', '--depth', type=int, default=2, help='Maximum recursion depth')
    parser.add_argument('-r', '--rate-limit', type=float, default=0.1, help='Rate limit between requests (seconds)')

    args = parser.parse_args()

    # Validate URL
    if not args.url.startswith('https://developer.apple.com/documentation/'):
        print("Error: URL must be an Apple documentation URL")
        return

    scraper = AppleDocsScraper(rate_limit=args.rate_limit)

    print(f"Starting scrape of: {args.url}")
    print(f"Max depth: {args.depth}")
    print(f"Rate limit: {args.rate_limit}s between requests")
    print()

    scraper.scrape_recursive(args.url, max_depth=args.depth)

    print(f"\nScraped {len(scraper.content_sections)} pages")
    scraper.save_combined_markdown(args.output)


if __name__ == "__main__":
    main()