Apple Docs Scraper
#!/usr/bin/env python3
"""
Apple Documentation Scraper
Converts Apple documentation sections to markdown files recursively
"""

import requests
import json
import time
import re
from pathlib import Path
from urllib.parse import urlparse, urljoin
from typing import Set, Dict, List
import argparse


class AppleDocsScraper:
    def __init__(self, base_url: str = "https://developer.apple.com", rate_limit: float = 0.1):
        self.base_url = base_url
        self.api_base = f"{base_url}/tutorials/data"
        self.rate_limit = rate_limit
        self.session = requests.Session()
        self.processed_urls: Set[str] = set()
        self.content_sections: List[Dict] = []
        self.all_references: Dict[str, Dict] = {}  # Store all references across all pages
        self.title_to_anchor: Dict[str, str] = {}  # Map titles to markdown anchors

    def url_to_api_url(self, doc_url: str) -> str:
        """Convert documentation URL to API URL"""
        # Extract path from documentation URL
        parsed = urlparse(doc_url)
        path = parsed.path
        # Convert to API format
        api_url = f"{self.api_base}{path}.json"
        return api_url

    def fetch_json(self, api_url: str) -> Dict:
        """Fetch JSON from Apple's API with rate limiting"""
        time.sleep(self.rate_limit)
        try:
            response = self.session.get(api_url, timeout=30)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            print(f"Failed to fetch {api_url}: {e}")
            return {}

    def extract_links(self, data: Dict) -> List[str]:
        """Extract all documentation links from JSON data"""
        links = []

        # Check topic sections for sub-pages
        for section in data.get('topicSections', []):
            for identifier in section.get('identifiers', []):
                # Convert doc:// URLs to web URLs
                if identifier.startswith('doc://'):
                    # Extract the path part
                    parts = identifier.split('/')
                    if len(parts) >= 4:  # doc://domain/documentation/framework/...
                        path_parts = parts[3:]  # Skip doc://domain
                        web_path = '/' + '/'.join(path_parts)
                        web_url = f"{self.base_url}{web_path}"
                        links.append(web_url)

        # Check references section
        references = data.get('references', {})
        for ref_id, ref_data in references.items():
            if isinstance(ref_data, dict) and 'url' in ref_data:
                url = ref_data['url']
                if url.startswith('/documentation/'):
                    web_url = f"{self.base_url}{url}"
                    links.append(web_url)

        # Check see also sections
        for section in data.get('seeAlsoSections', []):
            for identifier in section.get('identifiers', []):
                if identifier.startswith('doc://'):
                    parts = identifier.split('/')
                    if len(parts) >= 4:
                        path_parts = parts[3:]
                        web_path = '/' + '/'.join(path_parts)
                        web_url = f"{self.base_url}{web_path}"
                        links.append(web_url)

        return list(set(links))  # Remove duplicates

    def create_markdown_anchor(self, title: str) -> str:
        """Create a markdown anchor from a title (GitHub-style)"""
        # Convert to lowercase, replace spaces and special chars with hyphens
        # This follows GitHub's automatic header anchor generation
        anchor = re.sub(r'[^\w\s-]', '', title.lower())
        anchor = re.sub(r'[-\s]+', '-', anchor)
        return anchor.strip('-')

    def resolve_reference_link(self, identifier: str) -> str:
        """Resolve a doc:// reference to a markdown anchor link"""
        # Check if we have reference data for this identifier
        ref_data = self.all_references.get(identifier, {})
        title = ref_data.get('title', '')

        if title and title in self.title_to_anchor:
            anchor = self.title_to_anchor[title]
            return f"[{title}](#{anchor})"
        elif title:
            # Create anchor from title even if we haven't seen it yet
            anchor = self.create_markdown_anchor(title)
            return f"[{title}](#{anchor})"
        else:
            # Fall back to the last part of the identifier
            fallback_title = identifier.split('/')[-1]
            return f"[{fallback_title}](#)"

    def parse_inline_content(self, content: List[Dict]) -> str:
        """Parse an inline content array to markdown text"""
        parts = []
        for item in content:
            if item.get('type') == 'text':
                parts.append(item.get('text', ''))
            elif item.get('type') == 'reference':
                # Convert references to internal markdown links
                identifier = item.get('identifier', '')
                parts.append(self.resolve_reference_link(identifier))
            elif item.get('type') == 'codeVoice':
                text = item.get('code', '')
                parts.append(f"`{text}`")
        return ''.join(parts)

    def parse_content_item(self, item: Dict) -> str:
        """Parse a content item to markdown"""
        item_type = item.get('type', '')

        if item_type == 'heading':
            level = item.get('level', 2)
            text = item.get('text', '')
            return f"\n{'#' * level} {text}\n"

        elif item_type == 'paragraph':
            content = item.get('inlineContent', [])
            return f"{self.parse_inline_content(content)}\n\n"

        elif item_type == 'codeListing':
            code = item.get('code', [])
            language = item.get('syntax', '')
            code_text = '\n'.join(code) if isinstance(code, list) else str(code)
            return f"\n```{language}\n{code_text}\n```\n\n"

        elif item_type == 'unorderedList':
            items = []
            for list_item in item.get('items', []):
                content = list_item.get('content', [])
                for c in content:
                    if c.get('type') == 'paragraph':
                        text = self.parse_inline_content(c.get('inlineContent', []))
                        items.append(f"- {text}")
            return '\n'.join(items) + '\n\n'

        elif item_type == 'orderedList':
            items = []
            for i, list_item in enumerate(item.get('items', []), 1):
                content = list_item.get('content', [])
                for c in content:
                    if c.get('type') == 'paragraph':
                        text = self.parse_inline_content(c.get('inlineContent', []))
                        items.append(f"{i}. {text}")
            return '\n'.join(items) + '\n\n'

        return ''

    def json_to_markdown(self, data: Dict, url: str) -> str:
        """Convert JSON documentation to markdown"""
        lines = []

        # Store all references from this page
        references = data.get('references', {})
        self.all_references.update(references)

        # Title and metadata
        metadata = data.get('metadata', {})
        title = metadata.get('title', 'Untitled')
        role = metadata.get('role', '')

        # Create anchor for this title
        anchor = self.create_markdown_anchor(title)
        self.title_to_anchor[title] = anchor

        lines.append(f"# {title}")
        if role:
            lines.append(f"*{role}*")
        lines.append(f"\n**Source:** {url}\n")

        # Abstract/summary
        abstract = data.get('abstract', [])
        if abstract:
            lines.append("## Summary")
            abstract_text = self.parse_inline_content(abstract)
            lines.append(f"{abstract_text}\n")

        # Main content sections
        for section in data.get('primaryContentSections', []):
            if section.get('kind') == 'content':
                for item in section.get('content', []):
                    lines.append(self.parse_content_item(item))

        # Topic sections (methods, properties, etc.)
        for section in data.get('topicSections', []):
            section_title = section.get('title', 'Topics')
            lines.append(f"\n## {section_title}\n")

            for identifier in section.get('identifiers', []):
                # Get the reference details
                ref_data = references.get(identifier, {})
                ref_title = ref_data.get('title', identifier.split('/')[-1])
                ref_abstract = ref_data.get('abstract', [])

                # Create anchor for this subsection
                sub_anchor = self.create_markdown_anchor(ref_title)
                self.title_to_anchor[ref_title] = sub_anchor

                lines.append(f"### {ref_title}")
                if ref_abstract:
                    abstract_text = self.parse_inline_content(ref_abstract)
                    lines.append(f"{abstract_text}\n")
                else:
                    lines.append("")

        # See also section
        see_also = data.get('seeAlsoSections', [])
        if see_also:
            lines.append("\n## See Also\n")
            for section in see_also:
                section_title = section.get('title', '')
                if section_title:
                    lines.append(f"### {section_title}")
                for identifier in section.get('identifiers', []):
                    ref_data = references.get(identifier, {})
                    ref_title = ref_data.get('title', identifier.split('/')[-1])
                    # Create clickable internal link
                    if ref_title in self.title_to_anchor:
                        anchor = self.title_to_anchor[ref_title]
                        lines.append(f"- [{ref_title}](#{anchor})")
                    else:
                        lines.append(f"- {ref_title}")

        return '\n'.join(lines)

    def scrape_recursive(self, start_url: str, max_depth: int = 3, current_depth: int = 0) -> None:
        """Recursively scrape documentation starting from a URL"""
        if current_depth > max_depth or start_url in self.processed_urls:
            return

        print(f"{' ' * current_depth}Processing: {start_url}")
        self.processed_urls.add(start_url)

        # Convert to API URL and fetch
        api_url = self.url_to_api_url(start_url)
        data = self.fetch_json(api_url)

        if not data:
            print(f"{' ' * current_depth} No data found")
            return

        # Convert to markdown and store
        markdown_content = self.json_to_markdown(data, start_url)

        # Store this section
        self.content_sections.append({
            'url': start_url,
            'title': data.get('metadata', {}).get('title', 'Untitled'),
            'content': markdown_content,
            'depth': current_depth
        })

        # Get linked pages
        links = self.extract_links(data)

        # Filter links to stay within the same section
        filtered_links = []
        start_path_parts = urlparse(start_url).path.split('/')[1:3]  # Get /documentation/framework

        for link in links:
            link_path_parts = urlparse(link).path.split('/')[1:3]
            # Only include links from the same framework/section
            if link_path_parts == start_path_parts:
                filtered_links.append(link)

        print(f"{' ' * current_depth} Found {len(filtered_links)} related links")

        # Recursively process linked pages
        for link in filtered_links[:10]:  # Limit to prevent explosion
            self.scrape_recursive(link, max_depth, current_depth + 1)

    def save_combined_markdown(self, output_file: str) -> None:
        """Save all scraped content to a single markdown file"""
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("# Apple Documentation Export\n\n")
            f.write(f"Generated from {len(self.content_sections)} pages\n\n")
            f.write("---\n\n")

            # Sort by depth and title
            sorted_sections = sorted(self.content_sections, key=lambda x: (x['depth'], x['title']))

            for section in sorted_sections:
                f.write(section['content'])
                f.write("\n\n---\n\n")

        print(f"Saved combined documentation to: {output_file}")


def main():
    parser = argparse.ArgumentParser(description='Scrape Apple documentation to markdown')
    parser.add_argument('url', help='Apple documentation URL to scrape')
    parser.add_argument('-o', '--output', default='apple_docs.md', help='Output markdown file')
    parser.add_argument('-d', '--depth', type=int, default=2, help='Maximum recursion depth')
    parser.add_argument('-r', '--rate-limit', type=float, default=0.1, help='Rate limit between requests (seconds)')

    args = parser.parse_args()

    # Validate URL
    if not args.url.startswith('https://developer.apple.com/documentation/'):
        print("Error: URL must be an Apple documentation URL")
        return

    scraper = AppleDocsScraper(rate_limit=args.rate_limit)

    print(f"Starting scrape of: {args.url}")
    print(f"Max depth: {args.depth}")
    print(f"Rate limit: {args.rate_limit}s between requests")
    print()

    scraper.scrape_recursive(args.url, max_depth=args.depth)

    print(f"\nScraped {len(scraper.content_sections)} pages")
    scraper.save_combined_markdown(args.output)


if __name__ == "__main__":
    main()