PolicyEngine-US Source Document Reference Analysis
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PolicyEngine-US Source Document Reference Analysis\n",
"\n",
"This notebook analyzes the number of distinct source document references in the PolicyEngine-US codebase, both with and without page number fragments."
]
},
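{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick illustration of the distinction (using a hypothetical URL rather than one taken from the repository): two citations of the same PDF that point at different pages count as two references *with* page fragments, but as a single document once the `#page=` fragment is stripped."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical URLs, for illustration only -- not taken from the codebase\n",
"refs = [\n",
"    \"https://www.example.gov/pub/doc123.pdf#page=3\",\n",
"    \"https://www.example.gov/pub/doc123.pdf#page=7\",\n",
"]\n",
"print(\"Distinct references with page fragments:\", len(set(refs)))\n",
"print(\"Distinct documents without page fragments:\", len({r.split('#page=')[0] for r in refs}))"
]
},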
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"from pathlib import Path\n",
"from collections import defaultdict\n",
"from urllib.parse import urlparse, urlunparse\n",
"import json\n",
"import yaml"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set the base path to PolicyEngine-US repository\n",
"BASE_PATH = Path(\"/Users/maxghenis/PolicyEngine/policyengine-us\")\n",
"\n",
"# Verify the path exists\n",
"if not BASE_PATH.exists():\n",
" print(f\"Error: Path {BASE_PATH} does not exist\")\n",
"else:\n",
" print(f\"Analyzing PolicyEngine-US repository at: {BASE_PATH}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def extract_urls_from_text(text):\n",
" \"\"\"Extract URLs from text content.\"\"\"\n",
" # Match URLs starting with http/https\n",
" url_pattern = r'https?://[^\\s<>\"\\']+'
n",
" urls = re.findall(url_pattern, text)\n",
" return urls\n",
"\n",
"def extract_legal_citations(text):\n",
" \"\"\"Extract legal citations like USC, CFR, state codes.\"\"\"\n",
" citations = []\n",
" \n",
" # USC citations (e.g., \"26 USC 32\", \"26 U.S.C. Β§ 32\")\n",
" usc_patterns = [\n",
" r'\\d+\\s+U\\.?S\\.?C\\.?\\s+Β§?\\s*\\d+',\n",
" r'\\d+\\s+USC\\s+\\d+',\n",
" ]\n",
" \n",
" # CFR citations (e.g., \"26 CFR 1.32-1\")\n",
" cfr_pattern = r'\\d+\\s+C\\.?F\\.?R\\.?\\s+[\\d\\.\\-]+'\n",
" \n",
" # State code patterns\n",
" state_patterns = [\n",
" r'\\b[A-Z]{2,}\\s+Rev\\.?\\s+Code\\s+[\\d\\.\\-]+', # e.g., \"WA Rev Code 123\"\n",
" r'\\b[A-Z]{2,}\\s+Stat\\.?\\s+[\\d\\.\\-]+', # e.g., \"CA Stat 123\"\n",
" ]\n",
" \n",
" for pattern in usc_patterns + [cfr_pattern] + state_patterns:\n",
" citations.extend(re.findall(pattern, text, re.IGNORECASE))\n",
" \n",
" return citations\n",
"\n",
"def remove_page_fragment(url):\n",
" \"\"\"Remove #page=X fragments from URLs.\"\"\"\n",
" # Remove everything after # for page fragments\n",
" if '#page=' in url:\n",
" return url.split('#page=')[0]\n",
" return url"
]
},
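{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the three helpers on a made-up snippet. The text, citations, and URLs below are illustrative only and are not taken from the repository; the cell simply shows what the extraction and page-fragment stripping return."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative snippet only -- not taken from the PolicyEngine-US codebase\n",
"sample = (\n",
"    \"See 26 U.S.C. § 32 and 26 CFR 1.32-1, summarized at \"\n",
"    \"https://www.example.gov/pub/doc123.pdf#page=3 and \"\n",
"    \"https://www.example.gov/pub/doc123.pdf#page=7 for details.\"\n",
")\n",
"\n",
"urls = extract_urls_from_text(sample)\n",
"print(\"URLs found:\", urls)\n",
"print(\"Legal citations found:\", extract_legal_citations(sample))\n",
"print(\"Unique documents after stripping #page=:\", {remove_page_fragment(u) for u in urls})"
]
},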
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Initialize counters\n",
"all_references_with_pages = set()\n",
"all_references_without_pages = set()\n",
"url_references = set()\n",
"legal_citations = set()\n",
"pdf_references = set()\n",
"gov_urls = set()\n",
"\n",
"# Statistics\n",
"files_processed = 0\n",
"files_with_references = 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Process Python files in variables/ and parameters/ directories\n",
"target_dirs = ['policyengine_us/parameters', 'policyengine_us/variables']\n",
"\n",
"for target_dir in target_dirs:\n",
" dir_path = BASE_PATH / target_dir\n",
" if not dir_path.exists():\n",
" print(f\"Warning: {dir_path} does not exist\")\n",
" continue\n",
" \n",
" print(f\"\\nProcessing {target_dir}...\")\n",
" \n",
" # Process all Python and YAML files\n",
" for file_path in dir_path.rglob('*'):\n",
" if file_path.is_file() and (file_path.suffix in ['.py', '.yaml', '.yml']):\n",
" files_processed += 1\n",
" file_has_references = False\n",
" \n",
" try:\n",
" content = file_path.read_text(encoding='utf-8', errors='ignore')\n",
" \n",
" # Extract URLs\n",
" urls = extract_urls_from_text(content)\n",
" for url in urls:\n",
" file_has_references = True\n",
" all_references_with_pages.add(url)\n",
" \n",
" # Track PDFs specifically\n",
" if '.pdf' in url.lower():\n",
" pdf_references.add(url)\n",
" \n",
" # Track .gov URLs\n",
" if '.gov' in url:\n",
" gov_urls.add(url)\n",
" \n",
" # Add version without page numbers\n",
" url_without_page = remove_page_fragment(url)\n",
" all_references_without_pages.add(url_without_page)\n",
" url_references.add(url_without_page)\n",
" \n",
" # Extract legal citations\n",
" citations = extract_legal_citations(content)\n",
" for citation in citations:\n",
" file_has_references = True\n",
" all_references_with_pages.add(citation)\n",
" all_references_without_pages.add(citation)\n",
" legal_citations.add(citation)\n",
" \n",
" # For YAML files, also check for 'reference', 'source', 'documentation' fields\n",
" if file_path.suffix in ['.yaml', '.yml']:\n",
" try:\n",
" data = yaml.safe_load(content)\n",
" if data and isinstance(data, dict):\n",
" # Recursively search for reference fields\n",
" def search_references(obj, path=\"\"):\n",
" if isinstance(obj, dict):\n",
" for key, value in obj.items():\n",
" if key in ['reference', 'references', 'source', 'documentation', 'href', 'url']:\n",
" if isinstance(value, str):\n",
" if value.startswith('http'):\n",
" all_references_with_pages.add(value)\n",
" all_references_without_pages.add(remove_page_fragment(value))\n",
" file_has_references = True\n",
" elif isinstance(value, list):\n",
" for item in value:\n",
" if isinstance(item, dict) and 'href' in item:\n",
" href = item['href']\n",
" all_references_with_pages.add(href)\n",
" all_references_without_pages.add(remove_page_fragment(href))\n",
" file_has_references = True\n",
" else:\n",
" search_references(value, f\"{path}.{key}\")\n",
" elif isinstance(obj, list):\n",
" for item in obj:\n",
" search_references(item, path)\n",
" \n",
" search_references(data)\n",
" except yaml.YAMLError:\n",
" pass # Skip invalid YAML files\n",
" \n",
" if file_has_references:\n",
" files_with_references += 1\n",
" \n",
" except Exception as e:\n",
" # Skip files that can't be read\n",
" pass\n",
"\n",
"print(f\"\\nProcessed {files_processed} files\")\n",
"print(f\"Files with references: {files_with_references}\")"
]
},
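{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the reference shapes the `search_references` walker above handles: a plain URL string under a key such as `documentation`, and a list of `{title, href}` entries under `reference`. The YAML below is hypothetical and only illustrates the structure; actual PolicyEngine-US parameter files may differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical parameter metadata -- not copied from an actual file\n",
"example_yaml = \"\"\"\n",
"metadata:\n",
"  reference:\n",
"    - title: Example statute\n",
"      href: https://www.example.gov/statutes/title26.pdf#page=12\n",
"  documentation: https://www.example.gov/guidance/notice.pdf\n",
"\"\"\"\n",
"\n",
"data = yaml.safe_load(example_yaml)\n",
"refs = data[\"metadata\"][\"reference\"]\n",
"print(refs[0][\"href\"])  # list-of-dicts form: {'title': ..., 'href': ...}\n",
"print(data[\"metadata\"][\"documentation\"])  # plain string form\n",
"print(remove_page_fragment(refs[0][\"href\"]))  # page fragment stripped for deduplication"
]
},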
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Analyze PDF references to understand page fragmentation\n",
"pdfs_with_pages = [ref for ref in pdf_references if '#page=' in ref]\n",
"pdfs_without_pages = [ref for ref in pdf_references if '#page=' not in ref]\n",
"unique_pdf_bases = set(remove_page_fragment(ref) for ref in pdf_references)\n",
"\n",
"print(\"PDF Reference Analysis:\")\n",
"print(f\" Total PDF references: {len(pdf_references):,}\")\n",
"print(f\" PDFs with page fragments: {len(pdfs_with_pages):,}\")\n",
"print(f\" PDFs without page fragments: {len(pdfs_without_pages):,}\")\n",
"print(f\" Unique PDF documents (base URLs): {len(unique_pdf_bases):,}\")\n",
"print(f\" Average pages per PDF: {len(pdfs_with_pages) / max(len(unique_pdf_bases), 1):.1f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Final summary\n",
"print(\"\\n\" + \"=\"*60)\n",
"print(\"POLICYENGINE-US SOURCE REFERENCE SUMMARY\")\n",
"print(\"=\"*60)\n",
"\n",
"print(f\"\\nπŸ“Š TOTAL DISTINCT REFERENCES:\")\n",
"print(f\" WITH page fragments: {len(all_references_with_pages):,}\")\n",
"print(f\" WITHOUT page fragments: {len(all_references_without_pages):,}\")\n",
"print(f\" Reduction factor: {len(all_references_with_pages) / max(len(all_references_without_pages), 1):.2f}x\")\n",
"\n",
"print(f\"\\nπŸ“‘ BREAKDOWN BY TYPE:\")\n",
"print(f\" Government URLs (.gov): {len(gov_urls):,}\")\n",
"print(f\" PDF documents: {len(pdf_references):,}\")\n",
"print(f\" Legal citations: {len(legal_citations):,}\")\n",
"print(f\" Other URLs: {len(url_references - gov_urls):,}\")\n",
"\n",
"print(f\"\\nπŸ“ FILE STATISTICS:\")\n",
"print(f\" Files processed: {files_processed:,}\")\n",
"print(f\" Files with references: {files_with_references:,}\")\n",
"print(f\" Coverage: {files_with_references/max(files_processed, 1)*100:.1f}%\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show some examples\n",
"print(\"\\n\" + \"=\"*60)\n",
"print(\"EXAMPLE REFERENCES\")\n",
"print(\"=\"*60)\n",
"\n",
"print(\"\\nπŸ”— Sample URLs with page fragments:\")\n",
"for ref in list(pdfs_with_pages)[:5]:\n",
" print(f\" β€’ {ref}\")\n",
"\n",
"print(\"\\nπŸ“œ Sample legal citations:\")\n",
"for ref in list(legal_citations)[:5]:\n",
" print(f\" β€’ {ref}\")\n",
"\n",
"print(\"\\nπŸ›οΈ Sample government URLs:\")\n",
"for ref in list(gov_urls)[:5]:\n",
" if '.pdf' not in ref: # Show non-PDF gov URLs\n",
" print(f\" β€’ {ref}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Conclusions\n",
"\n",
"Based on this analysis:\n",
"\n",
"1. **Total distinct references INCLUDING page fragments**: The actual count from the codebase\n",
"2. **Total distinct references EXCLUDING page fragments**: The deduplicated count of unique documents\n",
"3. **The reduction factor** shows how many page-specific citations exist per unique document on average\n",
"\n",
"This demonstrates the sophisticated level of source documentation in PolicyEngine-US, with precise page-level citations for policy documents."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
#!/usr/bin/env python3
"""
PolicyEngine-US Source Document Reference Analysis
This script analyzes the number of distinct source document references
in the PolicyEngine-US codebase, both with and without page number fragments.
"""
import os
import re
from pathlib import Path
from collections import defaultdict
import yaml
def extract_urls_from_text(text):
"""Extract URLs from text content."""
# Match URLs starting with http/https
url_pattern = r'https?://[^\s<>"\']+'
urls = re.findall(url_pattern, text)
return urls
def extract_legal_citations(text):
"""Extract legal citations like USC, CFR, state codes."""
citations = []
# USC citations (e.g., "26 USC 32", "26 U.S.C. Β§ 32")
usc_patterns = [
r'\d+\s+U\.?S\.?C\.?\s+§?\s*\d+',
r'\d+\s+USC\s+\d+',
]
# CFR citations (e.g., "26 CFR 1.32-1")
cfr_pattern = r'\d+\s+C\.?F\.?R\.?\s+[\d\.\-]+'
# State code patterns
state_patterns = [
r'\b[A-Z]{2,}\s+Rev\.?\s+Code\s+[\d\.\-]+', # e.g., "WA Rev Code 123"
r'\b[A-Z]{2,}\s+Stat\.?\s+[\d\.\-]+', # e.g., "CA Stat 123"
]
for pattern in usc_patterns + [cfr_pattern] + state_patterns:
citations.extend(re.findall(pattern, text, re.IGNORECASE))
return citations
def remove_page_fragment(url):
"""Remove #page=X fragments from URLs."""
# Remove everything after # for page fragments
if '#page=' in url:
return url.split('#page=')[0]
return url
def main():
# Set the base path to PolicyEngine-US repository
BASE_PATH = Path("/Users/maxghenis/PolicyEngine/policyengine-us")
# Verify the path exists
if not BASE_PATH.exists():
print(f"Error: Path {BASE_PATH} does not exist")
return
print(f"Analyzing PolicyEngine-US repository at: {BASE_PATH}")
# Initialize counters
all_references_with_pages = set()
all_references_without_pages = set()
url_references = set()
legal_citations = set()
pdf_references = set()
gov_urls = set()
# Statistics
files_processed = 0
files_with_references = 0
# Process Python and YAML files in the parameters/ and variables/ directories
target_dirs = ['policyengine_us/parameters', 'policyengine_us/variables']
for target_dir in target_dirs:
dir_path = BASE_PATH / target_dir
if not dir_path.exists():
print(f"Warning: {dir_path} does not exist")
continue
print(f"\nProcessing {target_dir}...")
# Process all Python and YAML files
for file_path in dir_path.rglob('*'):
if file_path.is_file() and (file_path.suffix in ['.py', '.yaml', '.yml']):
files_processed += 1
file_has_references = False
try:
content = file_path.read_text(encoding='utf-8', errors='ignore')
# Extract URLs
urls = extract_urls_from_text(content)
for url in urls:
file_has_references = True
all_references_with_pages.add(url)
# Track PDFs specifically
if '.pdf' in url.lower():
pdf_references.add(url)
# Track .gov URLs
if '.gov' in url:
gov_urls.add(url)
# Add version without page numbers
url_without_page = remove_page_fragment(url)
all_references_without_pages.add(url_without_page)
url_references.add(url_without_page)
# Extract legal citations
citations = extract_legal_citations(content)
for citation in citations:
file_has_references = True
all_references_with_pages.add(citation)
all_references_without_pages.add(citation)
legal_citations.add(citation)
# For YAML files, also check for 'reference', 'source', 'documentation' fields
if file_path.suffix in ['.yaml', '.yml']:
try:
data = yaml.safe_load(content)
if data and isinstance(data, dict):
# Recursively search for reference fields
def search_references(obj, path=""):
if isinstance(obj, dict):
for key, value in obj.items():
if key in ['reference', 'references', 'source', 'documentation', 'href', 'url']:
if isinstance(value, str):
if value.startswith('http'):
all_references_with_pages.add(value)
all_references_without_pages.add(remove_page_fragment(value))
file_has_references = True
if '.pdf' in value.lower():
pdf_references.add(value)
if '.gov' in value:
gov_urls.add(value)
elif isinstance(value, list):
for item in value:
if isinstance(item, dict) and 'href' in item:
href = item['href']
all_references_with_pages.add(href)
all_references_without_pages.add(remove_page_fragment(href))
file_has_references = True
if '.pdf' in href.lower():
pdf_references.add(href)
if '.gov' in href:
gov_urls.add(href)
else:
search_references(value, f"{path}.{key}")
elif isinstance(obj, list):
for item in obj:
search_references(item, path)
search_references(data)
except yaml.YAMLError:
pass # Skip invalid YAML files
if file_has_references:
files_with_references += 1
except Exception as e:
# Skip files that can't be read
pass
print(f"\nProcessed {files_processed} files")
print(f"Files with references: {files_with_references}")
# Analyze PDF references to understand page fragmentation
pdfs_with_pages = [ref for ref in pdf_references if '#page=' in ref]
pdfs_without_pages = [ref for ref in pdf_references if '#page=' not in ref]
unique_pdf_bases = set(remove_page_fragment(ref) for ref in pdf_references)
print("\nPDF Reference Analysis:")
print(f" Total PDF references: {len(pdf_references):,}")
print(f" PDFs with page fragments: {len(pdfs_with_pages):,}")
print(f" PDFs without page fragments: {len(pdfs_without_pages):,}")
print(f" Unique PDF documents (base URLs): {len(unique_pdf_bases):,}")
if len(unique_pdf_bases) > 0:
print(f" Average pages per PDF: {len(pdfs_with_pages) / len(unique_pdf_bases):.1f}")
# Final summary
print("\n" + "="*60)
print("POLICYENGINE-US SOURCE REFERENCE SUMMARY")
print("="*60)
print(f"\nπŸ“Š TOTAL DISTINCT REFERENCES:")
print(f" WITH page fragments: {len(all_references_with_pages):,}")
print(f" WITHOUT page fragments: {len(all_references_without_pages):,}")
if len(all_references_without_pages) > 0:
print(f" Reduction factor: {len(all_references_with_pages) / len(all_references_without_pages):.2f}x")
print(f"\nπŸ“‘ BREAKDOWN BY TYPE:")
print(f" Government URLs (.gov): {len(gov_urls):,}")
print(f" PDF documents: {len(pdf_references):,}")
print(f" Legal citations: {len(legal_citations):,}")
print(f" Other URLs: {len(url_references - gov_urls):,}")
print(f"\nπŸ“ FILE STATISTICS:")
print(f" Files processed: {files_processed:,}")
print(f" Files with references: {files_with_references:,}")
if files_processed > 0:
print(f" Coverage: {files_with_references/files_processed*100:.1f}%")
# Show some examples
print("\n" + "="*60)
print("EXAMPLE REFERENCES")
print("="*60)
print("\nπŸ”— Sample URLs with page fragments:")
for ref in list(pdfs_with_pages)[:5]:
print(f" β€’ {ref}")
print("\nπŸ“œ Sample legal citations:")
for ref in list(legal_citations)[:5]:
print(f" β€’ {ref}")
print("\nπŸ›οΈ Sample government URLs:")
count = 0
for ref in list(gov_urls):
if '.pdf' not in ref: # Show non-PDF gov URLs
print(f" β€’ {ref}")
count += 1
if count >= 5:
break
if __name__ == "__main__":
main()