PolicyEngine-US Source Document Reference Analysis
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PolicyEngine-US Source Document Reference Analysis\n",
"\n",
"This notebook analyzes the number of distinct source document references in the PolicyEngine-US codebase, both with and without page number fragments."
]
},
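{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick illustration of the distinction (using a hypothetical URL rather than one taken from the repository): two citations of the same PDF that point at different pages count as two references *with* page fragments, but as a single document once the `#page=` fragment is stripped."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical URLs, for illustration only -- not taken from the codebase\n",
"refs = [\n",
"    \"https://www.example.gov/pub/doc123.pdf#page=3\",\n",
"    \"https://www.example.gov/pub/doc123.pdf#page=7\",\n",
"]\n",
"print(\"Distinct references with page fragments:\", len(set(refs)))\n",
"print(\"Distinct documents without page fragments:\", len({r.split('#page=')[0] for r in refs}))"
]
},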
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"from pathlib import Path\n",
"from collections import defaultdict\n",
"from urllib.parse import urlparse, urlunparse\n",
"import json\n",
"import yaml"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set the base path to PolicyEngine-US repository\n",
"BASE_PATH = Path(\"/Users/maxghenis/PolicyEngine/policyengine-us\")\n",
"\n",
"# Verify the path exists\n",
"if not BASE_PATH.exists():\n",
" print(f\"Error: Path {BASE_PATH} does not exist\")\n",
"else:\n",
" print(f\"Analyzing PolicyEngine-US repository at: {BASE_PATH}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def extract_urls_from_text(text):\n",
" \"\"\"Extract URLs from text content.\"\"\"\n",
" # Match URLs starting with http/https\n",
" url_pattern = r'https?://[^\\s<>\"\\']+'
n",
" urls = re.findall(url_pattern, text)\n",
" return urls\n",
"\n",
"def extract_legal_citations(text):\n",
" \"\"\"Extract legal citations like USC, CFR, state codes.\"\"\"\n",
" citations = []\n",
" \n",
" # USC citations (e.g., \"26 USC 32\", \"26 U.S.C. Β§ 32\")\n",
" usc_patterns = [\n",
" r'\\d+\\s+U\\.?S\\.?C\\.?\\s+Β§?\\s*\\d+',\n",
" r'\\d+\\s+USC\\s+\\d+',\n",
" ]\n",
" \n",
" # CFR citations (e.g., \"26 CFR 1.32-1\")\n",
" cfr_pattern = r'\\d+\\s+C\\.?F\\.?R\\.?\\s+[\\d\\.\\-]+'\n",
" \n",
" # State code patterns\n",
" state_patterns = [\n",
" r'\\b[A-Z]{2,}\\s+Rev\\.?\\s+Code\\s+[\\d\\.\\-]+', # e.g., \"WA Rev Code 123\"\n",
" r'\\b[A-Z]{2,}\\s+Stat\\.?\\s+[\\d\\.\\-]+', # e.g., \"CA Stat 123\"\n",
" ]\n",
" \n",
" for pattern in usc_patterns + [cfr_pattern] + state_patterns:\n",
" citations.extend(re.findall(pattern, text, re.IGNORECASE))\n",
" \n",
" return citations\n",
"\n",
"def remove_page_fragment(url):\n",
" \"\"\"Remove #page=X fragments from URLs.\"\"\"\n",
" # Remove everything after # for page fragments\n",
" if '#page=' in url:\n",
" return url.split('#page=')[0]\n",
" return url"
]
},
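{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the three helpers on a made-up snippet. The text, citations, and URLs below are illustrative only and are not taken from the repository; the cell simply shows what the extraction and page-fragment stripping return."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative snippet only -- not taken from the PolicyEngine-US codebase\n",
"sample = (\n",
"    \"See 26 U.S.C. § 32 and 26 CFR 1.32-1, summarized at \"\n",
"    \"https://www.example.gov/pub/doc123.pdf#page=3 and \"\n",
"    \"https://www.example.gov/pub/doc123.pdf#page=7 for details.\"\n",
")\n",
"\n",
"urls = extract_urls_from_text(sample)\n",
"print(\"URLs found:\", urls)\n",
"print(\"Legal citations found:\", extract_legal_citations(sample))\n",
"print(\"Unique documents after stripping #page=:\", {remove_page_fragment(u) for u in urls})"
]
},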
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Initialize counters\n",
"all_references_with_pages = set()\n",
"all_references_without_pages = set()\n",
"url_references = set()\n",
"legal_citations = set()\n",
"pdf_references = set()\n",
"gov_urls = set()\n",
"\n",
"# Statistics\n",
"files_processed = 0\n",
"files_with_references = 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Process Python files in variables/ and parameters/ directories\n",
"target_dirs = ['policyengine_us/parameters', 'policyengine_us/variables']\n",
"\n",
"for target_dir in target_dirs:\n",
" dir_path = BASE_PATH / target_dir\n",
" if not dir_path.exists():\n",
" print(f\"Warning: {dir_path} does not exist\")\n",
" continue\n",
" \n",
" print(f\"\\nProcessing {target_dir}...\")\n",
" \n",
" # Process all Python and YAML files\n",
" for file_path in dir_path.rglob('*'):\n",
" if file_path.is_file() and (file_path.suffix in ['.py', '.yaml', '.yml']):\n",
" files_processed += 1\n",
" file_has_references = False\n",
" \n",
" try:\n",
" content = file_path.read_text(encoding='utf-8', errors='ignore')\n",
" \n",
" # Extract URLs\n",
" urls = extract_urls_from_text(content)\n",
" for url in urls:\n",
" file_has_references = True\n",
" all_references_with_pages.add(url)\n",
" \n",
" # Track PDFs specifically\n",
" if '.pdf' in url.lower():\n",
" pdf_references.add(url)\n",
" \n",
" # Track .gov URLs\n",
" if '.gov' in url:\n",
" gov_urls.add(url)\n",
" \n",
" # Add version without page numbers\n",
" url_without_page = remove_page_fragment(url)\n",
" all_references_without_pages.add(url_without_page)\n",
" url_references.add(url_without_page)\n",
" \n",
" # Extract legal citations\n",
" citations = extract_legal_citations(content)\n",
" for citation in citations:\n",
" file_has_references = True\n",
" all_references_with_pages.add(citation)\n",
" all_references_without_pages.add(citation)\n",
" legal_citations.add(citation)\n",
" \n",
" # For YAML files, also check for 'reference', 'source', 'documentation' fields\n",
" if file_path.suffix in ['.yaml', '.yml']:\n",
" try:\n",
" data = yaml.safe_load(content)\n",
" if data and isinstance(data, dict):\n",
" # Recursively search for reference fields\n",
" def search_references(obj, path=\"\"):\n",
" if isinstance(obj, dict):\n",
" for key, value in obj.items():\n",
" if key in ['reference', 'references', 'source', 'documentation', 'href', 'url']:\n",
" if isinstance(value, str):\n",
" if value.startswith('http'):\n",
" all_references_with_pages.add(value)\n",
" all_references_without_pages.add(remove_page_fragment(value))\n",
" file_has_references = True\n",
" elif isinstance(value, list):\n",
" for item in value:\n",
" if isinstance(item, dict) and 'href' in item:\n",
" href = item['href']\n",
" all_references_with_pages.add(href)\n",
" all_references_without_pages.add(remove_page_fragment(href))\n",
" file_has_references = True\n",
" else:\n",
" search_references(value, f\"{path}.{key}\")\n",
" elif isinstance(obj, list):\n",
" for item in obj:\n",
" search_references(item, path)\n",
" \n",
" search_references(data)\n",
" except yaml.YAMLError:\n",
" pass # Skip invalid YAML files\n",
" \n",
" if file_has_references:\n",
" files_with_references += 1\n",
" \n",
" except Exception as e:\n",
" # Skip files that can't be read\n",
" pass\n",
"\n",
"print(f\"\\nProcessed {files_processed} files\")\n",
"print(f\"Files with references: {files_with_references}\")"
]
},
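{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the reference shapes the `search_references` walker above handles: a plain URL string under a key such as `documentation`, and a list of `{title, href}` entries under `reference`. The YAML below is hypothetical and only illustrates the structure; actual PolicyEngine-US parameter files may differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical parameter metadata -- not copied from an actual file\n",
"example_yaml = \"\"\"\n",
"metadata:\n",
"  reference:\n",
"    - title: Example statute\n",
"      href: https://www.example.gov/statutes/title26.pdf#page=12\n",
"  documentation: https://www.example.gov/guidance/notice.pdf\n",
"\"\"\"\n",
"\n",
"data = yaml.safe_load(example_yaml)\n",
"refs = data[\"metadata\"][\"reference\"]\n",
"print(refs[0][\"href\"])  # list-of-dicts form: {'title': ..., 'href': ...}\n",
"print(data[\"metadata\"][\"documentation\"])  # plain string form\n",
"print(remove_page_fragment(refs[0][\"href\"]))  # page fragment stripped for deduplication"
]
},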
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Analyze PDF references to understand page fragmentation\n",
"pdfs_with_pages = [ref for ref in pdf_references if '#page=' in ref]\n",
"pdfs_without_pages = [ref for ref in pdf_references if '#page=' not in ref]\n",
"unique_pdf_bases = set(remove_page_fragment(ref) for ref in pdf_references)\n",
"\n",
"print(\"PDF Reference Analysis:\")\n",
"print(f\" Total PDF references: {len(pdf_references):,}\")\n",
"print(f\" PDFs with page fragments: {len(pdfs_with_pages):,}\")\n",
"print(f\" PDFs without page fragments: {len(pdfs_without_pages):,}\")\n",
"print(f\" Unique PDF documents (base URLs): {len(unique_pdf_bases):,}\")\n",
"print(f\" Average pages per PDF: {len(pdfs_with_pages) / max(len(unique_pdf_bases), 1):.1f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Final summary\n",
"print(\"\\n\" + \"=\"*60)\n",
"print(\"POLICYENGINE-US SOURCE REFERENCE SUMMARY\")\n",
"print(\"=\"*60)\n",
"\n",
"print(f\"\\nπŸ“Š TOTAL DISTINCT REFERENCES:\")\n",
"print(f\" WITH page fragments: {len(all_references_with_pages):,}\")\n",
"print(f\" WITHOUT page fragments: {len(all_references_without_pages):,}\")\n",
"print(f\" Reduction factor: {len(all_references_with_pages) / max(len(all_references_without_pages), 1):.2f}x\")\n",
"\n",
"print(f\"\\nπŸ“‘ BREAKDOWN BY TYPE:\")\n",
"print(f\" Government URLs (.gov): {len(gov_urls):,}\")\n",
"print(f\" PDF documents: {len(pdf_references):,}\")\n",
"print(f\" Legal citations: {len(legal_citations):,}\")\n",
"print(f\" Other URLs: {len(url_references - gov_urls):,}\")\n",
"\n",
"print(f\"\\nπŸ“ FILE STATISTICS:\")\n",
"print(f\" Files processed: {files_processed:,}\")\n",
"print(f\" Files with references: {files_with_references:,}\")\n",
"print(f\" Coverage: {files_with_references/max(files_processed, 1)*100:.1f}%\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show some examples\n",
"print(\"\\n\" + \"=\"*60)\n",
"print(\"EXAMPLE REFERENCES\")\n",
"print(\"=\"*60)\n",
"\n",
"print(\"\\nπŸ”— Sample URLs with page fragments:\")\n",
"for ref in list(pdfs_with_pages)[:5]:\n",
" print(f\" β€’ {ref}\")\n",
"\n",
"print(\"\\nπŸ“œ Sample legal citations:\")\n",
"for ref in list(legal_citations)[:5]:\n",
" print(f\" β€’ {ref}\")\n",
"\n",
"print(\"\\nπŸ›οΈ Sample government URLs:\")\n",
"for ref in list(gov_urls)[:5]:\n",
" if '.pdf' not in ref: # Show non-PDF gov URLs\n",
" print(f\" β€’ {ref}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Conclusions\n",
"\n",
"Based on this analysis:\n",
"\n",
"1. **Total distinct references INCLUDING page fragments**: The actual count from the codebase\n",
"2. **Total distinct references EXCLUDING page fragments**: The deduplicated count of unique documents\n",
"3. **The reduction factor** shows how many page-specific citations exist per unique document on average\n",
"\n",
"This demonstrates the sophisticated level of source documentation in PolicyEngine-US, with precise page-level citations for policy documents."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
#!/usr/bin/env python3
"""
PolicyEngine-US Source Document Reference Analysis
This script analyzes the number of distinct source document references
in the PolicyEngine-US codebase, both with and without page number fragments.
"""
import os
import re
from pathlib import Path
from collections import defaultdict
import yaml
def extract_urls_from_text(text):
"""Extract URLs from text content."""
# Match URLs starting with http/https
url_pattern = r'https?://[^\s<>"\']+'
urls = re.findall(url_pattern, text)
return urls
def extract_legal_citations(text):
"""Extract legal citations like USC, CFR, state codes."""
citations = []
# USC citations (e.g., "26 USC 32", "26 U.S.C. Β§ 32")
usc_patterns = [
r'\d+\s+U\.?S\.?C\.?\s+§?\s*\d+',
r'\d+\s+USC\s+\d+',
]
# CFR citations (e.g., "26 CFR 1.32-1")
cfr_pattern = r'\d+\s+C\.?F\.?R\.?\s+[\d\.\-]+'
# State code patterns
state_patterns = [
r'\b[A-Z]{2,}\s+Rev\.?\s+Code\s+[\d\.\-]+', # e.g., "WA Rev Code 123"
r'\b[A-Z]{2,}\s+Stat\.?\s+[\d\.\-]+', # e.g., "CA Stat 123"
]
for pattern in usc_patterns + [cfr_pattern] + state_patterns:
citations.extend(re.findall(pattern, text, re.IGNORECASE))
return citations
def remove_page_fragment(url):
"""Remove #page=X fragments from URLs."""
# Remove everything after # for page fragments
if '#page=' in url:
return url.split('#page=')[0]
return url
def main():
# Set the base path to PolicyEngine-US repository
BASE_PATH = Path("/Users/maxghenis/PolicyEngine/policyengine-us")
# Verify the path exists
if not BASE_PATH.exists():
print(f"Error: Path {BASE_PATH} does not exist")
return
print(f"Analyzing PolicyEngine-US repository at: {BASE_PATH}")
# Initialize counters
all_references_with_pages = set()
all_references_without_pages = set()
url_references = set()
legal_citations = set()
pdf_references = set()
gov_urls = set()
# Statistics
files_processed = 0
files_with_references = 0
# Process Python and YAML files in the parameters/ and variables/ directories
target_dirs = ['policyengine_us/parameters', 'policyengine_us/variables']
for target_dir in target_dirs:
dir_path = BASE_PATH / target_dir
if not dir_path.exists():
print(f"Warning: {dir_path} does not exist")
continue
print(f"\nProcessing {target_dir}...")
# Process all Python and YAML files
for file_path in dir_path.rglob('*'):
if file_path.is_file() and (file_path.suffix in ['.py', '.yaml', '.yml']):
files_processed += 1
file_has_references = False
try:
content = file_path.read_text(encoding='utf-8', errors='ignore')
# Extract URLs
urls = extract_urls_from_text(content)
for url in urls:
file_has_references = True
all_references_with_pages.add(url)
# Track PDFs specifically
if '.pdf' in url.lower():
pdf_references.add(url)
# Track .gov URLs
if '.gov' in url:
gov_urls.add(url)
# Add version without page numbers
url_without_page = remove_page_fragment(url)
all_references_without_pages.add(url_without_page)
url_references.add(url_without_page)
# Extract legal citations
citations = extract_legal_citations(content)
for citation in citations:
file_has_references = True
all_references_with_pages.add(citation)
all_references_without_pages.add(citation)
legal_citations.add(citation)
# For YAML files, also check for 'reference', 'source', 'documentation' fields
if file_path.suffix in ['.yaml', '.yml']:
try:
data = yaml.safe_load(content)
if data and isinstance(data, dict):
# Recursively search for reference fields
def search_references(obj, path=""):
if isinstance(obj, dict):
for key, value in obj.items():
if key in ['reference', 'references', 'source', 'documentation', 'href', 'url']:
if isinstance(value, str):
if value.startswith('http'):
all_references_with_pages.add(value)
all_references_without_pages.add(remove_page_fragment(value))
file_has_references = True
if '.pdf' in value.lower():
pdf_references.add(value)
if '.gov' in value:
gov_urls.add(value)
elif isinstance(value, list):
for item in value:
if isinstance(item, dict) and 'href' in item:
href = item['href']
all_references_with_pages.add(href)
all_references_without_pages.add(remove_page_fragment(href))
file_has_references = True
if '.pdf' in href.lower():
pdf_references.add(href)
if '.gov' in href:
gov_urls.add(href)
else:
search_references(value, f"{path}.{key}")
elif isinstance(obj, list):
for item in obj:
search_references(item, path)
search_references(data)
except yaml.YAMLError:
pass # Skip invalid YAML files
if file_has_references:
files_with_references += 1
except Exception as e:
# Skip files that can't be read
pass
print(f"\nProcessed {files_processed} files")
print(f"Files with references: {files_with_references}")
# Analyze PDF references to understand page fragmentation
pdfs_with_pages = [ref for ref in pdf_references if '#page=' in ref]
pdfs_without_pages = [ref for ref in pdf_references if '#page=' not in ref]
unique_pdf_bases = set(remove_page_fragment(ref) for ref in pdf_references)
print("\nPDF Reference Analysis:")
print(f" Total PDF references: {len(pdf_references):,}")
print(f" PDFs with page fragments: {len(pdfs_with_pages):,}")
print(f" PDFs without page fragments: {len(pdfs_without_pages):,}")
print(f" Unique PDF documents (base URLs): {len(unique_pdf_bases):,}")
if len(unique_pdf_bases) > 0:
print(f" Average pages per PDF: {len(pdfs_with_pages) / len(unique_pdf_bases):.1f}")
# Final summary
print("\n" + "="*60)
print("POLICYENGINE-US SOURCE REFERENCE SUMMARY")
print("="*60)
print(f"\nπŸ“Š TOTAL DISTINCT REFERENCES:")
print(f" WITH page fragments: {len(all_references_with_pages):,}")
print(f" WITHOUT page fragments: {len(all_references_without_pages):,}")
if len(all_references_without_pages) > 0:
print(f" Reduction factor: {len(all_references_with_pages) / len(all_references_without_pages):.2f}x")
print(f"\nπŸ“‘ BREAKDOWN BY TYPE:")
print(f" Government URLs (.gov): {len(gov_urls):,}")
print(f" PDF documents: {len(pdf_references):,}")
print(f" Legal citations: {len(legal_citations):,}")
print(f" Other URLs: {len(url_references - gov_urls):,}")
print(f"\nπŸ“ FILE STATISTICS:")
print(f" Files processed: {files_processed:,}")
print(f" Files with references: {files_with_references:,}")
if files_processed > 0:
print(f" Coverage: {files_with_references/files_processed*100:.1f}%")
# Show some examples
print("\n" + "="*60)
print("EXAMPLE REFERENCES")
print("="*60)
print("\nπŸ”— Sample URLs with page fragments:")
for ref in list(pdfs_with_pages)[:5]:
print(f" β€’ {ref}")
print("\nπŸ“œ Sample legal citations:")
for ref in list(legal_citations)[:5]:
print(f" β€’ {ref}")
print("\nπŸ›οΈ Sample government URLs:")
count = 0
for ref in list(gov_urls):
if '.pdf' not in ref: # Show non-PDF gov URLs
print(f" β€’ {ref}")
count += 1
if count >= 5:
break
if __name__ == "__main__":
main()