Last active
September 20, 2025 00:21
-
-
Save pszemraj/d5836c88cfa7f5b9371ac150be473252 to your computer and use it in GitHub Desktop.
Check the .md files in a repo (directory) and its subdirectories for broken or nonexistent links to files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Markdown broken link checker - properly distinguishes between URLs and file paths. | |
| """ | |
| import re | |
| import sys | |
| import argparse | |
| from pathlib import Path | |
| from typing import Dict, List, Tuple, Optional, Set, Iterator | |
| from dataclasses import dataclass | |
| from urllib.parse import unquote | |
| from collections import defaultdict | |
@dataclass
class BrokenLink:
    """Represents a broken link found in a markdown file."""

    # Markdown file in which the link was found.
    file_path: Path
    # 1-based line number of the offending line.
    line_num: int
    # Stripped (and possibly truncated) source line, kept for report context.
    line_content: str
    # Visible link text, e.g. "docs" in [docs](docs.md).
    link_text: str
    # Raw link target exactly as written in the markdown.
    link_target: str
    # Absolute path the target resolved to, or None if resolution failed.
    resolved_path: Optional[Path]
    # Human-readable explanation of why the link is considered broken.
    reason: str
@dataclass
class CheckerStats:
    """Statistics for the checking process."""

    # Number of markdown files scanned.
    total_files: int = 0
    # Total links extracted (external URLs/anchors plus file paths).
    total_links: int = 0
    # Links classified as external URLs, anchors, mailto:, etc.
    external_links: int = 0
    # Links classified as file-path references.
    file_links: int = 0
class MarkdownLinkChecker:
    """Markdown link checker that properly handles URLs vs file paths.

    Walks a repository, extracts links from markdown files (inline,
    reference-style, and raw-HTML anchors/images), classifies each target
    as an external URL/anchor or a file-path reference, and verifies that
    file-path targets exist inside the repository.
    """

    # Default directories to exclude from the scan (VCS metadata,
    # virtualenvs, build output, caches, editor folders, ...).
    DEFAULT_EXCLUDES = {
        ".git",
        ".hg",
        ".svn",
        ".venv",
        "venv",
        "env",
        "node_modules",
        "bower_components",
        "dist",
        "build",
        "out",
        "_build",
        ".mypy_cache",
        ".pytest_cache",
        ".tox",
        ".idea",
        ".vscode",
        ".vs",
        "__pycache__",
        ".DS_Store",
        "coverage",
        "htmlcov",
        ".coverage",
        ".terraform",
        ".serverless",
    }

    # File extensions treated as markdown.
    MD_EXTENSIONS = {".md", ".markdown", ".mkd", ".mdx"}

    # Opening/closing code fence: three or more backticks or tildes.
    FENCE_PATTERN = re.compile(r"^\s*([`~]{3,})")
    # Inline code spans; stripped before link extraction so `[x](y)` inside
    # backticks is never reported.
    INLINE_CODE_PATTERN = re.compile(r"`[^`\n]+`")

    # [text](target) links and ![alt](target) images.
    INLINE_LINK_PATTERN = re.compile(r"(!?)\[([^\]]*)\]\(([^)]+)\)")
    # Reference-style uses: [text][label]; an empty label is the collapsed
    # form [text][] where the text doubles as the label.
    REF_USE_PATTERN = re.compile(r"\[([^\]]+)\]\[([^\]]*)\]")
    # Reference definitions: [label]: target "optional title"
    REF_DEF_PATTERN = re.compile(r"^\s{0,3}\[([^\]]+)\]:\s*(\S+)(?:\s+[\"'(].*[\"')])?")

    # Raw HTML links/images that may appear in markdown.
    HTML_LINK_PATTERN = re.compile(
        r'<a[^>]+href\s*=\s*["\']([^"\']+)["\'][^>]*>', re.IGNORECASE
    )
    HTML_IMG_PATTERN = re.compile(
        r'<img[^>]+src\s*=\s*["\']([^"\']+)["\'][^>]*>', re.IGNORECASE
    )

    def __init__(
        self,
        repo_path: Path,
        excludes: Optional[Set[str]] = None,
        verbose: bool = False,
        *,
        use_default_excludes: bool = True,
    ):
        """Create a checker rooted at *repo_path*.

        Args:
            repo_path: Repository root; resolved to an absolute path.
            excludes: Extra directory names to exclude, merged with the
                defaults (or with the empty set when
                ``use_default_excludes`` is False).
            verbose: Print per-file progress and read errors.
            use_default_excludes: When False, start from an empty exclude
                set. FIX: previously DEFAULT_EXCLUDES was always merged in,
                so callers (e.g. a --no-default-excludes flag) had no way
                to disable the built-in exclusions.
        """
        self.repo_path = repo_path.resolve()
        self.excludes: Set[str] = (
            self.DEFAULT_EXCLUDES.copy() if use_default_excludes else set()
        )
        if excludes:
            self.excludes.update(excludes)
        self.verbose = verbose
        self.broken_links: List[BrokenLink] = []
        self.stats = CheckerStats()

    def should_skip_path(self, path: Path) -> bool:
        """Return True if *path* contains an excluded directory component.

        Only components *below* the repository root are considered.
        FIX: previously every component of the absolute path was checked,
        so a repository that merely lived under a parent directory named
        e.g. "build" or "env" was skipped in its entirety.
        """
        try:
            parts = path.resolve().relative_to(self.repo_path).parts
        except ValueError:
            # Path lies outside the repo root; fall back to checking all
            # of its components rather than silently accepting it.
            parts = path.parts
        return bool(self.excludes & set(parts))

    def is_url_or_special(self, target: str) -> bool:
        """
        Check if target is a URL, email, anchor, or other non-file reference.
        This must be called BEFORE any cleaning/processing of the target.
        """
        target = target.strip()
        # Same-document anchors (#section).
        if target.startswith("#"):
            return True
        # Protocol-relative URLs (//host/path).
        if target.startswith("//"):
            return True
        # Explicit scheme prefixes.
        url_patterns = [
            r"^https?://",
            r"^ftps?://",
            r"^wss?://",
            r"^mailto:",
            r"^tel:",
            r"^sms:",
            r"^data:",
            r"^javascript:",
            r"^vbscript:",
            r"^file://",
            r"^news:",
            r"^nntp:",
            r"^ircs?://",
            r"^gopher://",
            r"^gemini://",
            r"^magnet:",
            r"^ssh://",
            r"^git://",
        ]
        for pattern in url_patterns:
            if re.match(pattern, target, re.IGNORECASE):
                return True
        # Any remaining scheme://... form.
        if "://" in target:
            return True
        # Bare domains without a protocol (sometimes used in markdown).
        domain_patterns = [
            r"^[a-z]+\.(com|org|net|io|dev|edu|gov|co\.[a-z]{2})",
            r"^github\.com/",
            r"^gitlab\.com/",
            r"^bitbucket\.org/",
            r"^stackoverflow\.com/",
            r"^huggingface\.co/",
            r"^arxiv\.org/",
        ]
        for pattern in domain_patterns:
            if re.match(pattern, target, re.IGNORECASE):
                return True
        return False

    def clean_file_path(self, target: str) -> str:
        """
        Clean a file path target by removing fragments, queries, and titles.
        Only call this AFTER confirming it's not a URL.
        """
        # Strip <angle brackets> around the target, if present.
        if target.startswith("<") and ">" in target:
            end_idx = target.index(">")
            target = target[1:end_idx]
        # Remove markdown link titles: 'path/file.md "title"' -> 'path/file.md'.
        # NOTE(review): this keeps only the first whitespace-separated token,
        # so unencoded paths containing spaces are truncated — such paths
        # should be percent-encoded (%20) in markdown anyway.
        if ' "' in target or " '" in target:
            parts = target.split()
            if parts:
                target = parts[0]
        # Drop the URL fragment (#section).
        if "#" in target:
            target = target.split("#")[0]
        # Drop any query string (?param=value).
        if "?" in target:
            target = target.split("?")[0]
        # Decode percent-encoding (%20 -> space, etc.).
        target = unquote(target)
        return target.strip()

    def resolve_link_path(
        self, link: str, source_file: Path
    ) -> Tuple[Optional[Path], str]:
        """
        Resolve a link to an absolute path if it's a file reference.

        Args:
            link: Raw link target as written in the markdown.
            source_file: File the link appeared in (relative links resolve
                against its parent directory).

        Returns:
            (resolved_path, reason) — ``resolved_path`` is None when the
            link is not a file reference or could not be resolved;
            ``reason`` is empty on success and explains the skip/failure
            otherwise.
        """
        # URLs, anchors, mailto:, etc. are not file references.
        if self.is_url_or_special(link):
            self.stats.external_links += 1
            return None, "external URL or anchor"
        # Now we know it's meant to be a file path, so clean it.
        link = self.clean_file_path(link)
        if not link:
            return None, "empty after cleaning"
        self.stats.file_links += 1
        if link.startswith("/"):
            # Repository-absolute path.
            resolved = self.repo_path / link[1:]
        else:
            # Relative to the source file's directory.
            resolved = source_file.parent / link
        # Normalize (collapses ".." etc.); strict=False tolerates missing files.
        try:
            resolved = resolved.resolve(strict=False)
        except Exception as e:
            return None, f"path resolution failed: {e}"
        # Security check: the normalized path must stay inside the repo.
        try:
            resolved.relative_to(self.repo_path)
        except ValueError:
            return resolved, "escapes repository root (security issue)"
        return resolved, ""

    def collect_reference_definitions(self, lines: List[str]) -> Dict[str, str]:
        """Collect all reference-style link definitions, respecting code blocks.

        Returns a mapping of lowercased label -> raw target; definitions
        inside fenced code blocks are ignored.
        """
        refs = {}
        inside_fence = False
        fence_marker = None
        for line in lines:
            fence_match = self.FENCE_PATTERN.match(line)
            if fence_match:
                marker = fence_match.group(1)
                if not inside_fence:
                    inside_fence = True
                    fence_marker = marker
                elif marker.startswith(fence_marker):
                    # Closing fence: same character, at least as long.
                    inside_fence = False
                    fence_marker = None
                continue
            if inside_fence:
                continue
            ref_match = self.REF_DEF_PATTERN.match(line)
            if ref_match:
                # Labels are case-insensitive per markdown convention.
                label = ref_match.group(1).strip().lower()
                target = ref_match.group(2)
                refs[label] = target
        return refs

    def extract_links_from_line(
        self, line: str, refs: Dict[str, str]
    ) -> Iterator[Tuple[str, str]]:
        """Extract all links from a line of text. Yields (link_text, link_target)."""
        # Remove inline code spans so `[x](y)` in code is not treated as a link.
        line_cleaned = self.INLINE_CODE_PATTERN.sub("", line)
        # Standard markdown links: [text](url) or ![alt](url).
        for match in self.INLINE_LINK_PATTERN.finditer(line_cleaned):
            is_image = bool(match.group(1))
            text = match.group(2) or ("image" if is_image else "link")
            target = match.group(3)
            yield text, target
        # Reference-style links: [text][ref] (collapsed [text][] uses text).
        for match in self.REF_USE_PATTERN.finditer(line_cleaned):
            text = match.group(1)
            ref_label = match.group(2) or text
            ref_label = ref_label.strip().lower()
            if ref_label in refs:
                yield text, refs[ref_label]
        # Raw HTML <a href="...">.
        for match in self.HTML_LINK_PATTERN.finditer(line_cleaned):
            yield "HTML link", match.group(1)
        # Raw HTML <img src="...">.
        for match in self.HTML_IMG_PATTERN.finditer(line_cleaned):
            yield "HTML image", match.group(1)

    def check_file(self, md_file: Path) -> None:
        """Check a single markdown file for broken links.

        Appends findings to ``self.broken_links`` and updates ``self.stats``;
        unreadable files are reported (in verbose mode) and skipped.
        """
        try:
            content = md_file.read_text(encoding="utf-8", errors="replace")
        except Exception as e:
            if self.verbose:
                print(
                    f"⚠️ Error reading {md_file.relative_to(self.repo_path)}: {e}",
                    file=sys.stderr,
                )
            return
        lines = content.splitlines()
        refs = self.collect_reference_definitions(lines)
        inside_fence = False
        fence_marker = None
        for line_num, line in enumerate(lines, 1):
            # Track fences so links inside code blocks are skipped.
            fence_match = self.FENCE_PATTERN.match(line)
            if fence_match:
                marker = fence_match.group(1)
                if not inside_fence:
                    inside_fence = True
                    fence_marker = marker
                elif marker.startswith(fence_marker):
                    inside_fence = False
                    fence_marker = None
                continue
            if inside_fence:
                continue
            for link_text, link_target in self.extract_links_from_line(line, refs):
                self.stats.total_links += 1
                resolved_path, reason = self.resolve_link_path(link_target, md_file)
                # External links and anchors are not checked against disk.
                if reason in ("external URL or anchor", "empty after cleaning"):
                    continue
                # Paths that escape the repository root are always reported.
                if reason and ("security" in reason or "escape" in reason):
                    self.broken_links.append(
                        BrokenLink(
                            file_path=md_file,
                            line_num=line_num,
                            line_content=line.strip()[:100],
                            link_text=link_text,
                            link_target=link_target,
                            resolved_path=resolved_path,
                            reason=reason,
                        )
                    )
                    continue
                # Finally, the actual existence check.
                if resolved_path and not resolved_path.exists():
                    self.broken_links.append(
                        BrokenLink(
                            file_path=md_file,
                            line_num=line_num,
                            line_content=line.strip()[:100],
                            link_text=link_text,
                            link_target=link_target,
                            resolved_path=resolved_path,
                            reason="file does not exist",
                        )
                    )

    def scan_repository(self) -> None:
        """Efficiently scan the repository, pruning excluded directories."""
        print(f"🔍 Scanning {self.repo_path}")
        if self.verbose and self.excludes:
            print(f"  Excluding: {', '.join(sorted(self.excludes))}")
        # Manual iterative traversal so excluded subtrees are never entered.
        stack = [self.repo_path]
        while stack:
            current = stack.pop()
            if current.is_dir():
                if self.should_skip_path(current):
                    continue
                try:
                    for child in sorted(current.iterdir()):
                        stack.append(child)
                except PermissionError:
                    # Unreadable directory: skip rather than abort the scan.
                    continue
            elif current.suffix.lower() in self.MD_EXTENSIONS:
                self.stats.total_files += 1
                if self.verbose:
                    print(f"  Checking: {current.relative_to(self.repo_path)}")
                self.check_file(current)

    def generate_report(self) -> None:
        """Print scan statistics and all broken links, grouped by file."""
        print("\n📊 Scan Statistics:")
        print(f"  Files scanned: {self.stats.total_files}")
        print(f"  Total links: {self.stats.total_links}")
        print(f"  External URLs/anchors: {self.stats.external_links}")
        print(f"  File path links: {self.stats.file_links}")
        if not self.broken_links:
            print("\n✅ No broken file links found!")
            return
        print(f"\n❌ Found {len(self.broken_links)} broken file link(s):")
        print("=" * 80)
        # Group findings by source file for readable output.
        by_file = defaultdict(list)
        for broken in self.broken_links:
            by_file[broken.file_path].append(broken)
        for file_path in sorted(by_file.keys()):
            rel_path = file_path.relative_to(self.repo_path)
            links = by_file[file_path]
            print(
                f"\n📄 {rel_path} ({len(links)} issue{'s' if len(links) > 1 else ''})"
            )
            print("-" * 80)
            for broken in sorted(links, key=lambda x: x.line_num):
                print(
                    f"  Line {broken.line_num}: [{broken.link_text}]({broken.link_target})"
                )
                if broken.resolved_path:
                    try:
                        resolved_rel = broken.resolved_path.relative_to(self.repo_path)
                    except ValueError:
                        # Path escaped the repo; show it absolute.
                        resolved_rel = broken.resolved_path
                    print(f"    → Resolved to: {resolved_rel}")
                print(f"    ✗ {broken.reason}")
                if broken.line_content:
                    print(f"    Context: ...{broken.line_content}...")
                print()
        # Summarize the most frequently referenced missing files.
        missing_counts = defaultdict(int)
        for broken in self.broken_links:
            if broken.resolved_path and broken.reason == "file does not exist":
                missing_counts[broken.resolved_path] += 1
        if missing_counts:
            print("=" * 80)
            print("\n📈 Most referenced missing files:")
            for path, count in sorted(missing_counts.items(), key=lambda x: -x[1])[:5]:
                try:
                    rel = path.relative_to(self.repo_path)
                except ValueError:
                    rel = path
                suffix = f" ({count} reference{'s' if count > 1 else ''})"
                print(f"  • {rel}{suffix}")
def main() -> int:
    """Parse CLI arguments, run the link checker, and report results.

    Returns:
        0 when no broken links are found, 1 when at least one is found,
        2 on usage errors (missing or non-directory path).
    """
    parser = argparse.ArgumentParser(
        description="Check markdown files for broken file path links",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
This tool checks for broken file path references in markdown files.
It automatically ignores:
  • External URLs (http://, https://, etc.)
  • Email addresses (mailto:)
  • Anchors (#section)
  • Links inside code blocks
Examples:
  %(prog)s                      # Check current directory
  %(prog)s /path/to/repo        # Check specific repository
  %(prog)s . --verbose          # Verbose output
""",
    )
    parser.add_argument(
        "repo_path",
        nargs="?",
        default=".",
        help="Path to repository (default: current directory)",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Show verbose output"
    )
    parser.add_argument(
        "--no-default-excludes",
        action="store_true",
        help="Disable default directory exclusions",
    )
    parser.add_argument(
        "--exclude",
        action="append",
        default=[],
        help="Additional paths to exclude (can be repeated)",
    )
    args = parser.parse_args()

    repo_path = Path(args.repo_path).resolve()
    if not repo_path.exists():
        print(f"❌ Error: Path does not exist: {repo_path}", file=sys.stderr)
        return 2
    if not repo_path.is_dir():
        print(f"❌ Error: Path is not a directory: {repo_path}", file=sys.stderr)
        return 2

    excludes = set(args.exclude) if args.exclude else None
    checker = MarkdownLinkChecker(repo_path, excludes=excludes, verbose=args.verbose)
    if args.no_default_excludes:
        # FIX: the checker's constructor always merges DEFAULT_EXCLUDES into
        # whatever excludes it is given, so this flag previously had no
        # effect. Override the merged set so only user-supplied names remain.
        checker.excludes = set(args.exclude)
    checker.scan_repository()
    checker.generate_report()
    return 1 if checker.broken_links else 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment