Last active
September 20, 2025 00:21
-
-
Save pszemraj/d5836c88cfa7f5b9371ac150be473252 to your computer and use it in GitHub Desktop.
Check the .md files in a repo (directory) and its subdirectories for broken or nonexistent links to files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Markdown broken link checker - properly distinguishes between URLs and file paths. | |
| """ | |
| import re | |
| import sys | |
| import argparse | |
| from pathlib import Path | |
| from typing import Dict, List, Tuple, Optional, Set, Iterator | |
| from dataclasses import dataclass | |
| from urllib.parse import unquote | |
| from collections import defaultdict | |
@dataclass
class BrokenLink:
    """Represents a broken link found in a markdown file."""

    # Markdown file in which the link was found.
    file_path: Path
    # 1-based line number of the offending line.
    line_num: int
    # Stripped (and possibly truncated) source line, kept for report context.
    line_content: str
    # Visible link text, e.g. "docs" in [docs](docs.md).
    link_text: str
    # Raw link target exactly as written in the markdown.
    link_target: str
    # Absolute path the target resolved to, or None if resolution failed.
    resolved_path: Optional[Path]
    # Human-readable explanation of why the link is considered broken.
    reason: str
@dataclass
class CheckerStats:
    """Statistics for the checking process."""

    # Number of markdown files scanned.
    total_files: int = 0
    # Total links extracted (external URLs/anchors plus file paths).
    total_links: int = 0
    # Links classified as external URLs, anchors, mailto:, etc.
    external_links: int = 0
    # Links classified as file-path references.
    file_links: int = 0
class MarkdownLinkChecker:
    """Markdown link checker that properly handles URLs vs file paths.

    Walks a repository, extracts links from markdown files (inline,
    reference-style, and raw-HTML anchors/images), classifies each target
    as an external URL/anchor or a file-path reference, and verifies that
    file-path targets exist inside the repository.
    """

    # Default directories to exclude from the scan (VCS metadata,
    # virtualenvs, build output, caches, editor folders, ...).
    DEFAULT_EXCLUDES = {
        ".git",
        ".hg",
        ".svn",
        ".venv",
        "venv",
        "env",
        "node_modules",
        "bower_components",
        "dist",
        "build",
        "out",
        "_build",
        ".mypy_cache",
        ".pytest_cache",
        ".tox",
        ".idea",
        ".vscode",
        ".vs",
        "__pycache__",
        ".DS_Store",
        "coverage",
        "htmlcov",
        ".coverage",
        ".terraform",
        ".serverless",
    }

    # File extensions treated as markdown.
    MD_EXTENSIONS = {".md", ".markdown", ".mkd", ".mdx"}

    # Opening/closing code fence: three or more backticks or tildes.
    FENCE_PATTERN = re.compile(r"^\s*([`~]{3,})")
    # Inline code spans; stripped before link extraction so `[x](y)` inside
    # backticks is never reported.
    INLINE_CODE_PATTERN = re.compile(r"`[^`\n]+`")

    # [text](target) links and ![alt](target) images.
    INLINE_LINK_PATTERN = re.compile(r"(!?)\[([^\]]*)\]\(([^)]+)\)")
    # Reference-style uses: [text][label]; an empty label is the collapsed
    # form [text][] where the text doubles as the label.
    REF_USE_PATTERN = re.compile(r"\[([^\]]+)\]\[([^\]]*)\]")
    # Reference definitions: [label]: target "optional title"
    REF_DEF_PATTERN = re.compile(r"^\s{0,3}\[([^\]]+)\]:\s*(\S+)(?:\s+[\"'(].*[\"')])?")

    # Raw HTML links/images that may appear in markdown.
    HTML_LINK_PATTERN = re.compile(
        r'<a[^>]+href\s*=\s*["\']([^"\']+)["\'][^>]*>', re.IGNORECASE
    )
    HTML_IMG_PATTERN = re.compile(
        r'<img[^>]+src\s*=\s*["\']([^"\']+)["\'][^>]*>', re.IGNORECASE
    )

    def __init__(
        self,
        repo_path: Path,
        excludes: Optional[Set[str]] = None,
        verbose: bool = False,
        *,
        use_default_excludes: bool = True,
    ):
        """Create a checker rooted at *repo_path*.

        Args:
            repo_path: Repository root; resolved to an absolute path.
            excludes: Extra directory names to exclude, merged with the
                defaults (or with the empty set when
                ``use_default_excludes`` is False).
            verbose: Print per-file progress and read errors.
            use_default_excludes: When False, start from an empty exclude
                set. FIX: previously DEFAULT_EXCLUDES was always merged in,
                so callers (e.g. a --no-default-excludes flag) had no way
                to disable the built-in exclusions.
        """
        self.repo_path = repo_path.resolve()
        self.excludes: Set[str] = (
            self.DEFAULT_EXCLUDES.copy() if use_default_excludes else set()
        )
        if excludes:
            self.excludes.update(excludes)
        self.verbose = verbose
        self.broken_links: List[BrokenLink] = []
        self.stats = CheckerStats()

    def should_skip_path(self, path: Path) -> bool:
        """Return True if *path* contains an excluded directory component.

        Only components *below* the repository root are considered.
        FIX: previously every component of the absolute path was checked,
        so a repository that merely lived under a parent directory named
        e.g. "build" or "env" was skipped in its entirety.
        """
        try:
            parts = path.resolve().relative_to(self.repo_path).parts
        except ValueError:
            # Path lies outside the repo root; fall back to checking all
            # of its components rather than silently accepting it.
            parts = path.parts
        return bool(self.excludes & set(parts))

    def is_url_or_special(self, target: str) -> bool:
        """
        Check if target is a URL, email, anchor, or other non-file reference.
        This must be called BEFORE any cleaning/processing of the target.
        """
        target = target.strip()
        # Same-document anchors (#section).
        if target.startswith("#"):
            return True
        # Protocol-relative URLs (//host/path).
        if target.startswith("//"):
            return True
        # Explicit scheme prefixes.
        url_patterns = [
            r"^https?://",
            r"^ftps?://",
            r"^wss?://",
            r"^mailto:",
            r"^tel:",
            r"^sms:",
            r"^data:",
            r"^javascript:",
            r"^vbscript:",
            r"^file://",
            r"^news:",
            r"^nntp:",
            r"^ircs?://",
            r"^gopher://",
            r"^gemini://",
            r"^magnet:",
            r"^ssh://",
            r"^git://",
        ]
        for pattern in url_patterns:
            if re.match(pattern, target, re.IGNORECASE):
                return True
        # Any remaining scheme://... form.
        if "://" in target:
            return True
        # Bare domains without a protocol (sometimes used in markdown).
        domain_patterns = [
            r"^[a-z]+\.(com|org|net|io|dev|edu|gov|co\.[a-z]{2})",
            r"^github\.com/",
            r"^gitlab\.com/",
            r"^bitbucket\.org/",
            r"^stackoverflow\.com/",
            r"^huggingface\.co/",
            r"^arxiv\.org/",
        ]
        for pattern in domain_patterns:
            if re.match(pattern, target, re.IGNORECASE):
                return True
        return False

    def clean_file_path(self, target: str) -> str:
        """
        Clean a file path target by removing fragments, queries, and titles.
        Only call this AFTER confirming it's not a URL.
        """
        # Strip <angle brackets> around the target, if present.
        if target.startswith("<") and ">" in target:
            end_idx = target.index(">")
            target = target[1:end_idx]
        # Remove markdown link titles: 'path/file.md "title"' -> 'path/file.md'.
        # NOTE(review): this keeps only the first whitespace-separated token,
        # so unencoded paths containing spaces are truncated — such paths
        # should be percent-encoded (%20) in markdown anyway.
        if ' "' in target or " '" in target:
            parts = target.split()
            if parts:
                target = parts[0]
        # Drop the URL fragment (#section).
        if "#" in target:
            target = target.split("#")[0]
        # Drop any query string (?param=value).
        if "?" in target:
            target = target.split("?")[0]
        # Decode percent-encoding (%20 -> space, etc.).
        target = unquote(target)
        return target.strip()

    def resolve_link_path(
        self, link: str, source_file: Path
    ) -> Tuple[Optional[Path], str]:
        """
        Resolve a link to an absolute path if it's a file reference.

        Args:
            link: Raw link target as written in the markdown.
            source_file: File the link appeared in (relative links resolve
                against its parent directory).

        Returns:
            (resolved_path, reason) — ``resolved_path`` is None when the
            link is not a file reference or could not be resolved;
            ``reason`` is empty on success and explains the skip/failure
            otherwise.
        """
        # URLs, anchors, mailto:, etc. are not file references.
        if self.is_url_or_special(link):
            self.stats.external_links += 1
            return None, "external URL or anchor"
        # Now we know it's meant to be a file path, so clean it.
        link = self.clean_file_path(link)
        if not link:
            return None, "empty after cleaning"
        self.stats.file_links += 1
        if link.startswith("/"):
            # Repository-absolute path.
            resolved = self.repo_path / link[1:]
        else:
            # Relative to the source file's directory.
            resolved = source_file.parent / link
        # Normalize (collapses ".." etc.); strict=False tolerates missing files.
        try:
            resolved = resolved.resolve(strict=False)
        except Exception as e:
            return None, f"path resolution failed: {e}"
        # Security check: the normalized path must stay inside the repo.
        try:
            resolved.relative_to(self.repo_path)
        except ValueError:
            return resolved, "escapes repository root (security issue)"
        return resolved, ""

    def collect_reference_definitions(self, lines: List[str]) -> Dict[str, str]:
        """Collect all reference-style link definitions, respecting code blocks.

        Returns a mapping of lowercased label -> raw target; definitions
        inside fenced code blocks are ignored.
        """
        refs = {}
        inside_fence = False
        fence_marker = None
        for line in lines:
            fence_match = self.FENCE_PATTERN.match(line)
            if fence_match:
                marker = fence_match.group(1)
                if not inside_fence:
                    inside_fence = True
                    fence_marker = marker
                elif marker.startswith(fence_marker):
                    # Closing fence: same character, at least as long.
                    inside_fence = False
                    fence_marker = None
                continue
            if inside_fence:
                continue
            ref_match = self.REF_DEF_PATTERN.match(line)
            if ref_match:
                # Labels are case-insensitive per markdown convention.
                label = ref_match.group(1).strip().lower()
                target = ref_match.group(2)
                refs[label] = target
        return refs

    def extract_links_from_line(
        self, line: str, refs: Dict[str, str]
    ) -> Iterator[Tuple[str, str]]:
        """Extract all links from a line of text. Yields (link_text, link_target)."""
        # Remove inline code spans so `[x](y)` in code is not treated as a link.
        line_cleaned = self.INLINE_CODE_PATTERN.sub("", line)
        # Standard markdown links: [text](url) or ![alt](url).
        for match in self.INLINE_LINK_PATTERN.finditer(line_cleaned):
            is_image = bool(match.group(1))
            text = match.group(2) or ("image" if is_image else "link")
            target = match.group(3)
            yield text, target
        # Reference-style links: [text][ref] (collapsed [text][] uses text).
        for match in self.REF_USE_PATTERN.finditer(line_cleaned):
            text = match.group(1)
            ref_label = match.group(2) or text
            ref_label = ref_label.strip().lower()
            if ref_label in refs:
                yield text, refs[ref_label]
        # Raw HTML <a href="...">.
        for match in self.HTML_LINK_PATTERN.finditer(line_cleaned):
            yield "HTML link", match.group(1)
        # Raw HTML <img src="...">.
        for match in self.HTML_IMG_PATTERN.finditer(line_cleaned):
            yield "HTML image", match.group(1)

    def check_file(self, md_file: Path) -> None:
        """Check a single markdown file for broken links.

        Appends findings to ``self.broken_links`` and updates ``self.stats``;
        unreadable files are reported (in verbose mode) and skipped.
        """
        try:
            content = md_file.read_text(encoding="utf-8", errors="replace")
        except Exception as e:
            if self.verbose:
                print(
                    f"⚠️ Error reading {md_file.relative_to(self.repo_path)}: {e}",
                    file=sys.stderr,
                )
            return
        lines = content.splitlines()
        refs = self.collect_reference_definitions(lines)
        inside_fence = False
        fence_marker = None
        for line_num, line in enumerate(lines, 1):
            # Track fences so links inside code blocks are skipped.
            fence_match = self.FENCE_PATTERN.match(line)
            if fence_match:
                marker = fence_match.group(1)
                if not inside_fence:
                    inside_fence = True
                    fence_marker = marker
                elif marker.startswith(fence_marker):
                    inside_fence = False
                    fence_marker = None
                continue
            if inside_fence:
                continue
            for link_text, link_target in self.extract_links_from_line(line, refs):
                self.stats.total_links += 1
                resolved_path, reason = self.resolve_link_path(link_target, md_file)
                # External links and anchors are not checked against disk.
                if reason in ("external URL or anchor", "empty after cleaning"):
                    continue
                # Paths that escape the repository root are always reported.
                if reason and ("security" in reason or "escape" in reason):
                    self.broken_links.append(
                        BrokenLink(
                            file_path=md_file,
                            line_num=line_num,
                            line_content=line.strip()[:100],
                            link_text=link_text,
                            link_target=link_target,
                            resolved_path=resolved_path,
                            reason=reason,
                        )
                    )
                    continue
                # Finally, the actual existence check.
                if resolved_path and not resolved_path.exists():
                    self.broken_links.append(
                        BrokenLink(
                            file_path=md_file,
                            line_num=line_num,
                            line_content=line.strip()[:100],
                            link_text=link_text,
                            link_target=link_target,
                            resolved_path=resolved_path,
                            reason="file does not exist",
                        )
                    )

    def scan_repository(self) -> None:
        """Efficiently scan the repository, pruning excluded directories."""
        print(f"🔍 Scanning {self.repo_path}")
        if self.verbose and self.excludes:
            print(f"  Excluding: {', '.join(sorted(self.excludes))}")
        # Manual iterative traversal so excluded subtrees are never entered.
        stack = [self.repo_path]
        while stack:
            current = stack.pop()
            if current.is_dir():
                if self.should_skip_path(current):
                    continue
                try:
                    for child in sorted(current.iterdir()):
                        stack.append(child)
                except PermissionError:
                    # Unreadable directory: skip rather than abort the scan.
                    continue
            elif current.suffix.lower() in self.MD_EXTENSIONS:
                self.stats.total_files += 1
                if self.verbose:
                    print(f"  Checking: {current.relative_to(self.repo_path)}")
                self.check_file(current)

    def generate_report(self) -> None:
        """Print scan statistics and all broken links, grouped by file."""
        print("\n📊 Scan Statistics:")
        print(f"  Files scanned: {self.stats.total_files}")
        print(f"  Total links: {self.stats.total_links}")
        print(f"  External URLs/anchors: {self.stats.external_links}")
        print(f"  File path links: {self.stats.file_links}")
        if not self.broken_links:
            print("\n✅ No broken file links found!")
            return
        print(f"\n❌ Found {len(self.broken_links)} broken file link(s):")
        print("=" * 80)
        # Group findings by source file for readable output.
        by_file = defaultdict(list)
        for broken in self.broken_links:
            by_file[broken.file_path].append(broken)
        for file_path in sorted(by_file.keys()):
            rel_path = file_path.relative_to(self.repo_path)
            links = by_file[file_path]
            print(
                f"\n📄 {rel_path} ({len(links)} issue{'s' if len(links) > 1 else ''})"
            )
            print("-" * 80)
            for broken in sorted(links, key=lambda x: x.line_num):
                print(
                    f"  Line {broken.line_num}: [{broken.link_text}]({broken.link_target})"
                )
                if broken.resolved_path:
                    try:
                        resolved_rel = broken.resolved_path.relative_to(self.repo_path)
                    except ValueError:
                        # Path escaped the repo; show it absolute.
                        resolved_rel = broken.resolved_path
                    print(f"    → Resolved to: {resolved_rel}")
                print(f"    ✗ {broken.reason}")
                if broken.line_content:
                    print(f"    Context: ...{broken.line_content}...")
                print()
        # Summarize the most frequently referenced missing files.
        missing_counts = defaultdict(int)
        for broken in self.broken_links:
            if broken.resolved_path and broken.reason == "file does not exist":
                missing_counts[broken.resolved_path] += 1
        if missing_counts:
            print("=" * 80)
            print("\n📈 Most referenced missing files:")
            for path, count in sorted(missing_counts.items(), key=lambda x: -x[1])[:5]:
                try:
                    rel = path.relative_to(self.repo_path)
                except ValueError:
                    rel = path
                suffix = f" ({count} reference{'s' if count > 1 else ''})"
                print(f"  • {rel}{suffix}")
def main() -> int:
    """Parse CLI arguments, run the link checker, and report results.

    Returns:
        0 when no broken links are found, 1 when at least one is found,
        2 on usage errors (missing or non-directory path).
    """
    parser = argparse.ArgumentParser(
        description="Check markdown files for broken file path links",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
This tool checks for broken file path references in markdown files.
It automatically ignores:
  • External URLs (http://, https://, etc.)
  • Email addresses (mailto:)
  • Anchors (#section)
  • Links inside code blocks
Examples:
  %(prog)s                      # Check current directory
  %(prog)s /path/to/repo        # Check specific repository
  %(prog)s . --verbose          # Verbose output
""",
    )
    parser.add_argument(
        "repo_path",
        nargs="?",
        default=".",
        help="Path to repository (default: current directory)",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Show verbose output"
    )
    parser.add_argument(
        "--no-default-excludes",
        action="store_true",
        help="Disable default directory exclusions",
    )
    parser.add_argument(
        "--exclude",
        action="append",
        default=[],
        help="Additional paths to exclude (can be repeated)",
    )
    args = parser.parse_args()

    repo_path = Path(args.repo_path).resolve()
    if not repo_path.exists():
        print(f"❌ Error: Path does not exist: {repo_path}", file=sys.stderr)
        return 2
    if not repo_path.is_dir():
        print(f"❌ Error: Path is not a directory: {repo_path}", file=sys.stderr)
        return 2

    excludes = set(args.exclude) if args.exclude else None
    checker = MarkdownLinkChecker(repo_path, excludes=excludes, verbose=args.verbose)
    if args.no_default_excludes:
        # FIX: the checker's constructor always merges DEFAULT_EXCLUDES into
        # whatever excludes it is given, so this flag previously had no
        # effect. Override the merged set so only user-supplied names remain.
        checker.excludes = set(args.exclude)
    checker.scan_repository()
    checker.generate_report()
    return 1 if checker.broken_links else 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment