Last active
March 30, 2026 13:36
-
-
Save Red-Eyed/b85e585b2df08e4c2d963343bd4e50de to your computer and use it in GitHub Desktop.
Anonymize a git repository by stripping copyright/author metadata. Supports undo via git revert.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env -S uv run --script | |
| # /// script | |
| # requires-python = ">=3.11" | |
| # dependencies = [ | |
| # "pydantic-settings>=2.7", | |
| # ] | |
| # /// | |
| """ | |
| Anonymize a git repository by stripping copyright/author metadata. | |
| Supports undo via git revert. | |
| Usage: | |
| uv run anonymize_repo.py anonymize [--commit-msg MSG] [--dry-run] | |
| uv run anonymize_repo.py undo [--commit-sha SHA] | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import re | |
| import subprocess | |
| import sys | |
| from pathlib import Path | |
| from typing import Optional | |
| from pydantic import Field | |
| from pydantic_settings import BaseSettings | |
| # --------------------------------------------------------------------------- | |
| # Patterns to strip / redact | |
| # --------------------------------------------------------------------------- | |
# Flags shared by the line-oriented patterns below: matching is
# case-insensitive and ``^`` / ``$`` anchor at line boundaries.
_LINE_FLAGS = re.IGNORECASE | re.MULTILINE

# A comment line (leader: //, #, <!--, /*, * or --) whose text begins with a
# copyright marker: "copyright", "(c)" or "©".
COPYRIGHT_LINE_RE = re.compile(
    r"^[^\S\r\n]*(?://|#|<!--|/\*|\*|--)\s*"
    r"(?:copyright|\(c\)|©|copyright\s*\(c\)|copyright\s*©)"
    r".*$",
    flags=_LINE_FLAGS,
)

# A comment line (leader: //, #, * or --) carrying a Javadoc-style ``@author`` tag.
AUTHOR_TAG_RE = re.compile(
    r"^[^\S\r\n]*(?://|#|\*|--)\s*@author\b.*$",
    flags=_LINE_FLAGS,
)

# An "Author:" / "Authors:" header with a non-empty value; the comment leader
# is optional here, so plain-text headers match as well.
AUTHOR_HEADER_RE = re.compile(
    r"^[^\S\r\n]*(?://|#|\*|--|<!--)?\s*authors?:\s*.+$",
    flags=_LINE_FLAGS,
)

# Anything shaped like ``local@domain.tld`` (at least one dot in the domain).
EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+")

# Default commit message; ``undo`` greps the history for this exact text to
# locate the anonymization commit.
COMMIT_MSG_MARKER = "chore: anonymize repository — remove copyright and author info"
| # --------------------------------------------------------------------------- | |
| # Git helpers | |
| # --------------------------------------------------------------------------- | |
def git(*args: str, cwd: Path, check: bool = True) -> subprocess.CompletedProcess[str]:
    """Run ``git`` with *args* inside *cwd* and return the finished process.

    stdout and stderr are captured and decoded as text.  When *check* is true,
    a non-zero exit status raises ``subprocess.CalledProcessError``.
    """
    command = ["git"]
    command.extend(args)
    return subprocess.run(
        command,
        check=check,
        cwd=cwd,
        text=True,
        capture_output=True,
    )
def repo_root() -> Path:
    """Return the top-level directory of the git work tree containing the CWD."""
    toplevel = git("rev-parse", "--show-toplevel", cwd=Path.cwd()).stdout
    return Path(toplevel.strip())
def tracked_text_files(root: Path) -> list[Path]:
    """Return the tracked regular files under *root* that decode as UTF-8.

    Paths come from ``git ls-files -z`` (NUL-separated, so unusual file names
    survive intact).  Entries that are not regular files on disk, that cannot
    be decoded as UTF-8, or that cannot be read for lack of permission are
    skipped.  Symlinks are skipped too: git tracks the link itself, so
    rewriting through it would modify the link's target — possibly a file
    outside the repository that a later revert could not restore.
    """
    listing = git("ls-files", "-z", cwd=root).stdout
    text_files: list[Path] = []
    for name in listing.split("\0"):
        if not name:
            continue
        path = root / name
        if path.is_symlink() or not path.is_file():
            continue
        try:
            # Decoding the whole file is the text/binary test; the content
            # itself is discarded here.
            path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, PermissionError):
            continue
        text_files.append(path)
    return text_files
def anonymize_content(text: str) -> tuple[str, int]:
    """Return ``(redacted_text, change_count)`` for *text*.

    Lines matching ``COPYRIGHT_LINE_RE``, ``AUTHOR_TAG_RE`` or
    ``AUTHOR_HEADER_RE`` are dropped together with their line terminator, and
    every e-mail address on the remaining lines is replaced by a fixed
    placeholder.  All other text — including existing runs of blank lines — is
    left untouched, so input with nothing to redact comes back unchanged with
    a count of 0.

    ``change_count`` is the number of lines removed plus the number of lines
    in which at least one address was replaced.
    """
    line_patterns = (COPYRIGHT_LINE_RE, AUTHOR_TAG_RE, AUTHOR_HEADER_RE)
    kept: list[str] = []
    changes = 0
    # Split on "\n" only (not str.splitlines) so that re-joining the surviving
    # lines reproduces untouched text exactly.  Working one line at a time
    # also keeps a pattern's ``\s*`` from reaching into the following line.
    for line in text.split("\n"):
        if any(pattern.match(line) for pattern in line_patterns):
            changes += 1
            continue
        # NOTE(review): the placeholder below does not look like a real
        # address — confirm it is the intended replacement text.
        redacted = EMAIL_RE.sub("[email protected]", line)
        if redacted != line:
            changes += 1
        kept.append(redacted)
    return "\n".join(kept), changes
| # --------------------------------------------------------------------------- | |
| # Settings models (environment-variable overrides via pydantic-settings, prefix ANON_; no .env file is configured) | |
| # --------------------------------------------------------------------------- | |
class AnonymizeSettings(BaseSettings):
    """Options for the ``anonymize`` command.

    Built on pydantic-settings with the ``ANON_`` environment prefix; parsing
    of command-line arguments by the settings class itself is disabled.
    """

    model_config = {"cli_parse_args": False, "env_prefix": "ANON_"}

    # Message used for the anonymization commit.
    commit_msg: str = COMMIT_MSG_MARKER
    # When true, report what would change without writing or committing.
    dry_run: bool = False
class UndoSettings(BaseSettings):
    """Options for the ``undo`` command.

    Built on pydantic-settings with the ``ANON_`` environment prefix; parsing
    of command-line arguments by the settings class itself is disabled.
    """

    model_config = {"cli_parse_args": False, "env_prefix": "ANON_"}

    # Commit to revert; ``None`` means "locate it in the history".
    commit_sha: Optional[str] = None
| # --------------------------------------------------------------------------- | |
| # Command implementations | |
| # --------------------------------------------------------------------------- | |
def cmd_anonymize(settings: AnonymizeSettings) -> None:
    """Redact every tracked text file and commit the result.

    Each file returned by ``tracked_text_files`` is passed through
    ``anonymize_content``; files with a non-zero change count are rewritten in
    place (or only reported when ``settings.dry_run`` is set).  Unless this is
    a dry run, the rewritten files are staged and committed with
    ``settings.commit_msg`` under an anonymous author, and the command needed
    to undo the commit is printed.

    Raises:
        subprocess.CalledProcessError: if a git command exits non-zero.
    """
    root = repo_root()
    modified: list[tuple[Path, int]] = []
    for path in tracked_text_files(root):
        original = path.read_text(encoding="utf-8")
        redacted, n = anonymize_content(original)
        if n == 0:
            continue
        if settings.dry_run:
            print(f"[dry-run] would modify {path.relative_to(root)} ({n} line(s))")
        else:
            path.write_text(redacted, encoding="utf-8")
        modified.append((path, n))

    if not modified:
        print("Nothing to anonymize — repository is already clean.")
        return

    total = sum(n for _, n in modified)
    print(
        f"{'Would modify' if settings.dry_run else 'Modified'} "
        f"{len(modified)} file(s), {total} line(s) total."
    )
    if settings.dry_run:
        return

    # Stage only the files rewritten above.  ``git add -u`` would also sweep
    # unrelated uncommitted edits into the anonymization commit, and a later
    # ``undo`` would then revert them too.  Anything the user had already
    # staged before running this command is still part of the commit.
    # Pathspecs are taken literally so names containing glob characters or a
    # leading ":" are not reinterpreted, and paths are added in batches to
    # stay well below command-line length limits.
    rewritten = [str(path.relative_to(root)) for path, _ in modified]
    batch_size = 100
    for start in range(0, len(rewritten), batch_size):
        batch = rewritten[start:start + batch_size]
        git("--literal-pathspecs", "add", "--", *batch, cwd=root)

    # ``git diff --cached --quiet`` exits 0 when the index matches HEAD.
    if git("diff", "--cached", "--quiet", cwd=root, check=False).returncode == 0:
        print("No staged changes after anonymization (files may already be clean).")
        return

    git(
        "commit",
        "--author=Anonymous <[email protected]>",
        "-m", settings.commit_msg,
        cwd=root,
    )
    sha = git("rev-parse", "HEAD", cwd=root).stdout.strip()
    print(f"Committed anonymization as {sha[:12]}.")
    print(f"To undo: uv run anonymize_repo.py undo --commit-sha {sha[:12]}")
def cmd_undo(settings: UndoSettings) -> None:
    """Revert the anonymization commit with ``git revert --no-edit``.

    When ``settings.commit_sha`` is ``None`` the history is searched for the
    most recent commit whose message has a line starting with
    ``COMMIT_MSG_MARKER``; if none is found, a hint is printed to stderr and
    the process exits with status 1.

    Raises:
        subprocess.CalledProcessError: if a git command exits non-zero.
    """
    root = repo_root()
    sha = settings.commit_sha
    if sha is None:
        # Anchor the pattern at the start of a line: the commit created by
        # ``git revert`` has the subject ``Revert "<marker>"``, which an
        # unanchored search would also match — a second undo would then
        # revert the revert and re-apply the anonymization.
        result = git(
            "log",
            "--format=%h",
            "--grep", f"^{COMMIT_MSG_MARKER}",
            "--max-count=20",
            cwd=root,
        )
        candidates = result.stdout.split()
        if not candidates:
            print(
                "Could not find an anonymization commit in recent history.\n"
                "Pass --commit-sha explicitly.",
                file=sys.stderr,
            )
            sys.exit(1)
        # ``git log`` lists newest first, so the first hit is the latest one.
        sha = candidates[0]
        print(f"Found anonymization commit: {sha}")
    git("revert", "--no-edit", sha, cwd=root)
    print(f"Reverted {sha} — original content restored.")
| # --------------------------------------------------------------------------- | |
| # CLI | |
| # --------------------------------------------------------------------------- | |
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser with its ``anonymize`` and ``undo`` sub-commands.

    The chosen sub-command is stored in ``args.command``; one is required.
    """
    top_level = argparse.ArgumentParser(
        description="Anonymize a git repo by removing copyright/author info.",
    )
    commands = top_level.add_subparsers(dest="command", required=True)

    # anonymize [--commit-msg MSG] [--dry-run]
    anonymize = commands.add_parser(
        "anonymize",
        help="Strip copyright and author info and commit.",
    )
    anonymize.add_argument(
        "--commit-msg",
        help="Git commit message (default: marker message).",
        default=COMMIT_MSG_MARKER,
    )
    anonymize.add_argument(
        "--dry-run",
        help="Print what would change without writing or committing.",
        action="store_true",
    )

    # undo [--commit-sha SHA]
    undo = commands.add_parser("undo", help="Revert the anonymization commit.")
    undo.add_argument(
        "--commit-sha",
        help="SHA to revert (auto-detected if omitted).",
        default=None,
    )

    return top_level
def main() -> None:
    """Parse the command line and run the selected sub-command.

    Only options that differ from the parser defaults are passed to the
    settings models.  Passing every value explicitly would always take
    precedence over the ``ANON_``-prefixed environment variables the models
    are configured to read, so those could never have any effect.

    A failing git command is reported on stderr together with git's own
    captured output, and the process exits with status 1.
    """
    args = build_parser().parse_args()
    try:
        if args.command == "anonymize":
            overrides: dict[str, object] = {}
            if args.commit_msg != COMMIT_MSG_MARKER:
                overrides["commit_msg"] = args.commit_msg
            if args.dry_run:
                overrides["dry_run"] = True
            cmd_anonymize(AnonymizeSettings(**overrides))
        elif args.command == "undo":
            overrides = {}
            if args.commit_sha is not None:
                overrides["commit_sha"] = args.commit_sha
            cmd_undo(UndoSettings(**overrides))
    except subprocess.CalledProcessError as exc:
        # The git helper captures output, so without this handler the user
        # would see a traceback but not git's explanation of what went wrong.
        command = " ".join(str(part) for part in exc.cmd)
        print(f"git command failed (exit {exc.returncode}): {command}", file=sys.stderr)
        detail = (exc.stderr or exc.stdout or "").strip()
        if detail:
            print(detail, file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
uv run https://gist.githubusercontent.com/<user>/<gist-id>/raw/anonymize_repo.py anonymize