Skip to content

Instantly share code, notes, and snippets.

@Red-Eyed
Last active March 30, 2026 13:36
Show Gist options
  • Select an option

  • Save Red-Eyed/b85e585b2df08e4c2d963343bd4e50de to your computer and use it in GitHub Desktop.

Select an option

Save Red-Eyed/b85e585b2df08e4c2d963343bd4e50de to your computer and use it in GitHub Desktop.
Anonymize a git repository by stripping copyright/author metadata. Supports undo via git revert.
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "pydantic-settings>=2.7",
# ]
# ///
"""
Anonymize a git repository by stripping copyright/author metadata.
Supports undo via git revert.
Usage:
uv run anonymize_repo.py anonymize [--commit-msg MSG] [--dry-run]
uv run anonymize_repo.py undo [--commit-sha SHA]
"""
from __future__ import annotations
import argparse
import re
import subprocess
import sys
from pathlib import Path
from typing import Optional
from pydantic import Field
from pydantic_settings import BaseSettings
# ---------------------------------------------------------------------------
# Patterns to strip / redact
# ---------------------------------------------------------------------------
# All line-oriented patterns below use ``[^\S\r\n]*`` (horizontal whitespace
# only) wherever optional spacing is allowed.  A plain ``\s*`` would also match
# line breaks, which lets a single match run across several lines and delete
# an unrelated line that merely follows a bare comment marker or "Author:".

# A comment line whose text starts with a copyright notice, e.g.
# "// Copyright 2020 ...", "# (c) ...", " * © ...".
COPYRIGHT_LINE_RE = re.compile(
    r"^[^\S\r\n]*(?://|#|<!--|/\*|\*|--)[^\S\r\n]*"
    # "copyright (c)" / "copyright ©" need no alternatives of their own: they
    # already start with "copyright" and the trailing ".*" consumes the rest.
    r"(?:copyright|\(c\)|©)"
    r".*$",
    re.IGNORECASE | re.MULTILINE,
)

# A comment line carrying a Javadoc/Doxygen style "@author" tag.
AUTHOR_TAG_RE = re.compile(
    r"^[^\S\r\n]*(?://|#|\*|--)[^\S\r\n]*@author\b.*$",
    re.IGNORECASE | re.MULTILINE,
)

# An "Author: ..." / "Authors: ..." line, with or without a comment marker.
# The value must sit on the same line as the key.
AUTHOR_HEADER_RE = re.compile(
    r"^[^\S\r\n]*(?://|#|\*|--|<!--)?[^\S\r\n]*authors?:[^\S\r\n]*.+$",
    re.IGNORECASE | re.MULTILINE,
)

# Anything that looks like an e-mail address, anywhere in the text.
EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+")

# Default commit message; also used to locate the anonymization commit again.
COMMIT_MSG_MARKER = "chore: anonymize repository — remove copyright and author info"
# ---------------------------------------------------------------------------
# Git helpers
# ---------------------------------------------------------------------------
def git(*args: str, cwd: Path, check: bool = True) -> subprocess.CompletedProcess[str]:
    """Run ``git`` with the given arguments inside *cwd*.

    Standard output and standard error are captured and decoded as text.
    *check* is forwarded to ``subprocess.run``, so with the default ``True`` a
    non-zero exit status raises ``subprocess.CalledProcessError``.
    """
    command = ["git"]
    command.extend(args)
    return subprocess.run(
        command,
        check=check,
        text=True,
        capture_output=True,
        cwd=cwd,
    )
def repo_root() -> Path:
    """Return the top-level directory of the repository containing the current directory."""
    completed = git("rev-parse", "--show-toplevel", cwd=Path.cwd())
    # git terminates the path with a newline; drop it before building the Path.
    toplevel = completed.stdout.strip()
    return Path(toplevel)
def tracked_text_files(root: Path) -> list[Path]:
    """Return the tracked regular files under *root* that decode as UTF-8.

    Paths come from ``git ls-files -z`` run in *root* and are returned joined
    onto *root*.  Entries are skipped when they are

    * symbolic links -- ``Path.is_file`` follows links, so without this check
      a tracked link would be returned and a later write would go through it
      to the link target, which may live outside the repository;
    * not regular files on disk;
    * not readable, or not valid UTF-8.
    """
    listing = git("ls-files", "-z", cwd=root)
    text_files: list[Path] = []
    # "-z" separates entries with NUL, which also leaves an empty trailing item.
    for relative in listing.stdout.split("\0"):
        if not relative:
            continue
        candidate = root / relative
        if candidate.is_symlink() or not candidate.is_file():
            continue
        try:
            # Decoding the whole file is the text/binary test; the content
            # itself is not needed here.
            candidate.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            # Best effort: anything unreadable or undecodable is left alone.
            continue
        text_files.append(candidate)
    return text_files
def anonymize_content(text: str) -> tuple[str, int]:
    """Return (redacted_text, change_count).

    Every line matching ``COPYRIGHT_LINE_RE``, ``AUTHOR_TAG_RE`` or
    ``AUTHOR_HEADER_RE`` is removed together with its line break, and every
    e-mail address on the remaining lines is replaced by a placeholder.

    Nothing else is touched.  In particular, runs of blank lines that already
    exist in the file are preserved, so a file with nothing to redact comes
    back unchanged with a count of 0.  ``change_count`` is the number of
    removed lines plus the number of lines in which an address was replaced.
    """
    line_patterns = (COPYRIGHT_LINE_RE, AUTHOR_TAG_RE, AUTHOR_HEADER_RE)
    kept: list[str] = []
    changes = 0
    # Split on "\n" only: it is the one line break the MULTILINE patterns
    # recognise, and joining on it again restores the text exactly
    # (including a trailing newline, or the lack of one).
    for line in text.split("\n"):
        # Matching one line at a time keeps a pattern from ever consuming a
        # neighbouring line.
        if any(pattern.match(line) for pattern in line_patterns):
            changes += 1
            continue
        # NOTE(review): the placeholder below is reproduced exactly as found;
        # it looks like an address-obfuscation artifact -- confirm the
        # intended replacement text.
        redacted = EMAIL_RE.sub("[email protected]", line)
        if redacted != line:
            changes += 1
        kept.append(redacted)
    return "\n".join(kept), changes
# ---------------------------------------------------------------------------
# Settings models (pydantic-settings; fields can be supplied through
# ANON_-prefixed environment variables -- no .env file is configured)
# ---------------------------------------------------------------------------
class AnonymizeSettings(BaseSettings):
    """Options consumed by the ``anonymize`` command."""

    # pydantic-settings configuration: its own CLI parsing is switched off and
    # the environment prefix is "ANON_".
    model_config = {"env_prefix": "ANON_", "cli_parse_args": False}

    # Message used for the anonymization commit.
    commit_msg: str = COMMIT_MSG_MARKER
    # When true, changes are only reported; nothing is written or committed.
    dry_run: bool = False
class UndoSettings(BaseSettings):
    """Options consumed by the ``undo`` command."""

    # pydantic-settings configuration: its own CLI parsing is switched off and
    # the environment prefix is "ANON_".
    model_config = {"env_prefix": "ANON_", "cli_parse_args": False}

    # Commit to revert; None means "search the history for it".
    commit_sha: Optional[str] = None
# ---------------------------------------------------------------------------
# Command implementations
# ---------------------------------------------------------------------------
def _read_normalized(path: Path) -> tuple[str, bool]:
    """Read *path* as UTF-8 and return (text with "\\n" line breaks, uses_crlf).

    ``uses_crlf`` is true only when every line break in the file is "\\r\\n",
    so the caller can write the file back in the same style.
    """
    raw = path.read_bytes().decode("utf-8")
    crlf = raw.count("\r\n")
    uses_crlf = crlf > 0 and raw.count("\n") == crlf and raw.count("\r") == crlf
    # Same translation text-mode reading applies: "\r\n" and lone "\r" -> "\n".
    return raw.replace("\r\n", "\n").replace("\r", "\n"), uses_crlf


def _stage_files(root: Path, paths: list[Path], batch_size: int = 100) -> None:
    """Stage exactly *paths* (files under *root*), a batch at a time."""
    relative = [str(p.relative_to(root)) for p in paths]
    # Batching keeps each command line well below operating-system limits.
    for start in range(0, len(relative), batch_size):
        batch = relative[start:start + batch_size]
        # --literal-pathspecs: file names containing glob or pathspec magic
        # characters must only ever match themselves.
        git("--literal-pathspecs", "add", "--", *batch, cwd=root)


def cmd_anonymize(settings: AnonymizeSettings) -> None:
    """Redact all tracked text files and commit the result.

    With ``settings.dry_run`` the affected files are only listed.  Otherwise
    each affected file is rewritten in place (keeping its line-ending style
    when the file consistently uses CRLF), only the rewritten files are
    staged, and a commit with ``settings.commit_msg`` is created.

    Unrelated modifications in the working tree are deliberately not staged,
    so they neither end up in the anonymization commit nor get reverted by a
    later undo.  Content that was already staged before the run is still part
    of the index and is therefore committed as well.
    """
    root = repo_root()
    files = tracked_text_files(root)
    modified: list[tuple[Path, int]] = []
    for path in files:
        original, uses_crlf = _read_normalized(path)
        redacted, n = anonymize_content(original)
        if n == 0:
            continue
        if settings.dry_run:
            print(f"[dry-run] would modify {path.relative_to(root)} ({n} line(s))")
        else:
            if uses_crlf:
                redacted = redacted.replace("\n", "\r\n")
            # newline="" writes the text verbatim, without platform-specific
            # line-break translation.
            path.write_text(redacted, encoding="utf-8", newline="")
        modified.append((path, n))
    if not modified:
        print("Nothing to anonymize — repository is already clean.")
        return
    total = sum(n for _, n in modified)
    print(
        f"{'Would modify' if settings.dry_run else 'Modified'} "
        f"{len(modified)} file(s), {total} line(s) total."
    )
    if settings.dry_run:
        return
    _stage_files(root, [path for path, _ in modified])
    # Exit status 0 from "diff --cached --quiet" means the index equals HEAD.
    if git("diff", "--cached", "--quiet", cwd=root, check=False).returncode == 0:
        print("No staged changes after anonymization (files may already be clean).")
        return
    # NOTE(review): the author address is reproduced exactly as found; it
    # looks like an address-obfuscation artifact -- confirm the intended value.
    git(
        "commit",
        "--author=Anonymous <[email protected]>",
        "-m", settings.commit_msg,
        cwd=root,
    )
    sha = git("rev-parse", "HEAD", cwd=root).stdout.strip()
    print(f"Committed anonymization as {sha[:12]}.")
    print(f"To undo: uv run anonymize_repo.py undo --commit-sha {sha[:12]}")
def cmd_undo(settings: UndoSettings) -> None:
    """Revert the anonymization commit.

    The commit is ``settings.commit_sha`` when given.  Otherwise the newest
    commit with a message line starting with ``COMMIT_MSG_MARKER`` is used;
    if none is found, an error is printed to stderr and the process exits
    with status 1.
    """
    root = repo_root()
    sha = settings.commit_sha
    if sha is None:
        result = git(
            "log", "--oneline",
            # Anchored to the start of a line: the commit created by
            # "git revert" has the subject 'Revert "<marker>"', which contains
            # the marker too.  Unanchored, a repeated undo would pick that
            # revert commit and re-apply the anonymization.  The marker holds
            # no regular-expression metacharacters, so it can be used as the
            # pattern directly.
            "--grep", f"^{COMMIT_MSG_MARKER}",
            "--max-count=20",
            cwd=root,
        )
        candidates = [entry for entry in result.stdout.strip().splitlines() if entry]
        if not candidates:
            print(
                "Could not find an anonymization commit in recent history.\n"
                "Pass --commit-sha explicitly.",
                file=sys.stderr,
            )
            sys.exit(1)
        # "--oneline" prints "<abbreviated sha> <subject>", newest first.
        sha = candidates[0].split()[0]
        print(f"Found anonymization commit: {sha}")
    git("revert", "--no-edit", sha, cwd=root)
    print(f"Reverted {sha} — original content restored.")
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser with its ``anonymize`` and ``undo`` sub-commands."""
    # Sub-command name -> (help text, [(flag, add_argument keyword arguments)]).
    command_table = {
        "anonymize": (
            "Strip copyright and author info and commit.",
            [
                (
                    "--commit-msg",
                    {
                        "default": COMMIT_MSG_MARKER,
                        "help": "Git commit message (default: marker message).",
                    },
                ),
                (
                    "--dry-run",
                    {
                        "action": "store_true",
                        "help": "Print what would change without writing or committing.",
                    },
                ),
            ],
        ),
        "undo": (
            "Revert the anonymization commit.",
            [
                (
                    "--commit-sha",
                    {
                        "default": None,
                        "help": "SHA to revert (auto-detected if omitted).",
                    },
                ),
            ],
        ),
    }

    top_level = argparse.ArgumentParser(
        description="Anonymize a git repo by removing copyright/author info.",
    )
    # The chosen sub-command name ends up in ``args.command``; one is mandatory.
    commands = top_level.add_subparsers(dest="command", required=True)
    for command_name, (summary, options) in command_table.items():
        command_parser = commands.add_parser(command_name, help=summary)
        for flag, keywords in options:
            command_parser.add_argument(flag, **keywords)
    return top_level
def main() -> None:
    """Parse the command line and run the selected sub-command.

    Only options that differ from the parser's built-in defaults are handed
    to the settings models.  Values passed to a settings constructor take
    precedence over environment variables, so forwarding the argparse
    defaults unconditionally would make the ``ANON_*`` overrides impossible
    to use.
    """
    args = build_parser().parse_args()
    if args.command == "anonymize":
        overrides: dict[str, object] = {}
        if args.commit_msg != COMMIT_MSG_MARKER:
            overrides["commit_msg"] = args.commit_msg
        if args.dry_run:
            overrides["dry_run"] = True
        cmd_anonymize(AnonymizeSettings(**overrides))
    elif args.command == "undo":
        overrides = {}
        if args.commit_sha is not None:
            overrides["commit_sha"] = args.commit_sha
        cmd_undo(UndoSettings(**overrides))


if __name__ == "__main__":
    main()
@Red-Eyed
Copy link
Copy Markdown
Author

Red-Eyed commented Mar 30, 2026

uv run https://gist.githubusercontent.com/<user>/<gist-id>/raw/anonymize_repo.py anonymize

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment