session store sanitizer (privacy respecting)
import argparse
import json
import os
import re
import hashlib

# -----------------------
# Settings / patterns
# -----------------------
REDACTED = "REDACTED"

# Keys that should be REMOVED from the output entirely (not redacted)
DROP_KEYS_LOWER = {
    "structuredclonestate",
}

# Keys that should be redacted (value replaced with REDACTED)
SENSITIVE_KEY_RE = re.compile(
    r"(pass(word)?|pwd|secret|token|jwt|bearer|auth(orization)?|session|sid|sess|csrf|xsrf|cookie|"
    r"localstorage|sessionstorage|indexeddb|storage|formdata|postdata|"
    r"credit|card|iban|ssn|email|phone|address|"
    r"device(id)?|client(id)?|telemetry|fingerprint|api[_-]?key|private|key|signature)",
    re.IGNORECASE,
)

# Human-readable titles/labels etc that often contain private info
TITLE_KEY_RE = re.compile(
    r"(^title$|tabtitle|pagetitle|entrytitle|label|caption|headline|subject|name$)",
    re.IGNORECASE,
)

# If a value looks like a token/credential, redact it even if the key is innocent
SENSITIVE_VALUE_RE = re.compile(
    r"(eyJ[a-zA-Z0-9_\-]+=*\.[a-zA-Z0-9_\-]+=*\.[a-zA-Z0-9_\-]+=*)"  # JWT-ish
    r"|(\bBearer\s+[A-Za-z0-9\-_\.=]+\b)"
    r"|(\b[A-Fa-f0-9]{32,}\b)"  # long hex
    r"|(\b[A-Za-z0-9_\-]{40,}\b)",  # long random token-ish
    re.IGNORECASE,
)

# Detect URL-like strings (whole-string or embedded)
URL_EMBED_RE = re.compile(r"(https?|file)://[^\s\"\'<>]+", re.IGNORECASE)
URL_WHOLE_RE = re.compile(r"^(https?|file)://", re.IGNORECASE)


def url_id(u: str, n: int = 12) -> str:
    """Stable short id so dev can see uniqueness without leaking URL data."""
    h = hashlib.sha256(u.encode("utf-8", errors="ignore")).hexdigest()
    return h[:n]


def sanitize_url_opaque(url: str) -> str:
    """
    Replace any URL with an opaque placeholder that does not contain domain/path/query/fragment.
    """
    return f"url:{REDACTED}:{url_id(url)}"


def sanitize_string(s: str) -> str:
    # Whole-string URL
    st = s.strip()
    if URL_WHOLE_RE.match(st):
        return sanitize_url_opaque(st)

    # Embedded URLs inside longer strings
    def _url_repl(m: re.Match) -> str:
        return sanitize_url_opaque(m.group(0))

    s2 = URL_EMBED_RE.sub(_url_repl, s)

    # Token-ish content
    if SENSITIVE_VALUE_RE.search(s2):
        return REDACTED
    return s2


def sanitize(obj, *, keep_keys: set[str]):
    """
    Recursively sanitize JSON-like structures.
    - remove DROP_KEYS_LOWER entirely
    - redact TITLE_KEY_RE keys
    - redact SENSITIVE_KEY_RE keys
    - sanitize strings (URLs -> opaque; token-ish -> REDACTED)
    """
    if isinstance(obj, dict):
        new = {}
        for k, v in obj.items():
            ks = str(k)
            ksl = ks.lower()
            # Keep key forcefully (use carefully)
            if ks in keep_keys:
                new[k] = sanitize(v, keep_keys=keep_keys)
                continue
            # Drop keys entirely
            if ksl in DROP_KEYS_LOWER:
                continue
            # Redact titles/labels
            if TITLE_KEY_RE.search(ks):
                new[k] = REDACTED
                continue
            # Redact sensitive keys
            if SENSITIVE_KEY_RE.search(ks):
                new[k] = REDACTED
                continue
            # Recurse
            new[k] = sanitize(v, keep_keys=keep_keys)
        return new
    if isinstance(obj, list):
        return [sanitize(x, keep_keys=keep_keys) for x in obj]
    if isinstance(obj, str):
        return sanitize_string(obj)
    # numbers / bool / None unchanged
    return obj


def main():
    ap = argparse.ArgumentParser(
        description="Sanitize sessionstore-like JSON: drop structuredCloneState, redact titles/tokens/cookies/storage, and remove domains from URLs."
    )
    ap.add_argument("input", help="Input JSON file")
    ap.add_argument("-o", "--output", required=True, help="Output sanitized JSON file")
    ap.add_argument(
        "--keep-key",
        action="append",
        default=[],
        help="Key name to keep (never drop/redact), can be repeated",
    )
    ap.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    args = ap.parse_args()

    with open(args.input, "r", encoding="utf-8") as f:
        data = json.load(f)

    sanitized = sanitize(data, keep_keys=set(args.keep_key))

    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        if args.pretty:
            json.dump(sanitized, f, ensure_ascii=False, indent=2)
        else:
            json.dump(sanitized, f, ensure_ascii=False, separators=(",", ":"))

    print(f"Sanitized -> {args.output}")


if __name__ == "__main__":
    main()
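Below is a rough usage sketch, not part of the gist itself: it assumes the script has been saved under a hypothetical name (session_sanitizer.py) and that the sample sessionstore-like data is made up. It shows the CLI invocation implied by the argparse options, plus an in-process call to sanitize() so the drop/redact/URL-hash behaviour is visible on a tiny example.

# Usage sketch (hypothetical file names and sample data).
#
# CLI run implied by the argparse options above:
#   python session_sanitizer.py sessionstore.json -o sanitized.json --pretty
#
# In-process example, assuming the definitions above are in scope:
sample = {
    "windows": [
        {
            "tabs": [
                {
                    "title": "Inbox (3) - mail",                    # TITLE_KEY_RE -> "REDACTED"
                    "url": "https://example.com/inbox?folder=all",  # URL -> "url:REDACTED:<12-char sha256 prefix>"
                    "cookies": [{"name": "sid", "value": "abc"}],   # SENSITIVE_KEY_RE ("cookie") -> "REDACTED"
                    "structuredCloneState": "AAAA",                 # DROP_KEYS_LOWER -> removed entirely
                }
            ]
        }
    ]
}

print(json.dumps(sanitize(sample, keep_keys=set()), ensure_ascii=False, indent=2))

Passing --keep-key (or keep_keys= in-process) whitelists a key by exact name so it is neither dropped nor redacted; use it sparingly, since it bypasses every check for that key.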