helper scripts for tokenizer encoding viz
import argparse
import webbrowser
from pathlib import Path
from typing import Any, Callable, Optional, Union

from tokenizers import Tokenizer as RustTokenizer
from tokenizers.tools import EncodingVisualizer
from transformers import AutoTokenizer, PreTrainedTokenizerBase

SAMPLE_TEXT = '''class DyT(nn.Module):
    """
    Dynamic Tanh, Transformers without Normalization
    https://jiachenzhu.github.io/DyT/
    """

    def __init__(self, num_features, alpha_init_value=0.5):
        """Initialize DyT.

        Args:
            num_features (int): Number of feature channels for weight/bias.
            alpha_init_value (float): Initial value for the gating scalar α.
                α is learned (nn.Parameter) and scales the pre-activation
                before tanh to control saturation at initialization.
        """
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(1) * alpha_init_value)
        self.weight = nn.Parameter(torch.ones(num_features))
        self.bias = nn.Parameter(torch.zeros(num_features))

    def forward(self, x):
        x = torch.tanh(self.alpha * x)
        return x * self.weight + self.bias
'''


def tokenizer_report_and_visualize(
    tk: Union[PreTrainedTokenizerBase, RustTokenizer],
    sample_text: str = SAMPLE_TEXT,
    *,
    add_special_tokens: bool = True,
    output_html: Optional[str] = None,
    open_in_browser: bool = False,
    annotation_converter: Optional[Callable[[Any], Any]] = None,
    n_first_tokens: int = 15,
):
    """
    Count tokens and ALWAYS produce a visualization HTML file (works outside notebooks).
    If running in a notebook, the viz will also render inline.
    """
    if isinstance(tk, PreTrainedTokenizerBase):
        name = getattr(tk, "name_or_path", tk.__class__.__name__)
        backend = getattr(tk, "backend_tokenizer", None)
        if backend is None:
            raise ValueError(
                "EncodingVisualizer needs a *fast* tokenizer. "
                "Reload with `use_fast=True` or pass a `tokenizers.Tokenizer`."
            )
        input_ids = tk.encode(
            sample_text,
            add_special_tokens=add_special_tokens,
            padding=False,
            truncation=False,
        )
        tokens = tk.convert_ids_to_tokens(input_ids, skip_special_tokens=False)
        rust_tok = backend
    elif isinstance(tk, RustTokenizer):
        name = "tokenizers.Tokenizer"
        enc = tk.encode(sample_text)
        input_ids, tokens = enc.ids, enc.tokens
        rust_tok = tk
    else:
        raise TypeError(
            "`tk` must be a HF *fast* tokenizer or a `tokenizers.Tokenizer`."
        )

    num_tokens = len(input_ids)
    print(f"tokenizer ({name}): {num_tokens} tokens")
    print(
        "first tokens:",
        tokens[:n_first_tokens] + (["..."] if len(tokens) > n_first_tokens else []),
    )

    # --- build visualizer with default_to_notebook=False to get real HTML text ---
    viz = EncodingVisualizer(
        tokenizer=rust_tok,
        default_to_notebook=False,
        annotation_converter=annotation_converter,
    )
    html_obj = viz(sample_text)  # often a plain str; sometimes an HTML object

    # robust extraction of raw HTML
    html_str = None
    if isinstance(html_obj, str):
        html_str = html_obj
    elif hasattr(html_obj, "data") and html_obj.data is not None:
        html_str = html_obj.data
    elif hasattr(html_obj, "_repr_html_"):
        html_str = html_obj._repr_html_()
    if not html_str:
        # last resort: avoid writing "None"; fail loudly with context
        raise RuntimeError("Could not extract HTML from EncodingVisualizer output.")

    # derive a filename-friendly basename from the tokenizer name resolved above
    src_basename = str(name).split("/")[-1] if name else "unknown"
    path = (
        Path(output_html).resolve()
        if output_html is not None
        else Path.cwd() / f"tokenizer_viz-{src_basename}.html"
    )
    path.write_text(html_str, encoding="utf-8")
    print(f"[saved] {path}")

    # try to open a browser when outside notebooks
    try:
        from IPython import get_ipython

        in_nb = bool(get_ipython() and hasattr(get_ipython(), "kernel"))
    except Exception:
        in_nb = False
    if not in_nb and open_in_browser:
        webbrowser.open(path.as_uri())

    # also render inline if in a notebook
    if in_nb:
        try:
            from IPython.display import HTML, display

            display(HTML(html_str))
        except Exception:
            pass

    return {
        "tokenizer_name": name,
        "num_tokens": num_tokens,
        "input_ids": input_ids,
        "tokens": tokens,
        "html_file": str(path),
    }


# --- example usage ---
# from transformers import AutoTokenizer
# tk = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
# report = tokenizer_report_and_visualize(tk, open_in_browser=True)


def main():
    parser = argparse.ArgumentParser(
        description="Visualize how a tokenizer encodes text",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s gpt2
  %(prog)s bert-base-uncased --text "Hello, world!"
  %(prog)s meta-llama/Llama-2-7b-hf --file input.txt --open
  %(prog)s gpt2 --no-special-tokens --output my_viz.html
""",
    )
    parser.add_argument(
        "tokenizer",
        type=str,
        help="Hugging Face tokenizer name or path (e.g., 'gpt2', 'bert-base-uncased')",
    )
    parser.add_argument(
        "--text",
        type=str,
        default=None,
        help="Text to tokenize (default: built-in code sample)",
    )
    parser.add_argument(
        "--file",
        type=str,
        default=None,
        help="Path to text file to tokenize (overrides --text)",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default=None,
        help="Output HTML file path (default: tokenizer_viz-<name>.html)",
    )
    parser.add_argument(
        "--open",
        action="store_true",
        help="Open visualization in web browser after creation",
    )
    parser.add_argument(
        "--no-special-tokens",
        action="store_true",
        help="Don't add special tokens (BOS, EOS, etc.)",
    )
    parser.add_argument(
        "--n-first-tokens",
        type=int,
        default=15,
        help="Number of first tokens to print (default: 15)",
    )
    args = parser.parse_args()

    # Determine the text to tokenize
    if args.file:
        text = Path(args.file).read_text(encoding="utf-8")
        print(f"[input text] loaded {args.file} ({len(text)} chars)")
    elif args.text:
        text = args.text
    else:
        text = SAMPLE_TEXT
        print("[input text] default sample text")
    # Load tokenizer
    print(f"[tokenizer] loading {args.tokenizer}")
    tk = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True)
    print(f"[tokenizer] vocab size:\t{len(tk)}")

    # Run visualization
    _ = tokenizer_report_and_visualize(
        tk,
        sample_text=text,
        add_special_tokens=not args.no_special_tokens,
        output_html=args.output,
        open_in_browser=args.open,
        n_first_tokens=args.n_first_tokens,
    )


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
import argparse
import re
import webbrowser
from pathlib import Path

DARK_BLOCK = r"""
<meta name="color-scheme" content="dark light">
<style id="dark-mode-overrides">
  :root { color-scheme: light dark; }

  /* High-contrast dark with cyan accent */
  @media (prefers-color-scheme: dark){
    /* base palette */
    :root{
      --bg: #0b0d12;
      --panel: #0f141b;
      --text: #f2f7fb;                   /* very light */
      --muted: #2d333b;
      --chip1: rgba(0, 214, 255, 0.28);  /* cyan tile (even) */
      --chip2: rgba(0, 214, 255, 0.46);  /* cyan tile (odd) */
      --chipb1: rgba(0, 214, 255, 0.62); /* borders (even) */
      --chipb2: rgba(0, 214, 255, 0.78); /* borders (odd) */
      --grid: #3a4048;                   /* hatch line color */
      --accent: #00d6ff;                 /* pure cyan accent */
    }

    html, body { background:var(--bg); color:var(--text); }

    .tokenized-text{
      background:var(--panel) !important;
      color:var(--text) !important;
      box-shadow:none !important;
      border-radius:10px;
    }
    .token{ color:var(--text) !important; }
    .non-token{
      border-top:1px solid var(--muted) !important;
      border-bottom:1px solid var(--muted) !important;
    }

    /* HIGHER CONTRAST TILES */
    .even-token{
      background:var(--chip1) !important;
      border-color:var(--chipb1) !important;
    }
    .odd-token{
      background:var(--chip2) !important;
      border-color:var(--chipb2) !important;
    }

    /* multi-token hatch with cyan blend */
    .even-token.multi-token,.odd-token.multi-token{
      background:
        repeating-linear-gradient(45deg, transparent, transparent 1px, var(--grid) 1px, var(--grid) 2px),
        linear-gradient(to bottom, var(--accent), var(--chip2)) !important;
      border-color:var(--chipb2) !important;
    }
    .multi-token:hover::after{
      background:#0a1220 !important;
      color:var(--text) !important;
    }

    /* special-token chips */
    .special-token:empty::before,
    .special-token:not(:empty):before{
      background:#12343a !important; /* deep teal/cyan-ish */
      color:var(--text) !important;
    }

    /* annotations: keep background = currentColor, ensure label text readable */
    .annotation:before{ color:#0b0d12 !important; } /* dark text on bright bg */
  }
</style>
"""

VIEWPORT_FIX = r"""
<style id="viewport-fix">
  .tokenized-text {
    max-height: none !important;
    overflow-y: visible !important;
    min-height: 100vh;
  }
</style>
"""


def inject_dark_css(html: str) -> str:
    # de-dupe: strip any previously injected blocks before re-inserting
    html = re.sub(r'<style id="dark-mode-overrides".*?</style>', "", html, flags=re.S)
    html = re.sub(r'<style id="viewport-fix".*?</style>', "", html, flags=re.S)
    html = re.sub(
        r'<meta name="color-scheme" content="dark light">\s*', "", html, flags=re.S
    )
    combined = DARK_BLOCK + VIEWPORT_FIX
    # insert before </head> when present, otherwise prepend to the document
    return (
        html.replace("</head>", combined + "\n</head>")
        if "</head>" in html
        else combined + html
    )


def main():
    p = argparse.ArgumentParser(
        description="Inject dark-mode CSS into EncodingVisualizer HTML",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    p.add_argument("input", help="Path to the generated HTML")
    p.add_argument(
        "-o", "--output", help="Write to this file (default: OVERWRITE input)"
    )
    p.add_argument(
        "--backup",
        action="store_true",
        help="Write a .bak alongside before overwriting",
    )
    p.add_argument(
        "--no-open", action="store_true", help="Do not open in browser after writing"
    )
    args = p.parse_args()

    src = Path(args.input)
    out = Path(args.output) if args.output else src
    html = src.read_text(encoding="utf-8")
    patched = inject_dark_css(html)

    if not args.output and args.backup:
        src.with_suffix(src.suffix + ".bak").write_text(html, encoding="utf-8")
    out.write_text(patched, encoding="utf-8")
    print(f"dark-mode CSS injected → {out.resolve()}")

    if not args.no_open:
        try:
            webbrowser.open(out.resolve().as_uri())
        except Exception:
            pass


if __name__ == "__main__":
    main()
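

# --- example usage (a sketch: the script filenames below are assumptions, not names
# given in this gist) ---
# 1) generate the visualization HTML with the first script, e.g.:
#      python tokenizer_viz.py gpt2 -o tokenizer_viz-gpt2.html
# 2) patch that file in place with the dark-mode CSS, keeping a .bak copy:
#      python darkmode_inject.py tokenizer_viz-gpt2.html --backup
# 3) re-running step 2 is safe: inject_dark_css() strips its own blocks before re-inserting.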