Helper scripts for visualizing tokenizer encodings: a CLI that renders how a tokenizer encodes text as an HTML visualization (via tokenizers.tools.EncodingVisualizer), plus a post-processor that injects dark-mode CSS into the generated HTML.
import argparse
import webbrowser
from pathlib import Path
from typing import Any, Callable, Optional, Union

from tokenizers import Tokenizer as RustTokenizer
from tokenizers.tools import EncodingVisualizer
from transformers import AutoTokenizer, PreTrainedTokenizerBase

SAMPLE_TEXT = '''class DyT(nn.Module):
    """
    Dynamic Tanh, Transformers without Normalization
    https://jiachenzhu.github.io/DyT/
    """

    def __init__(self, num_features, alpha_init_value=0.5):
        """Initialize DyT.

        Args:
            num_features (int): Number of feature channels for weight/bias.
            alpha_init_value (float): Initial value for the gating scalar α.
                α is learned (nn.Parameter) and scales the pre-activation
                before tanh to control saturation at initialization.
        """
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(1) * alpha_init_value)
        self.weight = nn.Parameter(torch.ones(num_features))
        self.bias = nn.Parameter(torch.zeros(num_features))

    def forward(self, x):
        x = torch.tanh(self.alpha * x)
        return x * self.weight + self.bias
'''
def tokenizer_report_and_visualize(
    tk: Union[PreTrainedTokenizerBase, RustTokenizer],
    sample_text: str = SAMPLE_TEXT,
    *,
    add_special_tokens: bool = True,
    output_html: Optional[str] = None,
    open_in_browser: bool = False,
    annotation_converter: Optional[Callable[[Any], Any]] = None,
    n_first_tokens: int = 15,
):
"""
Count tokens and ALWAYS produce a visualization HTML file (works outside notebooks).
If running in a notebook, the viz will also render inline.
"""
if isinstance(tk, PreTrainedTokenizerBase):
name = getattr(tk, "name_or_path", tk.__class__.__name__)
backend = getattr(tk, "backend_tokenizer", None)
if backend is None:
raise ValueError(
"EncodingVisualizer needs a *fast* tokenizer. "
"Reload with `use_fast=True` or pass a `tokenizers.Tokenizer`."
)
input_ids = tk.encode(
sample_text,
add_special_tokens=add_special_tokens,
padding=False,
truncation=False,
)
tokens = tk.convert_ids_to_tokens(input_ids, skip_special_tokens=False)
rust_tok = backend
elif isinstance(tk, RustTokenizer):
name = "tokenizers.Tokenizer"
enc = tk.encode(sample_text)
input_ids, tokens = enc.ids, enc.tokens
rust_tok = tk
else:
raise TypeError(
"`tk` must be a HF *fast* tokenizer or a `tokenizers.Tokenizer`."
)
num_tokens = len(input_ids)
print(f"tokenizer ({name}): {num_tokens} tokens")
print(
"first tokens:",
tokens[:n_first_tokens] + (["..."] if len(tokens) > n_first_tokens else []),
)
# --- build visualizer with default_to_notebook=False to get real HTML text ---
viz = EncodingVisualizer(
tokenizer=rust_tok,
default_to_notebook=False,
annotation_converter=annotation_converter,
)
html_obj = viz(sample_text) # often a plain str; sometimes an HTML object
# robust extraction of raw HTML
html_str = None
if isinstance(html_obj, str):
html_str = html_obj
elif hasattr(html_obj, "data") and html_obj.data is not None:
html_str = html_obj.data
elif hasattr(html_obj, "_repr_html_"):
html_str = html_obj._repr_html_()
if not html_str:
# last resort: avoid writing "None"-fail loudly with context
raise RuntimeError("Could not extract HTML from EncodingVisualizer output.")
    # derive the output basename from the resolved `name` so this also works
    # when a bare `tokenizers.Tokenizer` (no `name_or_path`) was passed in
    src_basename = str(name).split("/")[-1] if len(str(name)) > 1 else "unknown"
    path = (
        Path(output_html).resolve()
        if output_html is not None
        else Path.cwd() / f"tokenizer_viz-{src_basename}.html"
    )
    path.write_text(html_str, encoding="utf-8")
    print(f"[saved] {path}")

    # try to open a browser when outside notebooks
    try:
        from IPython import get_ipython

        in_nb = bool(get_ipython() and hasattr(get_ipython(), "kernel"))
    except Exception:
        in_nb = False
    if not in_nb and open_in_browser:
        webbrowser.open(path.as_uri())

    # also render inline if in a notebook
    if in_nb:
        try:
            from IPython.display import HTML, display

            display(HTML(html_str))
        except Exception:
            pass

    return {
        "tokenizer_name": name,
        "num_tokens": num_tokens,
        "input_ids": input_ids,
        "tokens": tokens,
        "html_file": str(path),
    }
# --- example usage ---
# from transformers import AutoTokenizer
# tk = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
# report = tokenizer_report_and_visualize(tk, open_in_browser=True)
def main():
    parser = argparse.ArgumentParser(
        description="Visualize how a tokenizer encodes text",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s gpt2
  %(prog)s bert-base-uncased --text "Hello, world!"
  %(prog)s meta-llama/Llama-2-7b-hf --file input.txt --open
  %(prog)s gpt2 --no-special-tokens --output my_viz.html
""",
    )
    parser.add_argument(
        "tokenizer",
        type=str,
        help="Hugging Face tokenizer name or path (e.g., 'gpt2', 'bert-base-uncased')",
    )
    parser.add_argument(
        "--text",
        type=str,
        default=None,
        help="Text to tokenize (default: built-in code sample)",
    )
    parser.add_argument(
        "--file",
        type=str,
        default=None,
        help="Path to text file to tokenize (overrides --text)",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default=None,
        help="Output HTML file path (default: tokenizer_viz-<name>.html)",
    )
    parser.add_argument(
        "--open",
        action="store_true",
        help="Open visualization in web browser after creation",
    )
    parser.add_argument(
        "--no-special-tokens",
        action="store_true",
        help="Don't add special tokens (BOS, EOS, etc.)",
    )
    parser.add_argument(
        "--n-first-tokens",
        type=int,
        default=15,
        help="Number of first tokens to print (default: 15)",
    )
    args = parser.parse_args()
    # Determine text to tokenize
    if args.file:
        text = Path(args.file).read_text(encoding="utf-8")
        print(f"[input text] loaded {args.file} ({len(text)} chars)")
    elif args.text:
        text = args.text
    else:
        text = SAMPLE_TEXT
        print("[input text] default sample text")

    # Load tokenizer
    print(f"[tokenizer] loading {args.tokenizer}")
    tk = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True)
    print(f"[tokenizer] vocab size:\t{len(tk)}")

    # Run visualization
    _ = tokenizer_report_and_visualize(
        tk,
        sample_text=text,
        add_special_tokens=not args.no_special_tokens,
        output_html=args.output,
        open_in_browser=args.open,
        n_first_tokens=args.n_first_tokens,
    )


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
import argparse
import re
import webbrowser
from pathlib import Path
DARK_BLOCK = r"""
<meta name="color-scheme" content="dark light">
<style id="dark-mode-overrides">
:root { color-scheme: light dark; }
/* High-contrast dark with cyan accent */
@media (prefers-color-scheme: dark){
/* base palette */
:root{
--bg: #0b0d12;
--panel: #0f141b;
--text: #f2f7fb; /* very light */
--muted: #2d333b;
--chip1: rgba(0, 214, 255, 0.28); /* cyan tile (even) */
--chip2: rgba(0, 214, 255, 0.46); /* cyan tile (odd) */
--chipb1: rgba(0, 214, 255, 0.62);/* borders (even) */
--chipb2: rgba(0, 214, 255, 0.78);/* borders (odd) */
--grid: #3a4048; /* hatch line color */
--accent: #00d6ff; /* pure cyan accent */
}
html, body { background:var(--bg); color:var(--text); }
.tokenized-text{
background:var(--panel) !important;
color:var(--text) !important;
box-shadow:none !important;
border-radius:10px;
}
.token{ color:var(--text) !important; }
.non-token{
border-top:1px solid var(--muted) !important;
border-bottom:1px solid var(--muted) !important;
}
/* HIGHER CONTRAST TILES */
.even-token{
background:var(--chip1) !important;
border-color:var(--chipb1) !important;
}
.odd-token{
background:var(--chip2) !important;
border-color:var(--chipb2) !important;
}
/* multi-token hatch with cyan blend */
.even-token.multi-token,.odd-token.multi-token{
background:
repeating-linear-gradient(45deg, transparent, transparent 1px, var(--grid) 1px, var(--grid) 2px),
linear-gradient(to bottom, var(--accent), var(--chip2)) !important;
border-color:var(--chipb2) !important;
}
.multi-token:hover::after{
background:#0a1220 !important;
color:var(--text) !important;
}
/* special-token chips */
.special-token:empty::before,
.special-token:not(:empty):before{
background:#12343a !important; /* deep teal/cyan-ish */
color:var(--text) !important;
}
/* annotations: keep background = currentColor, ensure label text readable */
.annotation:before{ color:#0b0d12 !important; } /* dark text on bright bg */
}
</style>
"""
VIEWPORT_FIX = r"""
<style id="viewport-fix">
.tokenized-text {
max-height: none !important;
overflow-y: visible !important;
min-height: 100vh;
}
</style>
"""
def inject_dark_css(html: str) -> str:
    # de-dupe: strip any previously injected blocks before re-inserting
    html = re.sub(r'<style id="dark-mode-overrides".*?</style>', "", html, flags=re.S)
    html = re.sub(r'<style id="viewport-fix".*?</style>', "", html, flags=re.S)
    html = re.sub(
        r'<meta name="color-scheme" content="dark light">\s*', "", html, flags=re.S
    )
    combined = DARK_BLOCK + VIEWPORT_FIX
    return (
        html.replace("</head>", combined + "\n</head>")
        if "</head>" in html
        else combined + html
    )
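

# --- example usage (programmatic, mirroring the CLI below): a minimal sketch; ---
# --- "tokenizer_viz-gpt2.html" is a placeholder for whatever HTML the viz script wrote ---
# from pathlib import Path
# p = Path("tokenizer_viz-gpt2.html")
# p.write_text(inject_dark_css(p.read_text(encoding="utf-8")), encoding="utf-8")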
def main():
    p = argparse.ArgumentParser(
        description="Inject dark-mode CSS into EncodingVisualizer HTML",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    p.add_argument("input", help="Path to the generated HTML")
    p.add_argument(
        "-o", "--output", help="Write to this file (default: OVERWRITE input)"
    )
    p.add_argument(
        "--backup",
        action="store_true",
        help="Write a .bak alongside before overwriting",
    )
    p.add_argument(
        "--no-open", action="store_true", help="Do not open in browser after writing"
    )
    args = p.parse_args()

    src = Path(args.input)
    out = Path(args.output) if args.output else src
    html = src.read_text(encoding="utf-8")
    patched = inject_dark_css(html)

    if not args.output and args.backup:
        src.with_suffix(src.suffix + ".bak").write_text(html, encoding="utf-8")
    out.write_text(patched, encoding="utf-8")
    print(f"dark-mode CSS injected → {out.resolve()}")

    if not args.no_open:
        try:
            webbrowser.open(out.resolve().as_uri())
        except Exception:
            pass


if __name__ == "__main__":
    main()
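

# --- example workflow (a sketch; the script file names below are placeholders, ---
# --- since the gist's original file names aren't shown here) ---
# 1) generate the visualization HTML with the visualizer script:
#      python tokenizer_viz.py gpt2              # writes tokenizer_viz-gpt2.html by default
# 2) inject the dark-mode CSS with this script:
#      python dark_mode_patch.py tokenizer_viz-gpt2.html --backup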