Helper scripts for visualizing tokenizer encodings: a CLI that renders how a tokenizer encodes text as an HTML visualization (via tokenizers.tools.EncodingVisualizer), plus a post-processor that injects dark-mode CSS into the generated HTML.
import argparse
import webbrowser
from pathlib import Path
from typing import Any, Callable, Optional, Union

from tokenizers import Tokenizer as RustTokenizer
from tokenizers.tools import EncodingVisualizer
from transformers import AutoTokenizer, PreTrainedTokenizerBase

SAMPLE_TEXT = '''class DyT(nn.Module):
    """
    Dynamic Tanh, Transformers without Normalization
    https://jiachenzhu.github.io/DyT/
    """

    def __init__(self, num_features, alpha_init_value=0.5):
        """Initialize DyT.

        Args:
            num_features (int): Number of feature channels for weight/bias.
            alpha_init_value (float): Initial value for the gating scalar α.
                α is learned (nn.Parameter) and scales the pre-activation
                before tanh to control saturation at initialization.
        """
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(1) * alpha_init_value)
        self.weight = nn.Parameter(torch.ones(num_features))
        self.bias = nn.Parameter(torch.zeros(num_features))

    def forward(self, x):
        x = torch.tanh(self.alpha * x)
        return x * self.weight + self.bias
'''
def tokenizer_report_and_visualize(
    tk: Union[PreTrainedTokenizerBase, RustTokenizer],
    sample_text: str = SAMPLE_TEXT,
    *,
    add_special_tokens: bool = True,
    output_html: Optional[str] = None,
    open_in_browser: bool = False,
    annotation_converter: Optional[Callable[[Any], Any]] = None,
    n_first_tokens: int = 15,
):
"""
Count tokens and ALWAYS produce a visualization HTML file (works outside notebooks).
If running in a notebook, the viz will also render inline.
"""
if isinstance(tk, PreTrainedTokenizerBase):
name = getattr(tk, "name_or_path", tk.__class__.__name__)
backend = getattr(tk, "backend_tokenizer", None)
if backend is None:
raise ValueError(
"EncodingVisualizer needs a *fast* tokenizer. "
"Reload with `use_fast=True` or pass a `tokenizers.Tokenizer`."
)
input_ids = tk.encode(
sample_text,
add_special_tokens=add_special_tokens,
padding=False,
truncation=False,
)
tokens = tk.convert_ids_to_tokens(input_ids, skip_special_tokens=False)
rust_tok = backend
elif isinstance(tk, RustTokenizer):
name = "tokenizers.Tokenizer"
enc = tk.encode(sample_text)
input_ids, tokens = enc.ids, enc.tokens
rust_tok = tk
else:
raise TypeError(
"`tk` must be a HF *fast* tokenizer or a `tokenizers.Tokenizer`."
)
num_tokens = len(input_ids)
print(f"tokenizer ({name}): {num_tokens} tokens")
print(
"first tokens:",
tokens[:n_first_tokens] + (["..."] if len(tokens) > n_first_tokens else []),
)
# --- build visualizer with default_to_notebook=False to get real HTML text ---
viz = EncodingVisualizer(
tokenizer=rust_tok,
default_to_notebook=False,
annotation_converter=annotation_converter,
)
html_obj = viz(sample_text) # often a plain str; sometimes an HTML object
# robust extraction of raw HTML
html_str = None
if isinstance(html_obj, str):
html_str = html_obj
elif hasattr(html_obj, "data") and html_obj.data is not None:
html_str = html_obj.data
elif hasattr(html_obj, "_repr_html_"):
html_str = html_obj._repr_html_()
if not html_str:
# last resort: avoid writing "None"-fail loudly with context
raise RuntimeError("Could not extract HTML from EncodingVisualizer output.")
    # derive the output basename from the resolved `name` so this also works
    # when a bare `tokenizers.Tokenizer` (no `name_or_path`) was passed in
    src_basename = str(name).split("/")[-1] if len(str(name)) > 1 else "unknown"
    path = (
        Path(output_html).resolve()
        if output_html is not None
        else Path.cwd() / f"tokenizer_viz-{src_basename}.html"
    )
    path.write_text(html_str, encoding="utf-8")
    print(f"[saved] {path}")

    # try to open a browser when outside notebooks
    try:
        from IPython import get_ipython

        in_nb = bool(get_ipython() and hasattr(get_ipython(), "kernel"))
    except Exception:
        in_nb = False
    if not in_nb and open_in_browser:
        webbrowser.open(path.as_uri())

    # also render inline if in a notebook
    if in_nb:
        try:
            from IPython.display import HTML, display

            display(HTML(html_str))
        except Exception:
            pass

    return {
        "tokenizer_name": name,
        "num_tokens": num_tokens,
        "input_ids": input_ids,
        "tokens": tokens,
        "html_file": str(path),
    }
# --- example usage ---
# from transformers import AutoTokenizer
# tk = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
# report = tokenizer_report_and_visualize(tk, open_in_browser=True)
def main():
    parser = argparse.ArgumentParser(
        description="Visualize how a tokenizer encodes text",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s gpt2
  %(prog)s bert-base-uncased --text "Hello, world!"
  %(prog)s meta-llama/Llama-2-7b-hf --file input.txt --open
  %(prog)s gpt2 --no-special-tokens --output my_viz.html
""",
    )
    parser.add_argument(
        "tokenizer",
        type=str,
        help="Hugging Face tokenizer name or path (e.g., 'gpt2', 'bert-base-uncased')",
    )
    parser.add_argument(
        "--text",
        type=str,
        default=None,
        help="Text to tokenize (default: built-in code sample)",
    )
    parser.add_argument(
        "--file",
        type=str,
        default=None,
        help="Path to text file to tokenize (overrides --text)",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default=None,
        help="Output HTML file path (default: tokenizer_viz-<name>.html)",
    )
    parser.add_argument(
        "--open",
        action="store_true",
        help="Open visualization in web browser after creation",
    )
    parser.add_argument(
        "--no-special-tokens",
        action="store_true",
        help="Don't add special tokens (BOS, EOS, etc.)",
    )
    parser.add_argument(
        "--n-first-tokens",
        type=int,
        default=15,
        help="Number of first tokens to print (default: 15)",
    )
    args = parser.parse_args()
    # Determine text to tokenize
    if args.file:
        text = Path(args.file).read_text(encoding="utf-8")
        print(f"[input text] loaded {args.file} ({len(text)} chars)")
    elif args.text:
        text = args.text
    else:
        text = SAMPLE_TEXT
        print("[input text] default sample text")

    # Load tokenizer
    print(f"[tokenizer] loading {args.tokenizer}")
    tk = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True)
    print(f"[tokenizer] vocab size:\t{len(tk)}")

    # Run visualization
    _ = tokenizer_report_and_visualize(
        tk,
        sample_text=text,
        add_special_tokens=not args.no_special_tokens,
        output_html=args.output,
        open_in_browser=args.open,
        n_first_tokens=args.n_first_tokens,
    )


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
import argparse
import re
import webbrowser
from pathlib import Path
DARK_BLOCK = r"""
<meta name="color-scheme" content="dark light">
<style id="dark-mode-overrides">
:root { color-scheme: light dark; }
/* High-contrast dark with cyan accent */
@media (prefers-color-scheme: dark){
/* base palette */
:root{
--bg: #0b0d12;
--panel: #0f141b;
--text: #f2f7fb; /* very light */
--muted: #2d333b;
--chip1: rgba(0, 214, 255, 0.28); /* cyan tile (even) */
--chip2: rgba(0, 214, 255, 0.46); /* cyan tile (odd) */
--chipb1: rgba(0, 214, 255, 0.62);/* borders (even) */
--chipb2: rgba(0, 214, 255, 0.78);/* borders (odd) */
--grid: #3a4048; /* hatch line color */
--accent: #00d6ff; /* pure cyan accent */
}
html, body { background:var(--bg); color:var(--text); }
.tokenized-text{
background:var(--panel) !important;
color:var(--text) !important;
box-shadow:none !important;
border-radius:10px;
}
.token{ color:var(--text) !important; }
.non-token{
border-top:1px solid var(--muted) !important;
border-bottom:1px solid var(--muted) !important;
}
/* HIGHER CONTRAST TILES */
.even-token{
background:var(--chip1) !important;
border-color:var(--chipb1) !important;
}
.odd-token{
background:var(--chip2) !important;
border-color:var(--chipb2) !important;
}
/* multi-token hatch with cyan blend */
.even-token.multi-token,.odd-token.multi-token{
background:
repeating-linear-gradient(45deg, transparent, transparent 1px, var(--grid) 1px, var(--grid) 2px),
linear-gradient(to bottom, var(--accent), var(--chip2)) !important;
border-color:var(--chipb2) !important;
}
.multi-token:hover::after{
background:#0a1220 !important;
color:var(--text) !important;
}
/* special-token chips */
.special-token:empty::before,
.special-token:not(:empty):before{
background:#12343a !important; /* deep teal/cyan-ish */
color:var(--text) !important;
}
/* annotations: keep background = currentColor, ensure label text readable */
.annotation:before{ color:#0b0d12 !important; } /* dark text on bright bg */
}
</style>
"""
VIEWPORT_FIX = r"""
<style id="viewport-fix">
.tokenized-text {
max-height: none !important;
overflow-y: visible !important;
min-height: 100vh;
}
</style>
"""
def inject_dark_css(html: str) -> str:
    # de-dupe: strip any previously injected blocks before re-inserting
    html = re.sub(r'<style id="dark-mode-overrides".*?</style>', "", html, flags=re.S)
    html = re.sub(r'<style id="viewport-fix".*?</style>', "", html, flags=re.S)
    html = re.sub(
        r'<meta name="color-scheme" content="dark light">\s*', "", html, flags=re.S
    )
    combined = DARK_BLOCK + VIEWPORT_FIX
    return (
        html.replace("</head>", combined + "\n</head>")
        if "</head>" in html
        else combined + html
    )
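

# --- example usage (programmatic, mirroring the CLI below): a minimal sketch; ---
# --- "tokenizer_viz-gpt2.html" is a placeholder for whatever HTML the viz script wrote ---
# from pathlib import Path
# p = Path("tokenizer_viz-gpt2.html")
# p.write_text(inject_dark_css(p.read_text(encoding="utf-8")), encoding="utf-8")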
def main():
    p = argparse.ArgumentParser(
        description="Inject dark-mode CSS into EncodingVisualizer HTML",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    p.add_argument("input", help="Path to the generated HTML")
    p.add_argument(
        "-o", "--output", help="Write to this file (default: OVERWRITE input)"
    )
    p.add_argument(
        "--backup",
        action="store_true",
        help="Write a .bak alongside before overwriting",
    )
    p.add_argument(
        "--no-open", action="store_true", help="Do not open in browser after writing"
    )
    args = p.parse_args()

    src = Path(args.input)
    out = Path(args.output) if args.output else src
    html = src.read_text(encoding="utf-8")
    patched = inject_dark_css(html)

    if not args.output and args.backup:
        src.with_suffix(src.suffix + ".bak").write_text(html, encoding="utf-8")
    out.write_text(patched, encoding="utf-8")
    print(f"dark-mode CSS injected → {out.resolve()}")

    if not args.no_open:
        try:
            webbrowser.open(out.resolve().as_uri())
        except Exception:
            pass


if __name__ == "__main__":
    main()
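

# --- example workflow (a sketch; the script file names below are placeholders, ---
# --- since the gist's original file names aren't shown here) ---
# 1) generate the visualization HTML with the visualizer script:
#      python tokenizer_viz.py gpt2              # writes tokenizer_viz-gpt2.html by default
# 2) inject the dark-mode CSS with this script:
#      python dark_mode_patch.py tokenizer_viz-gpt2.html --backup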