krammnic / .py
Created June 5, 2025 22:36
compress tokenizer
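The first snippet shrinks a fast Hugging Face tokenizer: it loads the Gemma 2 2B tokenizer, saves the full-size copy for reference, and then retrains it on a small corpus so that only vocab_keep_items (5000) vocabulary entries remain.
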
from transformers import AutoTokenizer
mname = "google/gemma-2-2b-it" # or any checkpoint that has a fast tokenizer.
vocab_keep_items = 5000
tokenizer = AutoTokenizer.from_pretrained(mname)
assert tokenizer.is_fast, "This only works for fast tokenizers."
tokenizer.save_pretrained("big-tokenizer")
# training_corpus must be an iterator over lists of texts. The two
# documents below are placeholders; substitute your own data.
training_corpus = [
    ["This is the first placeholder document."],
    ["This is the second placeholder document.", "It has two sentences."],
]
# Retrain on the corpus, keeping only vocab_keep_items vocabulary entries.
new_tokenizer = tokenizer.train_new_from_iterator(training_corpus, vocab_size=vocab_keep_items)
new_tokenizer.save_pretrained("small-tokenizer")
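The second snippet smoke-tests the cropped tokenizer through torchtune's HuggingFaceModelTokenizer. The three paths below are assumed to point at the artifacts from the step above: the cropped tokenizer.json plus the tokenizer_config.json and generation_config.json shipped with the Gemma checkpoint.
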
from torchtune.data import Message
from torchtune.modules.transforms.tokenizers import HuggingFaceModelTokenizer
TOKENIZER_CONFIG_PATH = "tokenizer_config_gemma.json"
GENERATION_CONFIG_PATH = "generation_config_gemma.json"
TOKENIZER_PATH = "tokenizer_gemma_cropped.json"
def test_huggingface_model_tokenizer():
    try:
        # Truncated in the gist; assumed body: load the cropped artifacts.
        tokenizer = HuggingFaceModelTokenizer(
            tokenizer_json_path=TOKENIZER_PATH,
            tokenizer_config_json_path=TOKENIZER_CONFIG_PATH,
            generation_config_path=GENERATION_CONFIG_PATH,
        )
    except Exception as e:
        raise AssertionError(f"Failed to load the cropped tokenizer: {e}")
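
A minimal usage sketch, not in the original gist, of how the loaded tokenizer would be exercised. It assumes torchtune's Message / tokenize_messages chat API, where tokenize_messages returns parallel lists of token ids and mask booleans:

# Sketch only: run the cropped tokenizer over a tiny two-turn dialog.
def run_smoke_test():
    tokenizer = HuggingFaceModelTokenizer(
        tokenizer_json_path=TOKENIZER_PATH,
        tokenizer_config_json_path=TOKENIZER_CONFIG_PATH,
        generation_config_path=GENERATION_CONFIG_PATH,
    )
    messages = [
        Message(role="user", content="Hello, world!"),
        Message(role="assistant", content="Hi there."),
    ]
    tokens, mask = tokenizer.tokenize_messages(messages)
    assert len(tokens) == len(mask)  # one mask entry per token id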