Skip to content

Instantly share code, notes, and snippets.

@eustlb
Last active December 5, 2025 19:47
Show Gist options
  • Select an option

  • Save eustlb/6d8b8e62e5f38c55e7d2adeca169de7c to your computer and use it in GitHub Desktop.

Select an option

Save eustlb/6d8b8e62e5f38c55e7d2adeca169de7c to your computer and use it in GitHub Desktop.
LASR tokenizer: transformers vs. sentencepiece
from transformers import LasrTokenizer, LasrFeatureExtractor, LasrProcessor
from transformers.tokenization_utils_sentencepiece import SentencePieceExtractor
from huggingface_hub import hf_hub_download
import sentencepiece
from datasets import load_dataset
from tqdm import tqdm
import unicodedata
import re
# Fetch the SentencePiece model file `spiece.model` from the
# `wuketest/lasr_test` Hub repository; `path` is handed to both tokenizers.
path = hf_hub_download(repo_id='wuketest/lasr_test', filename='spiece.model')

# Extract the vocabulary from the SentencePiece model. Of the three returned
# values only `vocab_scores` is used in the code visible here.
vocab_ids, vocab_scores, merges = SentencePieceExtractor(path).extract()
# Rename the first vocabulary entry to "<pad>" while keeping its second field
# (presumably the token's score -- TODO confirm against the extractor).
vocab_scores[0] = ("<pad>", vocab_scores[0][1])

# Build the transformers-side tokenizer from the patched vocabulary.
tokenizer = LasrTokenizer(vocab=vocab_scores)
# NOTE(review): presumably disabled so no EOS id is appended and the output can
# be compared one-to-one with raw SentencePiece ids -- confirm.
tokenizer.add_eos_token = False

# Bundle the tokenizer with a default-constructed feature extractor and save
# the resulting processor under "lasr_test".
feature_extractor = LasrFeatureExtractor()
processor = LasrProcessor(feature_extractor, tokenizer)
processor.save_pretrained("lasr_test")

# Reference implementation: SentencePiece loaded from the same model file.
sp_tokenizer = sentencepiece.SentencePieceProcessor()
sp_tokenizer.load(path)

# Smoke checks: both tokenizers must produce identical ids, with and without
# a leading space in the input.
assert tokenizer.encode('hello world') == sp_tokenizer.EncodeAsIds('hello world')
assert tokenizer.encode(' hello world') == sp_tokenizer.EncodeAsIds(' hello world')

# XNLI validation split with all languages; its "premise" column is iterated
# by the comparison loop later in this script.
xnli = load_dataset("xnli", "all_languages", split="validation")
# Alias of the reference tokenizer; not referenced by the code visible here.
original = sp_tokenizer
def verify(lang, text):
    """Check that both tokenizers encode ``text`` to the same ids.

    The text is encoded with the module-level reference ``sp_tokenizer``
    (via ``EncodeAsIds``) and with the module-level ``tokenizer`` (via
    ``encode``), and the two id sequences are compared.

    Args:
        lang: Language label of the sample; only used in the failure message.
        text: The string to encode with both tokenizers.

    Raises:
        AssertionError: If the two encodings differ.
    """
    reference_ids = sp_tokenizer.EncodeAsIds(text)
    candidate_ids = tokenizer.encode(text)
    assert candidate_ids == reference_ids, f"Fast encode error: {lang} – {text}"
# Run the tokenizer comparison over every language of every XNLI premise.
for premise in tqdm(xnli["premise"]):
    for lang, text in premise.items():
        # English is compared verbatim; every other language is simplified
        # first so that only the remaining characters are compared.
        if lang != "en":
            # Decompose characters (NFKD) so accents become separate
            # combining marks, then drop those marks.
            decomposed = unicodedata.normalize('NFKD', text)
            without_marks = ''.join(
                ch for ch in decomposed if not unicodedata.combining(ch)
            )
            # Strip U+200B/U+200C/U+200D (zero-width space/non-joiner/joiner),
            # U+200F (right-to-left mark) and U+FEFF (BOM / zero-width no-break
            # space).
            text = re.sub(r'[\u200b\u200c\u200d\u200f\ufeff]', '', without_marks)
        verify(lang, text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment