Lasr tokenizer: transformers vs. sentencepiece
from transformers import LasrTokenizer, LasrFeatureExtractor, LasrProcessor
from transformers.tokenization_utils_sentencepiece import SentencePieceExtractor
from huggingface_hub import hf_hub_download
import sentencepiece
from datasets import load_dataset
from tqdm import tqdm
import unicodedata
import re

# Download the reference sentencepiece model and extract its vocab, scores, and merges.
path = hf_hub_download(repo_id='wuketest/lasr_test', filename='spiece.model')
vocab_ids, vocab_scores, merges = SentencePieceExtractor(path).extract()

# Rename the first vocab entry to "<pad>" while keeping its original score.
vocab_scores[0] = ("<pad>", vocab_scores[0][1])

# Build the transformers-side tokenizer and processor, then save them to disk.
tokenizer = LasrTokenizer(vocab=vocab_scores)
tokenizer.add_eos_token = False
feature_extractor = LasrFeatureExtractor()
processor = LasrProcessor(feature_extractor, tokenizer)
processor.save_pretrained("lasr_test")
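# Optional round-trip sanity check (a minimal sketch, assuming LasrProcessor
# follows the standard ProcessorMixin API, i.e. from_pretrained and a
# .tokenizer attribute): reload the saved processor and confirm the reloaded
# tokenizer encodes identically to the in-memory one.
reloaded = LasrProcessor.from_pretrained("lasr_test")
assert reloaded.tokenizer.encode("hello world") == tokenizer.encode("hello world")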
# Load the same model with the sentencepiece reference implementation and
# spot-check that both tokenizers agree on a simple input, with and without
# a leading space.
sp_tokenizer = sentencepiece.SentencePieceProcessor()
sp_tokenizer.load(path)

assert tokenizer.encode('hello world') == sp_tokenizer.EncodeAsIds('hello world')
assert tokenizer.encode(' hello world') == sp_tokenizer.EncodeAsIds(' hello world')
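# An illustrative extra check (not in the original script; it assumes the
# surface pieces align the same way the ids do): tokenize is the standard
# PreTrainedTokenizer API, and EncodeAsPieces is sentencepiece's piece-level
# counterpart of EncodeAsIds.
assert tokenizer.tokenize('hello world') == sp_tokenizer.EncodeAsPieces('hello world')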
# Exhaustive comparison over the multilingual XNLI validation premises.
xnli = load_dataset("xnli", "all_languages", split="validation")

def verify(lang, text):
    encoded_original = sp_tokenizer.EncodeAsIds(text)
    encoded_fast = tokenizer.encode(text)
    assert encoded_fast == encoded_original, f"Fast encode error: {lang} – {text}"

for p in tqdm(xnli["premise"]):
    for lang, text in p.items():
        # for English, compare on the raw text
        if lang == "en":
            verify(lang, text)
        else:
            # for other languages, strip accents and problematic unicode
            # before comparing
            text = unicodedata.normalize('NFKD', text)
            # Remove combining (accent) characters left by NFKD decomposition
            text = ''.join(c for c in text if not unicodedata.combining(c))
            # Remove zero-width characters and other problematic unicode characters
            text = re.sub(r'[\u200b\u200c\u200d\u200f\ufeff]', '', text)
            verify(lang, text)
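# A hedged sketch of the decode direction (not part of the original comparison;
# skip_special_tokens here is an assumption about how LasrTokenizer renders the
# injected <pad> token): ids decoded by both implementations should match.
ids = sp_tokenizer.EncodeAsIds("hello world")
assert tokenizer.decode(ids, skip_special_tokens=True) == sp_tokenizer.DecodeIds(ids)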