Compare Hebrew tokenization efficiency across various tokenizers (the lower the token count, the better).
from transformers import AutoTokenizer
from transformers import LlamaTokenizerFast

#tokenizer_yam = AutoTokenizer.from_pretrained("yam-peleg/Hebrew-Gemma-11B-V2")
tokenizer_grok = LlamaTokenizerFast.from_pretrained('Xenova/grok-1-tokenizer')
tokenizer_gemma = AutoTokenizer.from_pretrained("google/gemma-7b-it")
tokenizer_aya101 = AutoTokenizer.from_pretrained("CohereForAI/aya-101")
tokenizer_gpt2 = AutoTokenizer.from_pretrained("gpt2")

# Hebrew test text (a social-media-style post about training a 6-billion-parameter model).
prompt_text='''מודל ראשון בגודל 6-מיליארד פרמטרים מתאמן כרגע על חלק מהדאטסטים שהגבתם, עכשיו כשהמודל על האש אני אתפנה לענות לכולם. מתנצל על העיכוב, קיבלתי המון הודעות ולא ציפיתי לכזו הענות, אתם אדירים!
שלב הבא: להרכיב דאטהסט אחד ענק מכל הרעיונות והלינקים שצירפתם בשביל האימון המרכזי.'''

def test_tokenizer(tokenizer, prompt_text):
    # Encode without special tokens and return the number of tokens produced.
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    num_of_tokens = encoded_prompt.size()[-1]
    return num_of_tokens

prompt_length = len(prompt_text)
print(f'\nPrompt length: {prompt_length} \n')

#print(f'Yam tokenizer: {test_tokenizer(tokenizer_yam, prompt_text)}') # Same tokenizer as Gemma
print(f'Grok tokenizer: {test_tokenizer(tokenizer_grok, prompt_text)}')
print(f'Gemma tokenizer: {test_tokenizer(tokenizer_gemma, prompt_text)}')
print(f'Aya-101 tokenizer: {test_tokenizer(tokenizer_aya101, prompt_text)}')
print(f'GPT-2 tokenizer: {test_tokenizer(tokenizer_gpt2, prompt_text)}')

#######################
# Prompt length: 267
#
# Grok tokenizer: 300
# Gemma tokenizer: 121
# Aya-101 tokenizer: 109
# GPT-2 tokenizer: 321
#######################
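
# Optional follow-up (a sketch, not part of the original results above):
# normalize each token count by the prompt length to get characters per token,
# which makes the comparison independent of the prompt's size. Fewer tokens,
# i.e. more characters per token, means the tokenizer packs Hebrew text more
# efficiently. Uses only the tokenizers and helpers defined earlier.
for name, tok in [('Grok', tokenizer_grok), ('Gemma', tokenizer_gemma),
                  ('Aya-101', tokenizer_aya101), ('GPT-2', tokenizer_gpt2)]:
    num_tokens = test_tokenizer(tok, prompt_text)
    print(f'{name}: {num_tokens} tokens, {prompt_length / num_tokens:.2f} chars/token')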