Created: July 18, 2024 13:02
A set of scripts for training a small tokenizer in a new language, merging the small tokenizer with an existing one, and saving the combined tokenizer together with a resized model.
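Roughly, the pieces below fit together like this. A minimal sketch of the full flow; the directory names and the subprocess wrapper are placeholders, not part of the gist:

# Hypothetical end-to-end driver; every path below is a placeholder.
import subprocess
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Train a small tokenizer for the new language (see the notebook below) and save it,
#    e.g. tokenizer.save_pretrained("hebrew-14k").

# 2. Merge its vocab.json/merges.txt with the base tokenizer's files.
subprocess.run([
    "python", "combine_tokenizers.py",
    "--tokenizer1", "./base-tokenizer",
    "--tokenizer2", "./hebrew-14k",
    "--save_dir", "./combined-tokenizer",
], check=True)

# 3. Resize the base model's embeddings to the combined vocabulary and save both.
tokenizer = AutoTokenizer.from_pretrained("./combined-tokenizer/tokenizer")
model = AutoModelForCausalLM.from_pretrained("./SmolLM-135M")
model.resize_token_embeddings(len(tokenizer))
model.save_pretrained("./resized-model")
tokenizer.save_pretrained("./resized-model")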
"""
Given two tokenizers, combine them and create a new tokenizer
Usage: python combine_tokenizers.py --tokenizer1 ../config/en/roberta_8 --tokenizer2 ../config/hi/roberta_8 --save_dir ../config/en/en_hi/roberta_8
Source: https://github.com/huggingface/tokenizers/issues/690#issuecomment-830665989
"""
import argparse
import json
import os

from transformers import AutoTokenizer


def combine_tokenizers(args):
    # Load both vocab files, take the union, and store it
    with open(os.path.join(args.tokenizer1, 'vocab.json')) as f:
        json1 = json.load(f)
    with open(os.path.join(args.tokenizer2, 'vocab.json')) as f:
        json2 = json.load(f)

    # Create a new vocabulary: tokens from the first tokenizer keep their order
    new_vocab = {}
    idx = 0
    for word in json1.keys():
        if word not in new_vocab:
            new_vocab[word] = idx
            idx += 1

    # Append tokens that only appear in the second tokenizer
    for word in json2.keys():
        if word not in new_vocab:
            new_vocab[word] = idx
            idx += 1

    # Make the directory if necessary
    os.makedirs(args.save_dir, exist_ok=True)

    # Save the vocab
    with open(os.path.join(args.save_dir, 'vocab.json'), 'w') as fp:
        json.dump(new_vocab, fp, ensure_ascii=False)

    # Merge the two merges files. Duplicates are not handled here:
    # concatenate them, but skip the header line of the second file
    os.system('cat {} > {}'.format(os.path.join(args.tokenizer1, 'merges.txt'), os.path.join(args.save_dir, 'merges.txt')))
    os.system('tail -n +2 -q {} >> {}'.format(os.path.join(args.tokenizer2, 'merges.txt'), os.path.join(args.save_dir, 'merges.txt')))

    # Copy the remaining tokenizer files from the first tokenizer
    os.system('cp {} {}'.format(os.path.join(args.tokenizer1, 'special_tokens_map.json'), args.save_dir))
    os.system('cp {} {}'.format(os.path.join(args.tokenizer1, 'tokenizer_config.json'), args.save_dir))

    # Instantiate the combined tokenizer and save it in the fast-tokenizer format
    tokenizer = AutoTokenizer.from_pretrained(args.save_dir, use_fast=True)
    tokenizer.save_pretrained(args.save_dir + '/tokenizer')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--tokenizer1", type=str, required=True, help="Directory of the first (base) tokenizer")
    parser.add_argument("--tokenizer2", type=str, required=True, help="Directory of the second (new-language) tokenizer")
    parser.add_argument("--save_dir", type=str, required=True, help="Directory to write the combined tokenizer to")
    args = parser.parse_args()
    combine_tokenizers(args)


if __name__ == '__main__':
    main()
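Note that the script above shells out to cat, tail and cp, so it assumes a Unix-like environment. A hedged, portable sketch of the same concatenate-and-skip-header step, not part of the original script:

# Portable sketch of the cat/tail/cp calls; tokenizer1_dir, tokenizer2_dir and save_dir
# correspond to the directories the script receives as arguments.
import os
import shutil

def merge_aux_files(tokenizer1_dir, tokenizer2_dir, save_dir):
    # Concatenate merges.txt, skipping the header line of the second file.
    with open(os.path.join(save_dir, 'merges.txt'), 'w', encoding='utf-8') as out:
        with open(os.path.join(tokenizer1_dir, 'merges.txt'), encoding='utf-8') as first:
            out.write(first.read())
        with open(os.path.join(tokenizer2_dir, 'merges.txt'), encoding='utf-8') as second:
            out.writelines(second.readlines()[1:])
    # Copy the remaining config files from the first tokenizer.
    for name in ('special_tokens_map.json', 'tokenizer_config.json'):
        shutil.copy(os.path.join(tokenizer1_dir, name), save_dir)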
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cpu"  # use "cuda" (or "mps") for GPU usage, "cpu" for CPU usage

# Load the combined tokenizer (base SmolLM tokenizer merged with the added 14k Hebrew vocabulary)
tokenizer = AutoTokenizer.from_pretrained("./SmolLM-tokenizer-with-added-hebrew-14k")

# For multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained("./SmolLM-135M").to(device)

# Grow the token embedding matrix (and output head) to match the combined vocabulary
model.resize_token_embeddings(len(tokenizer))

# Quick smoke test: generate from a short prompt with the resized model
inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))

# Save the resized model together with the combined tokenizer
model.save_pretrained("./Heb-SmolLM-135M")
tokenizer.save_pretrained("./Heb-SmolLM-135M")
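After resize_token_embeddings(len(tokenizer)), the newly added embedding rows carry no learned information about the new tokens, so the combined model still needs further training. A common heuristic, sketched here as an assumption rather than something this gist does, is to initialize the new rows to the mean of the original embeddings before that training:

# Hedged sketch (not in the original script): capture the old vocabulary size before resizing,
# then set each newly added input-embedding row to the mean of the original rows.
import torch

old_vocab_size = model.get_input_embeddings().weight.shape[0]  # before resize_token_embeddings()
model.resize_token_embeddings(len(tokenizer))

with torch.no_grad():
    input_emb = model.get_input_embeddings().weight  # shape: (len(tokenizer), hidden_size)
    num_new = input_emb.shape[0] - old_vocab_size
    if num_new > 0:
        input_emb[old_vocab_size:] = input_emb[:old_vocab_size].mean(dim=0)
        # If the model's output head is untied, its new rows could be initialized the same way.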
# Training a new tokenizer from an old one

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

!pip install --upgrade datasets evaluate transformers sentencepiece tokenizers accelerate
#!apt install git-lfs

Output: all requirements already satisfied; accelerate upgraded from 0.19.0 to 0.21.0.

You will need to set up git; adapt your email and name in the following cell.

!git config --global user.email "[email protected]"
!git config --global user.name "Doron Adler"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

from huggingface_hub import notebook_login

notebook_login()

from datasets import load_dataset

# This can take a few minutes to load, so grab a coffee or tea while you wait!
#raw_datasets = load_dataset("Norod78/hewiki-20220901-articles-dataset")
raw_datasets = load_dataset("Norod78/Hebrew-corpus-other")

Output: downloads about 2.8 GB of data files and generates a train split with 2,188,612 examples.

raw_datasets['train']

Output:
Dataset({
    features: ['text'],
    num_rows: 2188612
})

raw_datasets = raw_datasets['train'].train_test_split(test_size=0.02, seed=42)

print(raw_datasets['test'])

Output:
Dataset({
    features: ['text'],
    num_rows: 43773
})

raw_datasets

Output:
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2144839
    })
    test: Dataset({
        features: ['text'],
        num_rows: 43773
    })
})

print(raw_datasets["train"][656645]["text"])

Output: a long Hebrew social-media sample from the training split (several tweets concatenated into one document).

print(raw_datasets["test"][6546]["text"])

Output: a similar long Hebrew social-media sample, this time from the test split.

def get_training_corpus():
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        yield samples["text"]

#def get_training_corpus():
#    for i in range(0, len(raw_datasets["test"]), 1000):
#        yield raw_datasets["test"][i : i + 1000]["text"]

from transformers import AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

example = '''שלום לכולם:
 """האיש האחרון עליי אדמות ישב לבד בחדרו כשלפתע."""
 Hello world'''
print(len(old_tokenizer.tokenize(example)))

Output: 78

tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), 14000) #50000

example = '''שלום לכולם:
 """האיש האחרון עליי אדמות ישב לבד בחדרו כשלפתע."""
 Hello world'''
tokens = tokenizer.tokenize(example)
print(len(tokens))
print(len(old_tokenizer.tokenize(example)))

Output:
29
78

#tokenizer.save_pretrained("gpt2-tokenizer-with-added-hebrew-14k")
tokenizer.save_pretrained("hebrew-14k")

Output:
('hebrew-14k/tokenizer_config.json',
 'hebrew-14k/special_tokens_map.json',
 'hebrew-14k/vocab.json',
 'hebrew-14k/merges.txt',
 'hebrew-14k/added_tokens.json',
 'hebrew-14k/tokenizer.json')

#tokenizer.push_to_hub("Norod78/gpt2-tokenizer-with-added-hebrew-14k")
#tokenizer.push_to_hub("Norod78/hebrew-14k")

(The stored output of this cell is from an earlier run: an LFS upload progress widget and a CommitInfo for an "Upload tokenizer" commit to https://huggingface.co/Norod78/llama-hebrew-tokenizer-20k.)

# Replace "huggingface-course" below with your actual namespace to use your own tokenizer
#tokenizer = AutoTokenizer.from_pretrained("Norod78/gpt-j-hebrew-tokenizer")

from transformers import AutoTokenizer
tokenizer_heb = AutoTokenizer.from_pretrained("./hebrew-14k")
tokenizer_eng = AutoTokenizer.from_pretrained("openai-community/gpt2")
prompt_text='''מודל ראשון בגודל 6-מיליארד פרמטרים מתאמן כרגע על חלק מהדאטסטים שהגבתם, עכשיו כשהמודל על האש אני אתפנה לענות לכולם. מתנצל על העיכוב, קיבלתי המון הודעות ולא ציפיתי לכזו הענות, אתם אדירים!
שלב הבא: להרכיב דאטהסט אחד ענק מכל הרעיונות והלינקים שצירפתם בשביל האימון המרכזי.'''
prompt_length = len(prompt_text)
encoded_prompt_heb = tokenizer_heb.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
num_of_tokenz_heb = encoded_prompt_heb.size()[-1]
print(f"Hebrew tokenizer: Tokens = {num_of_tokenz_heb} length = {prompt_length}")

encoded_prompt_eng = tokenizer_eng.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
num_of_tokenz_eng = encoded_prompt_eng.size()[-1]
print(f"English tokenizer: Tokens = {num_of_tokenz_eng} length = {prompt_length}")

Output:
Hebrew tokenizer: Tokens = 83 length = 267
English tokenizer: Tokens = 321 length = 267

decoded_text = tokenizer_heb.decode(encoded_prompt_heb[-1])
#assert decoded_text == prompt_text

decoded_text

Output: the Hebrew prompt above, recovered from its token ids.

(Notebook metadata: Colab, Python 3 kernel, Python 3.10.14.)