Created: July 18, 2024 13:02
A set of scripts for training a small tokenizer in a new language, merging the small tokenizer with an existing one, and saving the combined tokenizer together with a resized model.
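Roughly, the pieces below fit together like this. A minimal sketch of the full flow; the directory names and the subprocess wrapper are placeholders, not part of the gist:

# Hypothetical end-to-end driver; every path below is a placeholder.
import subprocess
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Train a small tokenizer for the new language (see the notebook below) and save it,
#    e.g. tokenizer.save_pretrained("hebrew-14k").

# 2. Merge its vocab.json/merges.txt with the base tokenizer's files.
subprocess.run([
    "python", "combine_tokenizers.py",
    "--tokenizer1", "./base-tokenizer",
    "--tokenizer2", "./hebrew-14k",
    "--save_dir", "./combined-tokenizer",
], check=True)

# 3. Resize the base model's embeddings to the combined vocabulary and save both.
tokenizer = AutoTokenizer.from_pretrained("./combined-tokenizer/tokenizer")
model = AutoModelForCausalLM.from_pretrained("./SmolLM-135M")
model.resize_token_embeddings(len(tokenizer))
model.save_pretrained("./resized-model")
tokenizer.save_pretrained("./resized-model")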
"""
Given two tokenizers, combine them and create a new tokenizer
Usage: python combine_tokenizers.py --tokenizer1 ../config/en/roberta_8 --tokenizer2 ../config/hi/roberta_8 --save_dir ../config/en/en_hi/roberta_8
Source: https://github.com/huggingface/tokenizers/issues/690#issuecomment-830665989
"""
import argparse
import json
import os

from transformers import AutoTokenizer


def combine_tokenizers(args):
    # Load both vocab files, take the union, and store it
    with open(os.path.join(args.tokenizer1, 'vocab.json')) as f:
        json1 = json.load(f)
    with open(os.path.join(args.tokenizer2, 'vocab.json')) as f:
        json2 = json.load(f)

    # Create a new vocabulary: tokens from the first tokenizer keep their order
    new_vocab = {}
    idx = 0
    for word in json1.keys():
        if word not in new_vocab:
            new_vocab[word] = idx
            idx += 1

    # Append tokens that only appear in the second tokenizer
    for word in json2.keys():
        if word not in new_vocab:
            new_vocab[word] = idx
            idx += 1

    # Make the directory if necessary
    os.makedirs(args.save_dir, exist_ok=True)

    # Save the vocab
    with open(os.path.join(args.save_dir, 'vocab.json'), 'w') as fp:
        json.dump(new_vocab, fp, ensure_ascii=False)

    # Merge the two merges files. Duplicates are not handled here:
    # concatenate them, but skip the header line of the second file
    os.system('cat {} > {}'.format(os.path.join(args.tokenizer1, 'merges.txt'), os.path.join(args.save_dir, 'merges.txt')))
    os.system('tail -n +2 -q {} >> {}'.format(os.path.join(args.tokenizer2, 'merges.txt'), os.path.join(args.save_dir, 'merges.txt')))

    # Copy the remaining tokenizer files from the first tokenizer
    os.system('cp {} {}'.format(os.path.join(args.tokenizer1, 'special_tokens_map.json'), args.save_dir))
    os.system('cp {} {}'.format(os.path.join(args.tokenizer1, 'tokenizer_config.json'), args.save_dir))

    # Instantiate the combined tokenizer and save it in the fast-tokenizer format
    tokenizer = AutoTokenizer.from_pretrained(args.save_dir, use_fast=True)
    tokenizer.save_pretrained(args.save_dir + '/tokenizer')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--tokenizer1", type=str, required=True, help="Directory of the first (base) tokenizer")
    parser.add_argument("--tokenizer2", type=str, required=True, help="Directory of the second (new-language) tokenizer")
    parser.add_argument("--save_dir", type=str, required=True, help="Directory to write the combined tokenizer to")
    args = parser.parse_args()
    combine_tokenizers(args)


if __name__ == '__main__':
    main()
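Note that the script above shells out to cat, tail and cp, so it assumes a Unix-like environment. A hedged, portable sketch of the same concatenate-and-skip-header step, not part of the original script:

# Portable sketch of the cat/tail/cp calls; tokenizer1_dir, tokenizer2_dir and save_dir
# correspond to the directories the script receives as arguments.
import os
import shutil

def merge_aux_files(tokenizer1_dir, tokenizer2_dir, save_dir):
    # Concatenate merges.txt, skipping the header line of the second file.
    with open(os.path.join(save_dir, 'merges.txt'), 'w', encoding='utf-8') as out:
        with open(os.path.join(tokenizer1_dir, 'merges.txt'), encoding='utf-8') as first:
            out.write(first.read())
        with open(os.path.join(tokenizer2_dir, 'merges.txt'), encoding='utf-8') as second:
            out.writelines(second.readlines()[1:])
    # Copy the remaining config files from the first tokenizer.
    for name in ('special_tokens_map.json', 'tokenizer_config.json'):
        shutil.copy(os.path.join(tokenizer1_dir, name), save_dir)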
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cpu"  # use "cuda" (or "mps") for GPU usage, "cpu" for CPU usage

# Load the combined tokenizer (base SmolLM tokenizer merged with the added 14k Hebrew vocabulary)
tokenizer = AutoTokenizer.from_pretrained("./SmolLM-tokenizer-with-added-hebrew-14k")

# For multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained("./SmolLM-135M").to(device)

# Grow the token embedding matrix (and output head) to match the combined vocabulary
model.resize_token_embeddings(len(tokenizer))

# Quick smoke test: generate from a short prompt with the resized model
inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))

# Save the resized model together with the combined tokenizer
model.save_pretrained("./Heb-SmolLM-135M")
tokenizer.save_pretrained("./Heb-SmolLM-135M")
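After resize_token_embeddings(len(tokenizer)), the newly added embedding rows carry no learned information about the new tokens, so the combined model still needs further training. A common heuristic, sketched here as an assumption rather than something this gist does, is to initialize the new rows to the mean of the original embeddings before that training:

# Hedged sketch (not in the original script): capture the old vocabulary size before resizing,
# then set each newly added input-embedding row to the mean of the original rows.
import torch

old_vocab_size = model.get_input_embeddings().weight.shape[0]  # before resize_token_embeddings()
model.resize_token_embeddings(len(tokenizer))

with torch.no_grad():
    input_emb = model.get_input_embeddings().weight  # shape: (len(tokenizer), hidden_size)
    num_new = input_emb.shape[0] - old_vocab_size
    if num_new > 0:
        input_emb[old_vocab_size:] = input_emb[:old_vocab_size].mean(dim=0)
        # If the model's output head is untied, its new rows could be initialized the same way.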
# Training a new tokenizer from an old one

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

!pip install --upgrade datasets evaluate transformers sentencepiece tokenizers accelerate
#!apt install git-lfs

Output: all requirements already satisfied; accelerate upgraded from 0.19.0 to 0.21.0.

You will need to set up git; adapt your email and name in the following cell.

!git config --global user.email "[email protected]"
!git config --global user.name "Doron Adler"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

from huggingface_hub import notebook_login

notebook_login()

from datasets import load_dataset

# This can take a few minutes to load, so grab a coffee or tea while you wait!
#raw_datasets = load_dataset("Norod78/hewiki-20220901-articles-dataset")
raw_datasets = load_dataset("Norod78/Hebrew-corpus-other")

Output: downloads about 2.8 GB of data files and generates a train split with 2,188,612 examples.

raw_datasets['train']

Output:
Dataset({
    features: ['text'],
    num_rows: 2188612
})

raw_datasets = raw_datasets['train'].train_test_split(test_size=0.02, seed=42)

print(raw_datasets['test'])

Output:
Dataset({
    features: ['text'],
    num_rows: 43773
})

raw_datasets

Output:
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2144839
    })
    test: Dataset({
        features: ['text'],
        num_rows: 43773
    })
})

print(raw_datasets["train"][656645]["text"])

Output: a long Hebrew social-media sample from the training split (several tweets concatenated into one document).

print(raw_datasets["test"][6546]["text"])

Output: a similar long Hebrew social-media sample, this time from the test split.

def get_training_corpus():
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        yield samples["text"]

#def get_training_corpus():
#    for i in range(0, len(raw_datasets["test"]), 1000):
#        yield raw_datasets["test"][i : i + 1000]["text"]

from transformers import AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

example = '''שלום לכולם:
 """האיש האחרון עליי אדמות ישב לבד בחדרו כשלפתע."""
 Hello world'''
print(len(old_tokenizer.tokenize(example)))

Output: 78

tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), 14000) #50000

example = '''שלום לכולם:
 """האיש האחרון עליי אדמות ישב לבד בחדרו כשלפתע."""
 Hello world'''
tokens = tokenizer.tokenize(example)
print(len(tokens))
print(len(old_tokenizer.tokenize(example)))

Output:
29
78

#tokenizer.save_pretrained("gpt2-tokenizer-with-added-hebrew-14k")
tokenizer.save_pretrained("hebrew-14k")

Output:
('hebrew-14k/tokenizer_config.json',
 'hebrew-14k/special_tokens_map.json',
 'hebrew-14k/vocab.json',
 'hebrew-14k/merges.txt',
 'hebrew-14k/added_tokens.json',
 'hebrew-14k/tokenizer.json')

#tokenizer.push_to_hub("Norod78/gpt2-tokenizer-with-added-hebrew-14k")
#tokenizer.push_to_hub("Norod78/hebrew-14k")

(The stored output of this cell is from an earlier run: an LFS upload progress widget and a CommitInfo for an "Upload tokenizer" commit to https://huggingface.co/Norod78/llama-hebrew-tokenizer-20k.)

# Replace "huggingface-course" below with your actual namespace to use your own tokenizer
#tokenizer = AutoTokenizer.from_pretrained("Norod78/gpt-j-hebrew-tokenizer")

from transformers import AutoTokenizer
tokenizer_heb = AutoTokenizer.from_pretrained("./hebrew-14k")
tokenizer_eng = AutoTokenizer.from_pretrained("openai-community/gpt2")
prompt_text='''מודל ראשון בגודל 6-מיליארד פרמטרים מתאמן כרגע על חלק מהדאטסטים שהגבתם, עכשיו כשהמודל על האש אני אתפנה לענות לכולם. מתנצל על העיכוב, קיבלתי המון הודעות ולא ציפיתי לכזו הענות, אתם אדירים!
שלב הבא: להרכיב דאטהסט אחד ענק מכל הרעיונות והלינקים שצירפתם בשביל האימון המרכזי.'''
prompt_length = len(prompt_text)
encoded_prompt_heb = tokenizer_heb.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
num_of_tokenz_heb = encoded_prompt_heb.size()[-1]
print(f"Hebrew tokenizer: Tokens = {num_of_tokenz_heb} length = {prompt_length}")

encoded_prompt_eng = tokenizer_eng.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
num_of_tokenz_eng = encoded_prompt_eng.size()[-1]
print(f"English tokenizer: Tokens = {num_of_tokenz_eng} length = {prompt_length}")

Output:
Hebrew tokenizer: Tokens = 83 length = 267
English tokenizer: Tokens = 321 length = 267

decoded_text = tokenizer_heb.decode(encoded_prompt_heb[-1])
#assert decoded_text == prompt_text

decoded_text

Output: the Hebrew prompt above, recovered from its token ids.

(Notebook metadata: Colab, Python 3 kernel, Python 3.10.14.)