Unigram Tokenizer Tutorial from Hugging Face
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "1ddf9f86", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "corpus = [\n", | |
| " \"This is the Hugging Face Course.\",\n", | |
| " \"This chapter is about tokenization.\",\n", | |
| " \"This section shows several tokenizer algorithms.\",\n", | |
| " \"Hopefully, you will be able to understand how they are trained and generate tokens.\",\n", | |
| "]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "eab45427", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "2023-08-30 10:25:33.079067: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", | |
| "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "50f14aa634744fa7a997fd55af6d0cc5", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "Downloading: 0%| | 0.00/760 [00:00<?, ?B/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "21c4921369f84c14b187b9bfec36d933", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "Downloading: 0%| | 0.00/798k [00:00<?, ?B/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "484f586a5acb4d7da04d7a3f10cc99c1", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "Downloading: 0%| | 0.00/1.38M [00:00<?, ?B/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "from transformers import AutoTokenizer\n", | |
| "\n", | |
| "tokenizer = AutoTokenizer.from_pretrained(\"xlnet-base-cased\")" | |
| ] | |
| }, | |
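| { | |
| "cell_type": "markdown", | |
| "id": "a1f05c9e", | |
| "metadata": {}, | |
| "source": [ | |
| "XLNet's tokenizer is a SentencePiece-style Unigram model, and its pre-tokenizer replaces each space with a `▁` marker attached to the following word. As a quick, illustrative check (the example sentence here is arbitrary), the next cell shows the `(word, offsets)` pairs it produces; the training code below relies on exactly this call." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "b2d416aa", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Illustrative peek at the pre-tokenizer (arbitrary example sentence): it prepends\n", | |
| "# '▁' to each word in place of the preceding space and returns (word, (start, end)).\n", | |
| "tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hello, how are you?\")" | |
| ] | |
| }, | |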
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "534bc241", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "defaultdict(int,\n", | |
| " {'▁This': 3,\n", | |
| " '▁is': 2,\n", | |
| " '▁the': 1,\n", | |
| " '▁Hugging': 1,\n", | |
| " '▁Face': 1,\n", | |
| " '▁Course.': 1,\n", | |
| " '▁chapter': 1,\n", | |
| " '▁about': 1,\n", | |
| " '▁tokenization.': 1,\n", | |
| " '▁section': 1,\n", | |
| " '▁shows': 1,\n", | |
| " '▁several': 1,\n", | |
| " '▁tokenizer': 1,\n", | |
| " '▁algorithms.': 1,\n", | |
| " '▁Hopefully,': 1,\n", | |
| " '▁you': 1,\n", | |
| " '▁will': 1,\n", | |
| " '▁be': 1,\n", | |
| " '▁able': 1,\n", | |
| " '▁to': 1,\n", | |
| " '▁understand': 1,\n", | |
| " '▁how': 1,\n", | |
| " '▁they': 1,\n", | |
| " '▁are': 1,\n", | |
| " '▁trained': 1,\n", | |
| " '▁and': 1,\n", | |
| " '▁generate': 1,\n", | |
| " '▁tokens.': 1})" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "from collections import defaultdict\n", | |
| "\n", | |
| "word_freqs = defaultdict(int)\n", | |
| "for text in corpus:\n", | |
| " words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n", | |
| " new_words = [word for word, offset in words_with_offsets]\n", | |
| " for word in new_words:\n", | |
| " word_freqs[word] += 1\n", | |
| "\n", | |
| "word_freqs" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "id": "328b80c2", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('▁t', 7),\n", | |
| " ('is', 5),\n", | |
| " ('er', 5),\n", | |
| " ('▁a', 5),\n", | |
| " ('▁to', 4),\n", | |
| " ('to', 4),\n", | |
| " ('en', 4),\n", | |
| " ('▁T', 3),\n", | |
| " ('▁Th', 3),\n", | |
| " ('▁Thi', 3)]" | |
| ] | |
| }, | |
| "execution_count": 21, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "char_freqs = defaultdict(int)\n", | |
| "subwords_freqs = defaultdict(int)\n", | |
| "for word, freq in word_freqs.items():\n", | |
| " for i in range(len(word)):\n", | |
| " char_freqs[word[i]] += freq\n", | |
| " # Loop through the subwords of length at least 2\n", | |
| " for j in range(i + 2, len(word) + 1):\n", | |
| " subwords_freqs[word[i:j]] += freq\n", | |
| "\n", | |
| "# Sort subwords by frequency\n", | |
| "sorted_subwords = sorted(subwords_freqs.items(), key=lambda x: x[1], reverse=True)\n", | |
| "sorted_subwords[:10]\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "id": "d70bb8d1", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "token_freqs = list(char_freqs.items()) + sorted_subwords[: 300 - len(char_freqs)]\n", | |
| "token_freqs = {token: freq for token, freq in token_freqs}" | |
| ] | |
| }, | |
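| { | |
| "cell_type": "markdown", | |
| "id": "c3e52b17", | |
| "metadata": {}, | |
| "source": [ | |
| "The initial vocabulary is every single character plus the most frequent subwords, up to 300 tokens in total. Keeping all characters matters: it guarantees that any word can still be segmented after pruning. A small sanity check of that assumption:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "d4a6f3b8", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Every individual character must be present in the starting vocabulary, otherwise\n", | |
| "# some words could never be segmented at all; the remaining slots hold the most\n", | |
| "# frequent multi-character subwords.\n", | |
| "assert all(char in token_freqs for char in char_freqs)\n", | |
| "len(token_freqs)" | |
| ] | |
| }, | |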
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "id": "ec931454", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'▁': 31,\n", | |
| " 'T': 3,\n", | |
| " 'h': 9,\n", | |
| " 'i': 13,\n", | |
| " 's': 13,\n", | |
| " 't': 14,\n", | |
| " 'e': 21,\n", | |
| " 'H': 2,\n", | |
| " 'u': 6,\n", | |
| " 'g': 5,\n", | |
| " 'n': 11,\n", | |
| " 'F': 1,\n", | |
| " 'a': 12,\n", | |
| " 'c': 3,\n", | |
| " 'C': 1,\n", | |
| " 'o': 13,\n", | |
| " 'r': 9,\n", | |
| " '.': 4,\n", | |
| " 'p': 2,\n", | |
| " 'b': 3,\n", | |
| " 'k': 3,\n", | |
| " 'z': 2,\n", | |
| " 'w': 3,\n", | |
| " 'v': 1,\n", | |
| " 'l': 7,\n", | |
| " 'm': 1,\n", | |
| " 'f': 1,\n", | |
| " 'y': 3,\n", | |
| " ',': 1,\n", | |
| " 'd': 4,\n", | |
| " '▁t': 7,\n", | |
| " 'is': 5,\n", | |
| " 'er': 5,\n", | |
| " '▁a': 5,\n", | |
| " '▁to': 4,\n", | |
| " 'to': 4,\n", | |
| " 'en': 4,\n", | |
| " '▁T': 3,\n", | |
| " '▁Th': 3,\n", | |
| " '▁Thi': 3,\n", | |
| " '▁This': 3,\n", | |
| " 'Th': 3,\n", | |
| " 'Thi': 3,\n", | |
| " 'This': 3,\n", | |
| " 'hi': 3,\n", | |
| " 'his': 3,\n", | |
| " 'th': 3,\n", | |
| " 'ou': 3,\n", | |
| " 'se': 3,\n", | |
| " '▁tok': 3,\n", | |
| " '▁toke': 3,\n", | |
| " '▁token': 3,\n", | |
| " 'tok': 3,\n", | |
| " 'toke': 3,\n", | |
| " 'token': 3,\n", | |
| " 'ok': 3,\n", | |
| " 'oke': 3,\n", | |
| " 'oken': 3,\n", | |
| " 'ke': 3,\n", | |
| " 'ken': 3,\n", | |
| " '▁s': 3,\n", | |
| " 'ra': 3,\n", | |
| " 'nd': 3,\n", | |
| " '▁i': 2,\n", | |
| " '▁is': 2,\n", | |
| " '▁th': 2,\n", | |
| " '▁the': 2,\n", | |
| " 'the': 2,\n", | |
| " 'he': 2,\n", | |
| " '▁H': 2,\n", | |
| " 'in': 2,\n", | |
| " 'rs': 2,\n", | |
| " 'te': 2,\n", | |
| " '▁ab': 2,\n", | |
| " 'ab': 2,\n", | |
| " '▁tokeni': 2,\n", | |
| " '▁tokeniz': 2,\n", | |
| " 'tokeni': 2,\n", | |
| " 'tokeniz': 2,\n", | |
| " 'okeni': 2,\n", | |
| " 'okeniz': 2,\n", | |
| " 'keni': 2,\n", | |
| " 'keniz': 2,\n", | |
| " 'eni': 2,\n", | |
| " 'eniz': 2,\n", | |
| " 'ni': 2,\n", | |
| " 'niz': 2,\n", | |
| " 'iz': 2,\n", | |
| " 'at': 2,\n", | |
| " 'ti': 2,\n", | |
| " 'tio': 2,\n", | |
| " 'tion': 2,\n", | |
| " 'io': 2,\n", | |
| " 'ion': 2,\n", | |
| " 'on': 2,\n", | |
| " '▁se': 2,\n", | |
| " 'ho': 2,\n", | |
| " 'how': 2,\n", | |
| " 'ow': 2,\n", | |
| " 'era': 2,\n", | |
| " 'al': 2,\n", | |
| " 's.': 2,\n", | |
| " 'll': 2,\n", | |
| " 'an': 2,\n", | |
| " 'and': 2,\n", | |
| " 'ne': 2,\n", | |
| " '▁Hu': 1,\n", | |
| " '▁Hug': 1,\n", | |
| " '▁Hugg': 1,\n", | |
| " '▁Huggi': 1,\n", | |
| " '▁Huggin': 1,\n", | |
| " '▁Hugging': 1,\n", | |
| " 'Hu': 1,\n", | |
| " 'Hug': 1,\n", | |
| " 'Hugg': 1,\n", | |
| " 'Huggi': 1,\n", | |
| " 'Huggin': 1,\n", | |
| " 'Hugging': 1,\n", | |
| " 'ug': 1,\n", | |
| " 'ugg': 1,\n", | |
| " 'uggi': 1,\n", | |
| " 'uggin': 1,\n", | |
| " 'ugging': 1,\n", | |
| " 'gg': 1,\n", | |
| " 'ggi': 1,\n", | |
| " 'ggin': 1,\n", | |
| " 'gging': 1,\n", | |
| " 'gi': 1,\n", | |
| " 'gin': 1,\n", | |
| " 'ging': 1,\n", | |
| " 'ing': 1,\n", | |
| " 'ng': 1,\n", | |
| " '▁F': 1,\n", | |
| " '▁Fa': 1,\n", | |
| " '▁Fac': 1,\n", | |
| " '▁Face': 1,\n", | |
| " 'Fa': 1,\n", | |
| " 'Fac': 1,\n", | |
| " 'Face': 1,\n", | |
| " 'ac': 1,\n", | |
| " 'ace': 1,\n", | |
| " 'ce': 1,\n", | |
| " '▁C': 1,\n", | |
| " '▁Co': 1,\n", | |
| " '▁Cou': 1,\n", | |
| " '▁Cour': 1,\n", | |
| " '▁Cours': 1,\n", | |
| " '▁Course': 1,\n", | |
| " '▁Course.': 1,\n", | |
| " 'Co': 1,\n", | |
| " 'Cou': 1,\n", | |
| " 'Cour': 1,\n", | |
| " 'Cours': 1,\n", | |
| " 'Course': 1,\n", | |
| " 'Course.': 1,\n", | |
| " 'our': 1,\n", | |
| " 'ours': 1,\n", | |
| " 'ourse': 1,\n", | |
| " 'ourse.': 1,\n", | |
| " 'ur': 1,\n", | |
| " 'urs': 1,\n", | |
| " 'urse': 1,\n", | |
| " 'urse.': 1,\n", | |
| " 'rse': 1,\n", | |
| " 'rse.': 1,\n", | |
| " 'se.': 1,\n", | |
| " 'e.': 1,\n", | |
| " '▁c': 1,\n", | |
| " '▁ch': 1,\n", | |
| " '▁cha': 1,\n", | |
| " '▁chap': 1,\n", | |
| " '▁chapt': 1,\n", | |
| " '▁chapte': 1,\n", | |
| " '▁chapter': 1,\n", | |
| " 'ch': 1,\n", | |
| " 'cha': 1,\n", | |
| " 'chap': 1,\n", | |
| " 'chapt': 1,\n", | |
| " 'chapte': 1,\n", | |
| " 'chapter': 1,\n", | |
| " 'ha': 1,\n", | |
| " 'hap': 1,\n", | |
| " 'hapt': 1,\n", | |
| " 'hapte': 1,\n", | |
| " 'hapter': 1,\n", | |
| " 'ap': 1,\n", | |
| " 'apt': 1,\n", | |
| " 'apte': 1,\n", | |
| " 'apter': 1,\n", | |
| " 'pt': 1,\n", | |
| " 'pte': 1,\n", | |
| " 'pter': 1,\n", | |
| " 'ter': 1,\n", | |
| " '▁abo': 1,\n", | |
| " '▁abou': 1,\n", | |
| " '▁about': 1,\n", | |
| " 'abo': 1,\n", | |
| " 'abou': 1,\n", | |
| " 'about': 1,\n", | |
| " 'bo': 1,\n", | |
| " 'bou': 1,\n", | |
| " 'bout': 1,\n", | |
| " 'out': 1,\n", | |
| " 'ut': 1,\n", | |
| " '▁tokeniza': 1,\n", | |
| " '▁tokenizat': 1,\n", | |
| " '▁tokenizati': 1,\n", | |
| " '▁tokenizatio': 1,\n", | |
| " '▁tokenization': 1,\n", | |
| " '▁tokenization.': 1,\n", | |
| " 'tokeniza': 1,\n", | |
| " 'tokenizat': 1,\n", | |
| " 'tokenizati': 1,\n", | |
| " 'tokenizatio': 1,\n", | |
| " 'tokenization': 1,\n", | |
| " 'tokenization.': 1,\n", | |
| " 'okeniza': 1,\n", | |
| " 'okenizat': 1,\n", | |
| " 'okenizati': 1,\n", | |
| " 'okenizatio': 1,\n", | |
| " 'okenization': 1,\n", | |
| " 'okenization.': 1,\n", | |
| " 'keniza': 1,\n", | |
| " 'kenizat': 1,\n", | |
| " 'kenizati': 1,\n", | |
| " 'kenizatio': 1,\n", | |
| " 'kenization': 1,\n", | |
| " 'kenization.': 1,\n", | |
| " 'eniza': 1,\n", | |
| " 'enizat': 1,\n", | |
| " 'enizati': 1,\n", | |
| " 'enizatio': 1,\n", | |
| " 'enization': 1,\n", | |
| " 'enization.': 1,\n", | |
| " 'niza': 1,\n", | |
| " 'nizat': 1,\n", | |
| " 'nizati': 1,\n", | |
| " 'nizatio': 1,\n", | |
| " 'nization': 1,\n", | |
| " 'nization.': 1,\n", | |
| " 'iza': 1,\n", | |
| " 'izat': 1,\n", | |
| " 'izati': 1,\n", | |
| " 'izatio': 1,\n", | |
| " 'ization': 1,\n", | |
| " 'ization.': 1,\n", | |
| " 'za': 1,\n", | |
| " 'zat': 1,\n", | |
| " 'zati': 1,\n", | |
| " 'zatio': 1,\n", | |
| " 'zation': 1,\n", | |
| " 'zation.': 1,\n", | |
| " 'ati': 1,\n", | |
| " 'atio': 1,\n", | |
| " 'ation': 1,\n", | |
| " 'ation.': 1,\n", | |
| " 'tion.': 1,\n", | |
| " 'ion.': 1,\n", | |
| " 'on.': 1,\n", | |
| " 'n.': 1,\n", | |
| " '▁sec': 1,\n", | |
| " '▁sect': 1,\n", | |
| " '▁secti': 1,\n", | |
| " '▁sectio': 1,\n", | |
| " '▁section': 1,\n", | |
| " 'sec': 1,\n", | |
| " 'sect': 1,\n", | |
| " 'secti': 1,\n", | |
| " 'sectio': 1,\n", | |
| " 'section': 1,\n", | |
| " 'ec': 1,\n", | |
| " 'ect': 1,\n", | |
| " 'ecti': 1,\n", | |
| " 'ectio': 1,\n", | |
| " 'ection': 1,\n", | |
| " 'ct': 1,\n", | |
| " 'cti': 1,\n", | |
| " 'ctio': 1,\n", | |
| " 'ction': 1,\n", | |
| " '▁sh': 1,\n", | |
| " '▁sho': 1,\n", | |
| " '▁show': 1,\n", | |
| " '▁shows': 1,\n", | |
| " 'sh': 1,\n", | |
| " 'sho': 1,\n", | |
| " 'show': 1,\n", | |
| " 'shows': 1,\n", | |
| " 'hows': 1,\n", | |
| " 'ows': 1,\n", | |
| " 'ws': 1,\n", | |
| " '▁sev': 1,\n", | |
| " '▁seve': 1,\n", | |
| " '▁sever': 1,\n", | |
| " '▁severa': 1,\n", | |
| " '▁several': 1,\n", | |
| " 'sev': 1,\n", | |
| " 'seve': 1,\n", | |
| " 'sever': 1,\n", | |
| " 'severa': 1,\n", | |
| " 'several': 1}" | |
| ] | |
| }, | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "token_freqs" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "c8622b4f", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from math import log\n", | |
| "\n", | |
| "total_sum = sum([freq for token, freq in token_freqs.items()])\n", | |
| "model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}" | |
| ] | |
| }, | |
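| { | |
| "cell_type": "markdown", | |
| "id": "e5b7c429", | |
| "metadata": {}, | |
| "source": [ | |
| "Each token's value in `model` is its negative log probability, so the score of a segmentation is the sum of the scores of its tokens (equivalently, the product of their probabilities), and lower is better. For a reasonably frequent subword, keeping it whole should usually be cheaper than splitting it, as a quick comparison suggests:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "f6c8d53a", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Lower is better: the cost of '▁to' as one token vs. the cost of '▁t' + 'o'.\n", | |
| "print(model[\"▁to\"], \"vs\", model[\"▁t\"] + model[\"o\"])" | |
| ] | |
| }, | |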
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "65abfe44", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def encode_word(word, model):\n", | |
| " best_segmentations = [{\"start\": 0, \"score\": 1}] + [\n", | |
| " {\"start\": None, \"score\": None} for _ in range(len(word))\n", | |
| " ]\n", | |
| " for start_idx in range(len(word)):\n", | |
| " # This should be properly filled by the previous steps of the loop\n", | |
| " best_score_at_start = best_segmentations[start_idx][\"score\"]\n", | |
| " for end_idx in range(start_idx + 1, len(word) + 1):\n", | |
| " token = word[start_idx:end_idx]\n", | |
| " if token in model and best_score_at_start is not None:\n", | |
| " score = model[token] + best_score_at_start\n", | |
| " # If we have found a better segmentation ending at end_idx, we update\n", | |
| " if (\n", | |
| " best_segmentations[end_idx][\"score\"] is None\n", | |
| " or best_segmentations[end_idx][\"score\"] > score\n", | |
| " ):\n", | |
| " best_segmentations[end_idx] = {\"start\": start_idx, \"score\": score}\n", | |
| "\n", | |
| " segmentation = best_segmentations[-1]\n", | |
| " if segmentation[\"score\"] is None:\n", | |
| " # We did not find a tokenization of the word -> unknown\n", | |
| " return [\"<unk>\"], None\n", | |
| "\n", | |
| " score = segmentation[\"score\"]\n", | |
| " start = segmentation[\"start\"]\n", | |
| " end = len(word)\n", | |
| " tokens = []\n", | |
| " while start != 0:\n", | |
| " tokens.insert(0, word[start:end])\n", | |
| " next_start = best_segmentations[start][\"start\"]\n", | |
| " end = start\n", | |
| " start = next_start\n", | |
| " tokens.insert(0, word[start:end])\n", | |
| " return tokens, score" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "b3f4d087", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "(['H', 'o', 'p', 'e', 'f', 'u', 'll', 'y'], 41.5157494601402)\n", | |
| "(['This'], 6.288267030694535)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(encode_word(\"Hopefully\", model))\n", | |
| "print(encode_word(\"This\", model))" | |
| ] | |
| }, | |
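| { | |
| "cell_type": "markdown", | |
| "id": "a7d9e64b", | |
| "metadata": {}, | |
| "source": [ | |
| "One small detail of this implementation: the score returned by `encode_word` is the sum of the chosen tokens' scores plus the placeholder score of 1 stored for position 0 of `best_segmentations`, which is easy to verify:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "b8eaf75c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# The returned score is the initial placeholder score of 1 plus the sum of the\n", | |
| "# scores of the tokens on the best path.\n", | |
| "tokens, score = encode_word(\"Hopefully\", model)\n", | |
| "print(score, 1 + sum(model[t] for t in tokens))" | |
| ] | |
| }, | |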
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "d0e6991d", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def compute_loss(model):\n", | |
| " loss = 0\n", | |
| " for word, freq in word_freqs.items():\n", | |
| " _, word_loss = encode_word(word, model)\n", | |
| " loss += freq * word_loss\n", | |
| " return loss" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "8681ebbb", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "413.10377642940875" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "compute_loss(model)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "id": "2cd27f4c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import copy\n", | |
| "\n", | |
| "\n", | |
| "def compute_scores(model):\n", | |
| " scores = {}\n", | |
| " model_loss = compute_loss(model)\n", | |
| " for token, score in model.items():\n", | |
| " # We always keep tokens of length 1\n", | |
| " if len(token) == 1:\n", | |
| " continue\n", | |
| " model_without_token = copy.deepcopy(model)\n", | |
| " _ = model_without_token.pop(token)\n", | |
| " scores[token] = compute_loss(model_without_token) - model_loss\n", | |
| " return scores" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "id": "3a42bd3c", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "6.376412403623874\n", | |
| "0.0\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "scores = compute_scores(model)\n", | |
| "print(scores[\"ll\"])\n", | |
| "print(scores[\"his\"])" | |
| ] | |
| }, | |
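| { | |
| "cell_type": "markdown", | |
| "id": "c9fb086d", | |
| "metadata": {}, | |
| "source": [ | |
| "Removing `his` costs nothing because it never appears in the best segmentation of any word in this corpus (the pre-tokenized word `▁This` stays a single token), while `ll` does appear (in the best segmentation of `▁Hopefully,`), so removing it would make the corpus strictly more expensive to encode:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "d00c197e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# 'his' never shows up in a best segmentation ('▁This' stays whole), so dropping it\n", | |
| "# leaves the loss unchanged; 'll' does show up, so dropping it has a real cost.\n", | |
| "print(encode_word(\"▁This\", model))\n", | |
| "print(encode_word(\"▁Hopefully,\", model))" | |
| ] | |
| }, | |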
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "id": "53c09fbc", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "percent_to_remove = 0.1\n", | |
| "while len(model) > 100:\n", | |
| " scores = compute_scores(model)\n", | |
| " sorted_scores = sorted(scores.items(), key=lambda x: x[1])\n", | |
| " # Remove percent_to_remove tokens with the lowest scores.\n", | |
| " for i in range(int(len(model) * percent_to_remove)):\n", | |
| " _ = token_freqs.pop(sorted_scores[i][0])\n", | |
| "\n", | |
| " total_sum = sum([freq for token, freq in token_freqs.items()])\n", | |
| " model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}" | |
| ] | |
| }, | |
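| { | |
| "cell_type": "markdown", | |
| "id": "e11d2a8f", | |
| "metadata": {}, | |
| "source": [ | |
| "The loop stops once the vocabulary is no larger than 100 tokens. Because `compute_scores` skips length-1 tokens, single characters are never candidates for removal, so every word stays tokenizable. A quick look at what survived:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "f22e3b90", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Size of the pruned vocabulary and the ten most probable remaining tokens\n", | |
| "# (lowest negative log likelihood first).\n", | |
| "print(len(model))\n", | |
| "sorted(model.items(), key=lambda item: item[1])[:10]" | |
| ] | |
| }, | |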
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "id": "84633d43", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def tokenize(text, model):\n", | |
| " words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n", | |
| " pre_tokenized_text = [word for word, offset in words_with_offsets]\n", | |
| " encoded_words = [encode_word(word, model)[0] for word in pre_tokenized_text]\n", | |
| " return sum(encoded_words, [])\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "id": "de95cf77", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['▁This',\n", | |
| " '▁is',\n", | |
| " '▁the',\n", | |
| " '▁Hugging',\n", | |
| " '▁Face',\n", | |
| " '▁',\n", | |
| " 'c',\n", | |
| " 'ou',\n", | |
| " 'r',\n", | |
| " 's',\n", | |
| " 'e',\n", | |
| " '.']" | |
| ] | |
| }, | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "tokenize(\"This is the Hugging Face course.\", model)\n" | |
| ] | |
| }, | |
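| { | |
| "cell_type": "markdown", | |
| "id": "a33f4ca1", | |
| "metadata": {}, | |
| "source": [ | |
| "Lowercase `course` never occurred in the tiny training corpus, so the toy model has no good subwords for it and breaks it into very small pieces. For comparison (bearing in mind that the pretrained XLNet vocabulary is vastly larger, so its segmentation will look different), the loaded tokenizer can segment the same sentence:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "b44a5db2", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Same sentence through the pretrained XLNet Unigram tokenizer, for comparison with\n", | |
| "# the toy model trained above (its vocabulary is far larger, so results differ).\n", | |
| "tokenizer.tokenize(\"This is the Hugging Face course.\")" | |
| ] | |
| }, | |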
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "03270dbe", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.6" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |