Unigram Tokenizer Tutorial from the Hugging Face Course
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "1ddf9f86", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"corpus = [\n", | |
" \"This is the Hugging Face Course.\",\n", | |
" \"This chapter is about tokenization.\",\n", | |
" \"This section shows several tokenizer algorithms.\",\n", | |
" \"Hopefully, you will be able to understand how they are trained and generate tokens.\",\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "eab45427", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2023-08-30 10:25:33.079067: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", | |
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "50f14aa634744fa7a997fd55af6d0cc5", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Downloading: 0%| | 0.00/760 [00:00<?, ?B/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "21c4921369f84c14b187b9bfec36d933", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Downloading: 0%| | 0.00/798k [00:00<?, ?B/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "484f586a5acb4d7da04d7a3f10cc99c1", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Downloading: 0%| | 0.00/1.38M [00:00<?, ?B/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"from transformers import AutoTokenizer\n", | |
"\n", | |
"tokenizer = AutoTokenizer.from_pretrained(\"xlnet-base-cased\")" | |
] | |
}, | |
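{
"cell_type": "markdown",
"id": "ad0ed001",
"metadata": {},
"source": [
"Before counting word frequencies, it helps to see what the XLNet backend pre-tokenizer actually returns. The sentence below is just an arbitrary example; the call is the same `pre_tokenize_str` used in the next cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad0ed002",
"metadata": {},
"outputs": [],
"source": [
"# The pre-tokenizer marks word starts with ▁ and returns (word, (start, end)) pairs\n",
"tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hopefully, you will be able to understand.\")"
]
},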
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "534bc241", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"defaultdict(int,\n", | |
" {'▁This': 3,\n", | |
" '▁is': 2,\n", | |
" '▁the': 1,\n", | |
" '▁Hugging': 1,\n", | |
" '▁Face': 1,\n", | |
" '▁Course.': 1,\n", | |
" '▁chapter': 1,\n", | |
" '▁about': 1,\n", | |
" '▁tokenization.': 1,\n", | |
" '▁section': 1,\n", | |
" '▁shows': 1,\n", | |
" '▁several': 1,\n", | |
" '▁tokenizer': 1,\n", | |
" '▁algorithms.': 1,\n", | |
" '▁Hopefully,': 1,\n", | |
" '▁you': 1,\n", | |
" '▁will': 1,\n", | |
" '▁be': 1,\n", | |
" '▁able': 1,\n", | |
" '▁to': 1,\n", | |
" '▁understand': 1,\n", | |
" '▁how': 1,\n", | |
" '▁they': 1,\n", | |
" '▁are': 1,\n", | |
" '▁trained': 1,\n", | |
" '▁and': 1,\n", | |
" '▁generate': 1,\n", | |
" '▁tokens.': 1})" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
],
"source": [
"from collections import defaultdict\n",
"\n",
"# Count how often each pre-tokenized word appears in the corpus (the pre-tokenizer\n",
"# marks word starts with ▁ and keeps punctuation attached)\n",
"word_freqs = defaultdict(int)\n",
"for text in corpus:\n", | |
" words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n", | |
" new_words = [word for word, offset in words_with_offsets]\n", | |
" for word in new_words:\n", | |
" word_freqs[word] += 1\n", | |
"\n", | |
"word_freqs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "328b80c2", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('▁t', 7),\n", | |
" ('is', 5),\n", | |
" ('er', 5),\n", | |
" ('▁a', 5),\n", | |
" ('▁to', 4),\n", | |
" ('to', 4),\n", | |
" ('en', 4),\n", | |
" ('▁T', 3),\n", | |
" ('▁Th', 3),\n", | |
" ('▁Thi', 3)]" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"char_freqs = defaultdict(int)\n", | |
"subwords_freqs = defaultdict(int)\n", | |
"for word, freq in word_freqs.items():\n", | |
" for i in range(len(word)):\n", | |
" char_freqs[word[i]] += freq\n", | |
" # Loop through the subwords of length at least 2\n", | |
" for j in range(i + 2, len(word) + 1):\n", | |
" subwords_freqs[word[i:j]] += freq\n", | |
"\n", | |
"# Sort subwords by frequency\n", | |
"sorted_subwords = sorted(subwords_freqs.items(), key=lambda x: x[1], reverse=True)\n", | |
"sorted_subwords[:10]\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"id": "d70bb8d1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"token_freqs = list(char_freqs.items()) + sorted_subwords[: 300 - len(char_freqs)]\n", | |
"token_freqs = {token: freq for token, freq in token_freqs}" | |
] | |
}, | |
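{
"cell_type": "markdown",
"id": "ad0ed003",
"metadata": {},
"source": [
"Quick sanity check: by construction the starting vocabulary contains every single character plus the most frequent subwords, truncated so that the total comes out to 300 tokens."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad0ed004",
"metadata": {},
"outputs": [],
"source": [
"# Initial vocabulary size (should be 300 by construction)\n",
"len(token_freqs)"
]
},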
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"id": "ec931454", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'▁': 31,\n", | |
" 'T': 3,\n", | |
" 'h': 9,\n", | |
" 'i': 13,\n", | |
" 's': 13,\n", | |
" 't': 14,\n", | |
" 'e': 21,\n", | |
" 'H': 2,\n", | |
" 'u': 6,\n", | |
" 'g': 5,\n", | |
" 'n': 11,\n", | |
" 'F': 1,\n", | |
" 'a': 12,\n", | |
" 'c': 3,\n", | |
" 'C': 1,\n", | |
" 'o': 13,\n", | |
" 'r': 9,\n", | |
" '.': 4,\n", | |
" 'p': 2,\n", | |
" 'b': 3,\n", | |
" 'k': 3,\n", | |
" 'z': 2,\n", | |
" 'w': 3,\n", | |
" 'v': 1,\n", | |
" 'l': 7,\n", | |
" 'm': 1,\n", | |
" 'f': 1,\n", | |
" 'y': 3,\n", | |
" ',': 1,\n", | |
" 'd': 4,\n", | |
" '▁t': 7,\n", | |
" 'is': 5,\n", | |
" 'er': 5,\n", | |
" '▁a': 5,\n", | |
" '▁to': 4,\n", | |
" 'to': 4,\n", | |
" 'en': 4,\n", | |
" '▁T': 3,\n", | |
" '▁Th': 3,\n", | |
" '▁Thi': 3,\n", | |
" '▁This': 3,\n", | |
" 'Th': 3,\n", | |
" 'Thi': 3,\n", | |
" 'This': 3,\n", | |
" 'hi': 3,\n", | |
" 'his': 3,\n", | |
" 'th': 3,\n", | |
" 'ou': 3,\n", | |
" 'se': 3,\n", | |
" '▁tok': 3,\n", | |
" '▁toke': 3,\n", | |
" '▁token': 3,\n", | |
" 'tok': 3,\n", | |
" 'toke': 3,\n", | |
" 'token': 3,\n", | |
" 'ok': 3,\n", | |
" 'oke': 3,\n", | |
" 'oken': 3,\n", | |
" 'ke': 3,\n", | |
" 'ken': 3,\n", | |
" '▁s': 3,\n", | |
" 'ra': 3,\n", | |
" 'nd': 3,\n", | |
" '▁i': 2,\n", | |
" '▁is': 2,\n", | |
" '▁th': 2,\n", | |
" '▁the': 2,\n", | |
" 'the': 2,\n", | |
" 'he': 2,\n", | |
" '▁H': 2,\n", | |
" 'in': 2,\n", | |
" 'rs': 2,\n", | |
" 'te': 2,\n", | |
" '▁ab': 2,\n", | |
" 'ab': 2,\n", | |
" '▁tokeni': 2,\n", | |
" '▁tokeniz': 2,\n", | |
" 'tokeni': 2,\n", | |
" 'tokeniz': 2,\n", | |
" 'okeni': 2,\n", | |
" 'okeniz': 2,\n", | |
" 'keni': 2,\n", | |
" 'keniz': 2,\n", | |
" 'eni': 2,\n", | |
" 'eniz': 2,\n", | |
" 'ni': 2,\n", | |
" 'niz': 2,\n", | |
" 'iz': 2,\n", | |
" 'at': 2,\n", | |
" 'ti': 2,\n", | |
" 'tio': 2,\n", | |
" 'tion': 2,\n", | |
" 'io': 2,\n", | |
" 'ion': 2,\n", | |
" 'on': 2,\n", | |
" '▁se': 2,\n", | |
" 'ho': 2,\n", | |
" 'how': 2,\n", | |
" 'ow': 2,\n", | |
" 'era': 2,\n", | |
" 'al': 2,\n", | |
" 's.': 2,\n", | |
" 'll': 2,\n", | |
" 'an': 2,\n", | |
" 'and': 2,\n", | |
" 'ne': 2,\n", | |
" '▁Hu': 1,\n", | |
" '▁Hug': 1,\n", | |
" '▁Hugg': 1,\n", | |
" '▁Huggi': 1,\n", | |
" '▁Huggin': 1,\n", | |
" '▁Hugging': 1,\n", | |
" 'Hu': 1,\n", | |
" 'Hug': 1,\n", | |
" 'Hugg': 1,\n", | |
" 'Huggi': 1,\n", | |
" 'Huggin': 1,\n", | |
" 'Hugging': 1,\n", | |
" 'ug': 1,\n", | |
" 'ugg': 1,\n", | |
" 'uggi': 1,\n", | |
" 'uggin': 1,\n", | |
" 'ugging': 1,\n", | |
" 'gg': 1,\n", | |
" 'ggi': 1,\n", | |
" 'ggin': 1,\n", | |
" 'gging': 1,\n", | |
" 'gi': 1,\n", | |
" 'gin': 1,\n", | |
" 'ging': 1,\n", | |
" 'ing': 1,\n", | |
" 'ng': 1,\n", | |
" '▁F': 1,\n", | |
" '▁Fa': 1,\n", | |
" '▁Fac': 1,\n", | |
" '▁Face': 1,\n", | |
" 'Fa': 1,\n", | |
" 'Fac': 1,\n", | |
" 'Face': 1,\n", | |
" 'ac': 1,\n", | |
" 'ace': 1,\n", | |
" 'ce': 1,\n", | |
" '▁C': 1,\n", | |
" '▁Co': 1,\n", | |
" '▁Cou': 1,\n", | |
" '▁Cour': 1,\n", | |
" '▁Cours': 1,\n", | |
" '▁Course': 1,\n", | |
" '▁Course.': 1,\n", | |
" 'Co': 1,\n", | |
" 'Cou': 1,\n", | |
" 'Cour': 1,\n", | |
" 'Cours': 1,\n", | |
" 'Course': 1,\n", | |
" 'Course.': 1,\n", | |
" 'our': 1,\n", | |
" 'ours': 1,\n", | |
" 'ourse': 1,\n", | |
" 'ourse.': 1,\n", | |
" 'ur': 1,\n", | |
" 'urs': 1,\n", | |
" 'urse': 1,\n", | |
" 'urse.': 1,\n", | |
" 'rse': 1,\n", | |
" 'rse.': 1,\n", | |
" 'se.': 1,\n", | |
" 'e.': 1,\n", | |
" '▁c': 1,\n", | |
" '▁ch': 1,\n", | |
" '▁cha': 1,\n", | |
" '▁chap': 1,\n", | |
" '▁chapt': 1,\n", | |
" '▁chapte': 1,\n", | |
" '▁chapter': 1,\n", | |
" 'ch': 1,\n", | |
" 'cha': 1,\n", | |
" 'chap': 1,\n", | |
" 'chapt': 1,\n", | |
" 'chapte': 1,\n", | |
" 'chapter': 1,\n", | |
" 'ha': 1,\n", | |
" 'hap': 1,\n", | |
" 'hapt': 1,\n", | |
" 'hapte': 1,\n", | |
" 'hapter': 1,\n", | |
" 'ap': 1,\n", | |
" 'apt': 1,\n", | |
" 'apte': 1,\n", | |
" 'apter': 1,\n", | |
" 'pt': 1,\n", | |
" 'pte': 1,\n", | |
" 'pter': 1,\n", | |
" 'ter': 1,\n", | |
" '▁abo': 1,\n", | |
" '▁abou': 1,\n", | |
" '▁about': 1,\n", | |
" 'abo': 1,\n", | |
" 'abou': 1,\n", | |
" 'about': 1,\n", | |
" 'bo': 1,\n", | |
" 'bou': 1,\n", | |
" 'bout': 1,\n", | |
" 'out': 1,\n", | |
" 'ut': 1,\n", | |
" '▁tokeniza': 1,\n", | |
" '▁tokenizat': 1,\n", | |
" '▁tokenizati': 1,\n", | |
" '▁tokenizatio': 1,\n", | |
" '▁tokenization': 1,\n", | |
" '▁tokenization.': 1,\n", | |
" 'tokeniza': 1,\n", | |
" 'tokenizat': 1,\n", | |
" 'tokenizati': 1,\n", | |
" 'tokenizatio': 1,\n", | |
" 'tokenization': 1,\n", | |
" 'tokenization.': 1,\n", | |
" 'okeniza': 1,\n", | |
" 'okenizat': 1,\n", | |
" 'okenizati': 1,\n", | |
" 'okenizatio': 1,\n", | |
" 'okenization': 1,\n", | |
" 'okenization.': 1,\n", | |
" 'keniza': 1,\n", | |
" 'kenizat': 1,\n", | |
" 'kenizati': 1,\n", | |
" 'kenizatio': 1,\n", | |
" 'kenization': 1,\n", | |
" 'kenization.': 1,\n", | |
" 'eniza': 1,\n", | |
" 'enizat': 1,\n", | |
" 'enizati': 1,\n", | |
" 'enizatio': 1,\n", | |
" 'enization': 1,\n", | |
" 'enization.': 1,\n", | |
" 'niza': 1,\n", | |
" 'nizat': 1,\n", | |
" 'nizati': 1,\n", | |
" 'nizatio': 1,\n", | |
" 'nization': 1,\n", | |
" 'nization.': 1,\n", | |
" 'iza': 1,\n", | |
" 'izat': 1,\n", | |
" 'izati': 1,\n", | |
" 'izatio': 1,\n", | |
" 'ization': 1,\n", | |
" 'ization.': 1,\n", | |
" 'za': 1,\n", | |
" 'zat': 1,\n", | |
" 'zati': 1,\n", | |
" 'zatio': 1,\n", | |
" 'zation': 1,\n", | |
" 'zation.': 1,\n", | |
" 'ati': 1,\n", | |
" 'atio': 1,\n", | |
" 'ation': 1,\n", | |
" 'ation.': 1,\n", | |
" 'tion.': 1,\n", | |
" 'ion.': 1,\n", | |
" 'on.': 1,\n", | |
" 'n.': 1,\n", | |
" '▁sec': 1,\n", | |
" '▁sect': 1,\n", | |
" '▁secti': 1,\n", | |
" '▁sectio': 1,\n", | |
" '▁section': 1,\n", | |
" 'sec': 1,\n", | |
" 'sect': 1,\n", | |
" 'secti': 1,\n", | |
" 'sectio': 1,\n", | |
" 'section': 1,\n", | |
" 'ec': 1,\n", | |
" 'ect': 1,\n", | |
" 'ecti': 1,\n", | |
" 'ectio': 1,\n", | |
" 'ection': 1,\n", | |
" 'ct': 1,\n", | |
" 'cti': 1,\n", | |
" 'ctio': 1,\n", | |
" 'ction': 1,\n", | |
" '▁sh': 1,\n", | |
" '▁sho': 1,\n", | |
" '▁show': 1,\n", | |
" '▁shows': 1,\n", | |
" 'sh': 1,\n", | |
" 'sho': 1,\n", | |
" 'show': 1,\n", | |
" 'shows': 1,\n", | |
" 'hows': 1,\n", | |
" 'ows': 1,\n", | |
" 'ws': 1,\n", | |
" '▁sev': 1,\n", | |
" '▁seve': 1,\n", | |
" '▁sever': 1,\n", | |
" '▁severa': 1,\n", | |
" '▁several': 1,\n", | |
" 'sev': 1,\n", | |
" 'seve': 1,\n", | |
" 'sever': 1,\n", | |
" 'severa': 1,\n", | |
" 'several': 1}" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"token_freqs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "c8622b4f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from math import log\n", | |
"\n", | |
"total_sum = sum([freq for token, freq in token_freqs.items()])\n", | |
"model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}" | |
] | |
}, | |
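{
"cell_type": "markdown",
"id": "ad0ed005",
"metadata": {},
"source": [
"Since `model` maps each token to `-log(freq / total_sum)`, the score of a segmentation is just the sum of its tokens' values, and a lower total means a more probable segmentation. A quick hand comparison for the word `This` (segmentations chosen arbitrarily for illustration):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad0ed006",
"metadata": {},
"outputs": [],
"source": [
"# Compare a few candidate segmentations of \"This\" by summing the negative log\n",
"# probabilities of their tokens (lower total = more probable)\n",
"for segmentation in [[\"This\"], [\"Th\", \"is\"], [\"T\", \"h\", \"i\", \"s\"]]:\n",
"    print(segmentation, sum(model[token] for token in segmentation))"
]
},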
{
"cell_type": "code",
"execution_count": 9,
"id": "65abfe44",
"metadata": {},
"outputs": [],
"source": [
"def encode_word(word, model):\n",
"    # Viterbi-style dynamic programming: best_segmentations[i] holds the lowest score\n",
"    # found so far for a segmentation of word[:i] and where its last token starts.\n",
"    # The initial score of 1 is a constant offset shared by every path, so it changes\n",
"    # the reported score but not which segmentation wins.\n",
"    best_segmentations = [{\"start\": 0, \"score\": 1}] + [\n",
" {\"start\": None, \"score\": None} for _ in range(len(word))\n", | |
" ]\n", | |
" for start_idx in range(len(word)):\n", | |
" # This should be properly filled by the previous steps of the loop\n", | |
" best_score_at_start = best_segmentations[start_idx][\"score\"]\n", | |
" for end_idx in range(start_idx + 1, len(word) + 1):\n", | |
" token = word[start_idx:end_idx]\n", | |
" if token in model and best_score_at_start is not None:\n", | |
" score = model[token] + best_score_at_start\n", | |
" # If we have found a better segmentation ending at end_idx, we update\n", | |
" if (\n", | |
" best_segmentations[end_idx][\"score\"] is None\n", | |
" or best_segmentations[end_idx][\"score\"] > score\n", | |
" ):\n", | |
" best_segmentations[end_idx] = {\"start\": start_idx, \"score\": score}\n", | |
"\n", | |
" segmentation = best_segmentations[-1]\n", | |
" if segmentation[\"score\"] is None:\n", | |
" # We did not find a tokenization of the word -> unknown\n", | |
" return [\"<unk>\"], None\n", | |
"\n", | |
" score = segmentation[\"score\"]\n", | |
" start = segmentation[\"start\"]\n", | |
" end = len(word)\n", | |
" tokens = []\n", | |
" while start != 0:\n", | |
" tokens.insert(0, word[start:end])\n", | |
" next_start = best_segmentations[start][\"start\"]\n", | |
" end = start\n", | |
" start = next_start\n", | |
" tokens.insert(0, word[start:end])\n", | |
" return tokens, score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "b3f4d087", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(['H', 'o', 'p', 'e', 'f', 'u', 'll', 'y'], 41.5157494601402)\n", | |
"(['This'], 6.288267030694535)\n" | |
] | |
} | |
], | |
"source": [ | |
"print(encode_word(\"Hopefully\", model))\n", | |
"print(encode_word(\"This\", model))" | |
] | |
}, | |
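{
"cell_type": "markdown",
"id": "ad0ed007",
"metadata": {},
"source": [
"If a word contains a character that is not in the vocabulary, no segmentation can reach the end of the word and `encode_word` falls back to `<unk>` with no score. For example (`!` never appears in the corpus):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad0ed008",
"metadata": {},
"outputs": [],
"source": [
"# \"!\" is not in the vocabulary, so this word cannot be segmented\n",
"print(encode_word(\"Hopefully!\", model))"
]
},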
{
"cell_type": "code",
"execution_count": 11,
"id": "d0e6991d",
"metadata": {},
"outputs": [],
"source": [
"def compute_loss(model):\n",
"    # Corpus loss: frequency-weighted sum of each word's best segmentation score.\n",
"    # Assumes every word can be segmented, which holds here because single-character\n",
"    # tokens are never removed from the vocabulary.\n",
"    loss = 0\n",
" for word, freq in word_freqs.items():\n", | |
" _, word_loss = encode_word(word, model)\n", | |
" loss += freq * word_loss\n", | |
" return loss" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "8681ebbb", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"413.10377642940875" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"compute_loss(model)\n" | |
] | |
}, | |
{
"cell_type": "code",
"execution_count": 13,
"id": "2cd27f4c",
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"\n",
"\n",
"def compute_scores(model):\n",
"    # Score each removable token by how much the corpus loss would increase\n",
"    # if that token were dropped from the vocabulary.\n",
"    scores = {}\n",
"    model_loss = compute_loss(model)\n",
" for token, score in model.items():\n", | |
" # We always keep tokens of length 1\n", | |
" if len(token) == 1:\n", | |
" continue\n", | |
" model_without_token = copy.deepcopy(model)\n", | |
" _ = model_without_token.pop(token)\n", | |
" scores[token] = compute_loss(model_without_token) - model_loss\n", | |
" return scores" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "3a42bd3c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"6.376412403623874\n", | |
"0.0\n" | |
] | |
} | |
], | |
"source": [ | |
"scores = compute_scores(model)\n", | |
"print(scores[\"ll\"])\n", | |
"print(scores[\"his\"])" | |
] | |
}, | |
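{
"cell_type": "markdown",
"id": "ad0ed009",
"metadata": {},
"source": [
"Sorting the scores shows which tokens the pruning loop below will drop first, namely the ones whose removal increases the corpus loss the least (or not at all):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad0ed010",
"metadata": {},
"outputs": [],
"source": [
"# Tokens whose removal costs the least are pruned first\n",
"sorted(scores.items(), key=lambda x: x[1])[:10]"
]
},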
{
"cell_type": "code",
"execution_count": 15,
"id": "53c09fbc",
"metadata": {},
"outputs": [],
"source": [
"percent_to_remove = 0.1\n",
"# Iteratively shrink the vocabulary: drop the 10% of removable tokens whose removal\n",
"# hurts the loss the least, then rebuild the model, until at most 100 tokens remain.\n",
"while len(model) > 100:\n",
" scores = compute_scores(model)\n", | |
" sorted_scores = sorted(scores.items(), key=lambda x: x[1])\n", | |
" # Remove percent_to_remove tokens with the lowest scores.\n", | |
" for i in range(int(len(model) * percent_to_remove)):\n", | |
" _ = token_freqs.pop(sorted_scores[i][0])\n", | |
"\n", | |
" total_sum = sum([freq for token, freq in token_freqs.items()])\n", | |
" model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}" | |
] | |
}, | |
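{
"cell_type": "markdown",
"id": "ad0ed011",
"metadata": {},
"source": [
"After pruning, the vocabulary should sit at or just below 100 tokens. Single-character tokens are never scored, so they all survive and every word remains tokenizable:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad0ed012",
"metadata": {},
"outputs": [],
"source": [
"# Final vocabulary size and a few of the surviving multi-character tokens\n",
"print(len(model))\n",
"print([token for token in model if len(token) > 1][:20])"
]
},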
{
"cell_type": "code",
"execution_count": 16,
"id": "84633d43",
"metadata": {},
"outputs": [],
"source": [
"def tokenize(text, model):\n",
"    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n",
"    pre_tokenized_text = [word for word, offset in words_with_offsets]\n",
"    encoded_words = [encode_word(word, model)[0] for word in pre_tokenized_text]\n",
"    # Flatten the per-word token lists into a single token sequence\n",
"    return sum(encoded_words, [])"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"id": "de95cf77", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['▁This',\n", | |
" '▁is',\n", | |
" '▁the',\n", | |
" '▁Hugging',\n", | |
" '▁Face',\n", | |
" '▁',\n", | |
" 'c',\n", | |
" 'ou',\n", | |
" 'r',\n", | |
" 's',\n", | |
" 'e',\n", | |
" '.']" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tokenize(\"This is the Hugging Face course.\", model)\n" | |
] | |
}, | |
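{
"cell_type": "markdown",
"id": "ad0ed013",
"metadata": {},
"source": [
"For comparison, the pretrained XLNet tokenizer loaded at the top is itself a SentencePiece Unigram model with a full-size vocabulary, so its segmentation of the same sentence differs from our small toy vocabulary:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad0ed014",
"metadata": {},
"outputs": [],
"source": [
"# Segmentation produced by the pretrained tokenizer itself\n",
"tokenizer.tokenize(\"This is the Hugging Face course.\")"
]
},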
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "03270dbe", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |