Created
February 25, 2021 20:36
-
-
Save LysandreJik/04c7cfe3d2656ae1c4c388ce9cdd3ea4 to your computer and use it in GitHub Desktop.
Broken Tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Broken Tokenizer", | |
"provenance": [], | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/LysandreJik/04c7cfe3d2656ae1c4c388ce9cdd3ea4/broken-tokenizer.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "79DQ3WqLT1rz", | |
"outputId": "528af80d-f6f6-4e1c-8f62-2cfda6f054ef" | |
}, | |
"source": [ | |
"%pip install tokenizers==0.10.1 transformers==4.3.3" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Collecting tokenizers==0.10.1\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)\n", | |
"\u001b[K |████████████████████████████████| 3.2MB 6.7MB/s \n", | |
"\u001b[?25hCollecting transformers==4.3.3\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)\n", | |
"\u001b[K |████████████████████████████████| 1.9MB 34.5MB/s \n", | |
"\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (20.9)\n", | |
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (1.19.5)\n", | |
"Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (3.0.12)\n", | |
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (2.23.0)\n", | |
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (4.41.1)\n", | |
"Collecting sacremoses\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)\n", | |
"\u001b[K |████████████████████████████████| 890kB 35.6MB/s \n", | |
"\u001b[?25hRequirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (3.4.0)\n", | |
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.3) (2019.12.20)\n", | |
"Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers==4.3.3) (2.4.7)\n", | |
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.3) (3.0.4)\n", | |
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.3) (2.10)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.3) (2020.12.5)\n", | |
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.3) (1.24.3)\n", | |
"Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.3.3) (1.15.0)\n", | |
"Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.3.3) (7.1.2)\n", | |
"Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.3.3) (1.0.1)\n", | |
"Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers==4.3.3) (3.7.4.3)\n", | |
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers==4.3.3) (3.4.0)\n", | |
"Building wheels for collected packages: sacremoses\n", | |
" Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=cd09dc6605e1a6a1c26668f0d347aad64598e3f4334867237d6ad1de8b936295\n", | |
" Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45\n", | |
"Successfully built sacremoses\n", | |
"Installing collected packages: tokenizers, sacremoses, transformers\n", | |
"Successfully installed sacremoses-0.0.43 tokenizers-0.10.1 transformers-4.3.3\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "nbrq0DMFT5s8", | |
"outputId": "b742df55-cfd8-4b3e-97ae-5474ad42d6c3" | |
}, | |
"source": [ | |
"!wget https://www.gutenberg.org/files/1112/1112.txt -O data.txt" | |
], | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"--2021-02-25 20:32:20-- https://www.gutenberg.org/files/1112/1112.txt\n", | |
"Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47\n", | |
"Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 179410 (175K) [text/plain]\n", | |
"Saving to: ‘data.txt’\n", | |
"\n", | |
"data.txt 100%[===================>] 175.21K 882KB/s in 0.2s \n", | |
"\n", | |
"2021-02-25 20:32:21 (882 KB/s) - ‘data.txt’ saved [179410/179410]\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Ach1EHSuUDXA" | |
}, | |
"source": [ | |
"from tokenizers import ByteLevelBPETokenizer\n", | |
"from tokenizers.processors import RobertaProcessing" | |
], | |
"execution_count": 12, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "VyMUQYK8UhQY" | |
}, | |
"source": [ | |
"from pathlib import Path\n", | |
"import shutil\n", | |
"workdir = Path(\"./workdir\")\n", | |
"workdir.mkdir(exist_ok=True)" | |
], | |
"execution_count": 29, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "_z4fwwY7UEq5", | |
"outputId": "d1c6bdb1-fe18-42c4-e5e4-0f1e4cb29861" | |
}, | |
"source": [ | |
"# Train a byte-level BPE tokenizer on the downloaded text and save it under workdir.\n", | |
"tokenizer = ByteLevelBPETokenizer(lowercase=False)\n", | |
"# Pair each special-token string with its own id: <s> (id 0) is the CLS token and\n", | |
"# </s> (id 2) is the SEP token. The ids are expected to follow the order of\n", | |
"# `special_tokens` passed to train() below -- confirm if that list changes.\n", | |
"tokenizer.post_processor = RobertaProcessing(sep=(\"</s>\", 2), cls=(\"<s>\", 0))\n", | |
"tokenizer.train(\n", | |
"    [\"./data.txt\"],\n", | |
"    vocab_size=1234,\n", | |
"    special_tokens=[\"<s>\", \"<pad>\", \"</s>\", \"<unk>\", \"<mask>\"],\n", | |
"    min_frequency=5,\n", | |
")\n", | |
"\n", | |
"# Start from an empty output directory. ignore_errors=True keeps the first run,\n", | |
"# when the directory does not exist yet, from raising FileNotFoundError.\n", | |
"tokenizer_output_path = workdir / \"tokenizer\"\n", | |
"shutil.rmtree(tokenizer_output_path, ignore_errors=True)\n", | |
"tokenizer_output_path.mkdir(exist_ok=False)\n", | |
"\n", | |
"# Write the single-file tokenizer.json, then the model files; the list returned by\n", | |
"# save_model() is the last expression and therefore the cell's displayed output.\n", | |
"tokenizer_json = tokenizer_output_path / \"tokenizer.json\"\n", | |
"tokenizer.save(str(tokenizer_json.absolute()), pretty=True)\n", | |
"tokenizer.save_model(str(tokenizer_output_path.absolute()))" | |
], | |
"execution_count": 30, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"['/content/workdir/tokenizer/vocab.json',\n", | |
" '/content/workdir/tokenizer/merges.txt']" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 30 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "qhAbNtAaUny4", | |
"outputId": "e4e2c3da-61e1-4872-d6e7-e1a6f4a44670" | |
}, | |
"source": [ | |
"!ls ./workdir/tokenizer" | |
], | |
"execution_count": 31, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"merges.txt tokenizer.json vocab.json\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "iBGe4YdaUvi8" | |
}, | |
"source": [ | |
"from transformers import RobertaTokenizerFast, RobertaTokenizer" | |
], | |
"execution_count": 32, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "j0HrRzbrU0sn" | |
}, | |
"source": [ | |
"# Load the fast tokenizer class from the directory written by the training cell.\n", | |
"tfast = RobertaTokenizerFast.from_pretrained(\n", | |
"    \"./workdir/tokenizer\",\n", | |
"    model_max_length=10,\n", | |
")" | |
], | |
"execution_count": 33, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "aMNukldtU323", | |
"outputId": "028ee0ee-a01c-48d7-cd0f-aa7c840db2d4" | |
}, | |
"source": [ | |
"tfast(\"asd\", add_special_tokens=True)" | |
], | |
"execution_count": 34, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"{'input_ids': [0, 400, 72, 2], 'attention_mask': [1, 1, 1, 1]}" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 34 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "91Ed-y5xU8Xc", | |
"outputId": "f7dcb8f8-c166-4ff6-9b01-757b06e04e18" | |
}, | |
"source": [ | |
"# Load the same directory through the non-fast RobertaTokenizer class and encode\n", | |
"# the same input, so the result can be compared with the fast tokenizer's output.\n", | |
"tslow = RobertaTokenizer.from_pretrained(\n", | |
"    \"./workdir/tokenizer\",\n", | |
"    model_max_length=10,\n", | |
")\n", | |
"tslow(\"asd\", add_special_tokens=True)" | |
], | |
"execution_count": 35, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"{'input_ids': [0, 400, 72, 2], 'attention_mask': [1, 1, 1, 1]}" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 35 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "_47Pb8CEVEnn" | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": 10, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment