{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Explore different tokenizers: both quality and performance\n",
    "====\n",
    "\n",
    "HT:\n",
    "- https://github.com/norvig/pytudes/blob/master/How%20to%20Do%20Things%20with%20Words.ipynb\n",
    "- https://www.oreilly.com/learning/how-can-i-tokenize-a-sentence-with-python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reset -fs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from urllib import request"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get a reasonable chunk of text data to test speed\n",
    "url = \"https://www.wolframcloud.com/objects/fd2da57e-2af0-4114-9d08-f45c062389d4\" # Leaves of Grass by Whitman\n",
    "text = request.urlopen(url).read().decode(\"utf8\") # NOTE: Convert from bytes to str"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'LEAVES OF GRASS\\n\\nBy Walt Whitman\\n\\n\\n\\n Come, said my soul,\\n Such verses for my Body let us write, (for we are one,)\\n That should I after return,\\n Or, long, long hence, in other spheres,\\n '"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text[:205]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test suite\n",
    "def tokenize_example_text(tokenizer_function_under_test):\n",
    "    text = \"This is: A test, 1, 2, 3, this-is.\"\n",
    "    print(f\"Original:\\t{text}\")\n",
    "    print(\" \")\n",
    "    print(\"Tokenized: \", end=\"\\t\")\n",
    "    print(*tokenizer_function_under_test(text), sep=\"\\n\\t\\t\")\n",
    "\n",
    "def evaluate_tokenizer(tokenizer_function_under_test, text=text):\n",
    "    tokenizer_function_under_test(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Built-in string method\n",
    "-----"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define tokenizer function\n",
    "tokens_str_split = str.split"
   ]
  },
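  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`str.split()` with no arguments splits on any run of whitespace (spaces, tabs, newlines) and drops empty strings, which is why it needs no cleanup pass for the poem's line breaks. A quick check of that behavior (an illustrative example added here, not part of the original benchmark):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Runs of whitespace and newlines each collapse to a single split point\n",
    "tokens_str_split(\"Come,  said my soul,\\n Such verses for my Body\")"
   ]
  },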
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original:\tThis is: A test, 1, 2, 3, this-is.\n",
      " \n",
      "Tokenized: \tThis\n",
      "\t\tis:\n",
      "\t\tA\n",
      "\t\ttest,\n",
      "\t\t1,\n",
      "\t\t2,\n",
      "\t\t3,\n",
      "\t\tthis-is.\n"
     ]
    }
   ],
   "source": [
    "# Visually inspect results for quality\n",
    "tokenize_example_text(tokens_str_split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate performance speed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12.7 ms ± 2.18 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n3\n",
    "\n",
    "evaluate_tokenizer(tokens_str_split)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Regex methods\n",
    "----"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define tokenizer function\n",
    "def tokens_regex_restricted(text):\n",
    "    \"List all the word tokens in a text. Normalize to lowercase.\"\n",
    "    return re.findall(r'\\w+', text.lower())"
   ]
  },
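  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`re.findall` reuses a cached compiled pattern after the first call, but hoisting an explicit `re.compile` out of the function makes that cost visible and skips the cache lookup on every call. A sketch of that variant (`WORD_RE` and `tokens_regex_compiled` are names introduced here, not part of the original gist):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: precompile the pattern once, outside the function\n",
    "WORD_RE = re.compile(r'\\w+')\n",
    "\n",
    "def tokens_regex_compiled(text):\n",
    "    \"List all the word tokens in a text, using a precompiled pattern.\"\n",
    "    return WORD_RE.findall(text.lower())"
   ]
  },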
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original:\tThis is: A test, 1, 2, 3, this-is.\n",
      " \n",
      "Tokenized: \tthis\n",
      "\t\tis\n",
      "\t\ta\n",
      "\t\ttest\n",
      "\t\t1\n",
      "\t\t2\n",
      "\t\t3\n",
      "\t\tthis\n",
      "\t\tis\n"
     ]
    }
   ],
   "source": [
    "# Visually inspect results for quality\n",
    "tokenize_example_text(tokens_regex_restricted)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate performance speed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "55.4 ms ± 7.81 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n3\n",
    "\n",
    "evaluate_tokenizer(tokens_regex_restricted)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define tokenizer function\n",
    "def tokens_regex_improved(text):\n",
    "    \"List all the word tokens (consecutive letters) in a text. Normalize to lowercase.\"\n",
    "    return re.findall(r'[a-z]+', text.lower())"
   ]
  },
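  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Restricting the pattern to `[a-z]+` is faster here, but it silently drops digits and splits on apostrophes, so contractions come apart. A quick illustration of that tradeoff (an added example, not part of the original gist):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The digit disappears and \"Don't\" becomes ['don', 't']\n",
    "tokens_regex_improved(\"Don't count 2 apples\")"
   ]
  },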
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original:\tThis is: A test, 1, 2, 3, this-is.\n",
      " \n",
      "Tokenized: \tthis\n",
      "\t\tis\n",
      "\t\ta\n",
      "\t\ttest\n",
      "\t\tthis\n",
      "\t\tis\n"
     ]
    }
   ],
   "source": [
    "# Visually inspect results for quality\n",
    "tokenize_example_text(tokens_regex_improved)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate performance speed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "46.9 ms ± 2.2 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n3\n",
    "\n",
    "evaluate_tokenizer(tokens_regex_improved)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Python packages\n",
    "----\n",
    "\n",
    "### nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: nltk did not support Python 3.7 at the time of writing ☹\n",
    "\n",
    "# # Import tokenizer function\n",
    "# from nltk.tokenize import word_tokenize as tokens_nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Visually inspect results for quality\n",
    "# tokenize_example_text(tokens_nltk)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate performance speed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %%timeit -n3\n",
    "\n",
    "# evaluate_tokenizer(tokens_nltk) # Takes about 1 second in wall time"
   ]
  },
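  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "On a Python version that nltk supports, the cells above could be run roughly as follows. A sketch, assuming nltk is installed and the `punkt` tokenizer model has been downloaded; it is left commented out to match the environment used here:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # One-time setup for nltk's word_tokenize\n",
    "# import nltk\n",
    "# nltk.download('punkt')  # model that word_tokenize depends on\n",
    "# from nltk.tokenize import word_tokenize as tokens_nltk\n",
    "#\n",
    "# tokenize_example_text(tokens_nltk)"
   ]
  },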
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### spaCy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "\n",
    "nlp = spacy.load('en_core_web_sm') # Load language model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define helper function\n",
    "def tokens_spacy(text):\n",
    "    doc = nlp.tokenizer(text)\n",
    "    return list(doc)  # NOTE: Returns spaCy Token objects, not str"
   ]
  },
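  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Only the tokenizer is exercised here, so loading the full `en_core_web_sm` pipeline is more than is needed; a blank English pipeline carries the same rule-based tokenizer without the statistical model weights. A sketch of that lighter setup (`nlp_blank` and `tokens_spacy_blank` are names introduced here, not part of the original gist):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: blank pipeline with tokenizer rules only, no tagger/parser weights\n",
    "nlp_blank = spacy.blank('en')\n",
    "\n",
    "def tokens_spacy_blank(text):\n",
    "    return [token.text for token in nlp_blank.tokenizer(text)]"
   ]
  },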
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original:\tThis is: A test, 1, 2, 3, this-is.\n",
      " \n",
      "Tokenized: \tThis\n",
      "\t\tis\n",
      "\t\t:\n",
      "\t\tA\n",
      "\t\ttest\n",
      "\t\t,\n",
      "\t\t1\n",
      "\t\t,\n",
      "\t\t2\n",
      "\t\t,\n",
      "\t\t3\n",
      "\t\t,\n",
      "\t\tthis\n",
      "\t\t-\n",
      "\t\tis\n",
      "\t\t.\n"
     ]
    }
   ],
   "source": [
    "# Visually inspect results for quality\n",
    "tokenize_example_text(tokens_spacy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate performance speed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.94 s ± 135 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n1\n",
    "\n",
    "evaluate_tokenizer(tokens_spacy)"
   ]
  },
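  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To see the candidates side by side without rerunning each `%%timeit` cell, a small single-run harness can loop over them. A sketch added here (wall-clock times will vary by machine, and single runs are noisier than `%%timeit`'s averages):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "# Single-run wall-clock comparison of the tokenizers defined above\n",
    "candidates = [(\"str.split\", tokens_str_split),\n",
    "              (\"regex \\\\w+\", tokens_regex_restricted),\n",
    "              (\"regex [a-z]+\", tokens_regex_improved),\n",
    "              (\"spaCy\", tokens_spacy)]\n",
    "\n",
    "for name, tokenizer in candidates:\n",
    "    start = time.perf_counter()\n",
    "    tokenizer(text)\n",
    "    print(f\"{name:<14}{time.perf_counter() - start:.3f} s\")"
   ]
  },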
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<br>\n",
    "<br>\n",
    "<br>\n",
    "\n",
    "----"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
} |