Last active
May 3, 2021 16:57
-
-
Save BryanCutler/07ab41418ba1a2575676a2dfb7ea40f9 to your computer and use it in GitHub Desktop.
Text Extensions for Pandas: Tips and Techniques for Extending Pandas, Part 1 Blog
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>token_id</th>\n", | |
" <th>span</th>\n", | |
" <th>input_id</th>\n", | |
" <th>token_type_id</th>\n", | |
" <th>attention_mask</th>\n", | |
" <th>special_tokens_mask</th>\n", | |
" <th>embedding</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>[0, 0): ''</td>\n", | |
" <td>101</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>True</td>\n", | |
" <td>[ -0.2746679, 0.4125931, -0.03875526...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>[0, 5): 'Monty'</td>\n", | |
" <td>18446</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>False</td>\n", | |
" <td>[ 0.8781024, 0.16891398, -0.0992790...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>[6, 12): 'Python'</td>\n", | |
" <td>18750</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>False</td>\n", | |
" <td>[ 0.44240445, 0.97607553, 0.002306254...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>[13, 16): 'and'</td>\n", | |
" <td>1998</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>False</td>\n", | |
" <td>[ -0.8111421, 0.61899036, -0.64434...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>[17, 20): 'the'</td>\n", | |
" <td>1996</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>False</td>\n", | |
" <td>[ -0.3957557, 0.51493996, -0.646472...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>5</td>\n", | |
" <td>[21, 25): 'Holy'</td>\n", | |
" <td>4151</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>False</td>\n", | |
" <td>[ 1.4014304, 0.68754816, 0.2479123...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>6</td>\n", | |
" <td>[26, 28): 'Gr'</td>\n", | |
" <td>24665</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>False</td>\n", | |
" <td>[ -0.084513076, 0.33457386, 0.2393767...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>7</td>\n", | |
" <td>[28, 31): 'ail'</td>\n", | |
" <td>12502</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>False</td>\n", | |
" <td>[ 0.304468, 0.19518888, -0.7640254...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>8</td>\n", | |
" <td>[0, 0): ''</td>\n", | |
" <td>102</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>True</td>\n", | |
" <td>[ 0.934723, 0.3263963, -0.07516271...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" token_id span input_id token_type_id attention_mask \\\n", | |
"0 0 [0, 0): '' 101 0 1 \n", | |
"1 1 [0, 5): 'Monty' 18446 0 1 \n", | |
"2 2 [6, 12): 'Python' 18750 0 1 \n", | |
"3 3 [13, 16): 'and' 1998 0 1 \n", | |
"4 4 [17, 20): 'the' 1996 0 1 \n", | |
"5 5 [21, 25): 'Holy' 4151 0 1 \n", | |
"6 6 [26, 28): 'Gr' 24665 0 1 \n", | |
"7 7 [28, 31): 'ail' 12502 0 1 \n", | |
"8 8 [0, 0): '' 102 0 1 \n", | |
"\n", | |
" special_tokens_mask embedding \n", | |
"0 True [ -0.2746679, 0.4125931, -0.03875526... \n", | |
"1 False [ 0.8781024, 0.16891398, -0.0992790... \n", | |
"2 False [ 0.44240445, 0.97607553, 0.002306254... \n", | |
"3 False [ -0.8111421, 0.61899036, -0.64434... \n", | |
"4 False [ -0.3957557, 0.51493996, -0.646472... \n", | |
"5 False [ 1.4014304, 0.68754816, 0.2479123... \n", | |
"6 False [ -0.084513076, 0.33457386, 0.2393767... \n", | |
"7 False [ 0.304468, 0.19518888, -0.7640254... \n", | |
"8 True [ 0.934723, 0.3263963, -0.07516271... " | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import text_extensions_for_pandas as tp\n", | |
"from transformers import BertTokenizerFast, BertModel\n", | |
"\n", | |
"# Create a BERT model from the Huggingface transformers library.\n", | |
"model_name = \"bert-base-uncased\"\n", | |
"tokenizer = BertTokenizerFast.from_pretrained(model_name, add_special_tokens=True)\n", | |
"bert = BertModel.from_pretrained(model_name)\n", | |
"\n", | |
"# Sample text to generate example extension arrays.\n", | |
"text = \"Monty Python and the Holy Grail\"\n", | |
"\n", | |
"# Create Pandas DataFrame with tokens and embeddings from the model.\n", | |
"df = tp.io.bert.make_bert_tokens(text, tokenizer)\n", | |
"df = tp.io.bert.add_embeddings(df, bert)\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"token_id int64\n", | |
"span SpanDtype\n", | |
"input_id int64\n", | |
"token_type_id int64\n", | |
"attention_mask int64\n", | |
"special_tokens_mask bool\n", | |
"embedding TensorDtype\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# In the above DataFrame the columns \"span\" and \"embedding\" are extension dtypes.\n", | |
"df.dtypes" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment