Created
March 5, 2019 23:24
-
-
Save gangtao/4a6e1a0c13d201d0519945b6050a83a1 to your computer and use it in GitHub Desktop.
One Hot Encoding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Manual One Hot Encoding" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"hello world\n", | |
"[7, 4, 11, 11, 14, 26, 22, 14, 17, 11, 3]\n", | |
"[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n", | |
"h\n", | |
"e\n", | |
"l\n", | |
"l\n", | |
"o\n", | |
" \n", | |
"w\n", | |
"o\n", | |
"r\n", | |
"l\n", | |
"d\n" | |
] | |
} | |
], | |
"source": [ | |
"from numpy import argmax\n", | |
"# Manual one hot encoding of a string over a fixed alphabet.\n", | |
"# define input string\n", | |
"data = 'hello world'\n", | |
"print(data)\n", | |
"# define universe of possible input values: lowercase letters plus the space\n", | |
"alphabet = 'abcdefghijklmnopqrstuvwxyz '\n", | |
"# build both lookup directions (char -> index and index -> char)\n", | |
"char_to_int = {symbol: index for index, symbol in enumerate(alphabet)}\n", | |
"int_to_char = {index: symbol for index, symbol in enumerate(alphabet)}\n", | |
"# integer encode input data; a character missing from alphabet raises KeyError\n", | |
"integer_encoded = [char_to_int[symbol] for symbol in data]\n", | |
"print(integer_encoded)\n", | |
"# one hot encode: one row per character, with a single 1 at that character's index\n", | |
"onehot_encoded = [\n", | |
"    [1 if position == value else 0 for position in range(len(alphabet))]\n", | |
"    for value in integer_encoded\n", | |
"]\n", | |
"print(onehot_encoded)\n", | |
"# invert encoding: argmax finds the position of the 1 in each row\n", | |
"for row in onehot_encoded:\n", | |
"    inverted = int_to_char[argmax(row)]\n", | |
"    print(inverted)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"One Hot Encode with scikit-learn" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']\n", | |
"[0 0 2 0 1 1 2 0 2 1]\n", | |
"[[1. 0. 0.]\n", | |
" [1. 0. 0.]\n", | |
" [0. 0. 1.]\n", | |
" [1. 0. 0.]\n", | |
" [0. 1. 0.]\n", | |
" [0. 1. 0.]\n", | |
" [0. 0. 1.]\n", | |
" [1. 0. 0.]\n", | |
" [0. 0. 1.]\n", | |
" [0. 1. 0.]]\n", | |
"['cold']\n", | |
"['cold']\n", | |
"['warm']\n", | |
"['cold']\n", | |
"['hot']\n", | |
"['hot']\n", | |
"['warm']\n", | |
"['cold']\n", | |
"['warm']\n", | |
"['hot']\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", | |
" if diff:\n", | |
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", | |
" if diff:\n", | |
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", | |
" if diff:\n", | |
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", | |
" if diff:\n", | |
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", | |
" if diff:\n", | |
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", | |
" if diff:\n", | |
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", | |
" if diff:\n", | |
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", | |
" if diff:\n", | |
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", | |
" if diff:\n", | |
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", | |
" if diff:\n" | |
] | |
} | |
], | |
"source": [ | |
"from numpy import array\n", | |
"from numpy import argmax\n", | |
"from sklearn.preprocessing import LabelEncoder\n", | |
"from sklearn.preprocessing import OneHotEncoder\n", | |
"# define example\n", | |
"data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']\n", | |
"values = array(data)\n", | |
"print(values)\n", | |
"# integer encode: LabelEncoder assigns an integer code to each distinct label\n", | |
"label_encoder = LabelEncoder()\n", | |
"integer_encoded = label_encoder.fit_transform(values)\n", | |
"print(integer_encoded)\n", | |
"# binary encode; ask for a dense array so the result prints as a matrix\n", | |
"try:\n", | |
"    # scikit-learn >= 1.2 names this option sparse_output\n", | |
"    onehot_encoder = OneHotEncoder(sparse_output=False)\n", | |
"except TypeError:\n", | |
"    # older releases only accept the original sparse keyword\n", | |
"    onehot_encoder = OneHotEncoder(sparse=False)\n", | |
"# the encoder is fitted on a 2-D column: one row per sample, a single feature\n", | |
"integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)\n", | |
"onehot_encoded = onehot_encoder.fit_transform(integer_encoded)\n", | |
"print(onehot_encoded)\n", | |
"# invert every example: argmax recovers the integer code, inverse_transform the label\n", | |
"for row in onehot_encoded:\n", | |
"    inverted = label_encoder.inverse_transform([argmax(row)])\n", | |
"    print(inverted)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"One Hot Encode with Keras" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Using TensorFlow backend.\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[1 3 2 0 3 2 2 1 0 1]\n", | |
"[[0. 1. 0. 0.]\n", | |
" [0. 0. 0. 1.]\n", | |
" [0. 0. 1. 0.]\n", | |
" [1. 0. 0. 0.]\n", | |
" [0. 0. 0. 1.]\n", | |
" [0. 0. 1. 0.]\n", | |
" [0. 0. 1. 0.]\n", | |
" [0. 1. 0. 0.]\n", | |
" [1. 0. 0. 0.]\n", | |
" [0. 1. 0. 0.]]\n", | |
"1\n" | |
] | |
} | |
], | |
"source": [ | |
"from numpy import array\n", | |
"from numpy import argmax\n", | |
"from keras.utils import to_categorical\n", | |
"# define example: integer class labels, built directly as a numpy array\n", | |
"data = array([1, 3, 2, 0, 3, 2, 2, 1, 0, 1])\n", | |
"print(data)\n", | |
"# one hot encode: each label becomes a row with a single 1.0 at the label's index\n", | |
"encoded = to_categorical(data)\n", | |
"print(encoded)\n", | |
"# invert encoding for the first row only; argmax returns the index of its 1.0\n", | |
"first_row = encoded[0]\n", | |
"inverted = argmax(first_row)\n", | |
"print(inverted)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment