Skip to content

Instantly share code, notes, and snippets.

@gangtao
Created March 5, 2019 23:24
Show Gist options
  • Save gangtao/4a6e1a0c13d201d0519945b6050a83a1 to your computer and use it in GitHub Desktop.
Save gangtao/4a6e1a0c13d201d0519945b6050a83a1 to your computer and use it in GitHub Desktop.
One Hot Encoding
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Manual One Hot Encoding"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hello world\n",
"[7, 4, 11, 11, 14, 26, 22, 14, 17, 11, 3]\n",
"[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n",
"h\n",
"e\n",
"l\n",
"l\n",
"o\n",
" \n",
"w\n",
"o\n",
"r\n",
"l\n",
"d\n"
]
}
],
"source": [
"from numpy import argmax\n",
"\n",
"# Input text to encode.\n",
"data = 'hello world'\n",
"print(data)\n",
"\n",
"# Every symbol the encoder knows about; a symbol's position is its integer code.\n",
"alphabet = 'abcdefghijklmnopqrstuvwxyz '\n",
"\n",
"# Lookup tables in both directions: symbol -> code and code -> symbol.\n",
"char_to_int = {symbol: code for code, symbol in enumerate(alphabet)}\n",
"int_to_char = {code: symbol for code, symbol in enumerate(alphabet)}\n",
"\n",
"# Step 1: replace each character of the input by its integer code.\n",
"integer_encoded = [char_to_int[symbol] for symbol in data]\n",
"print(integer_encoded)\n",
"\n",
"# Step 2: expand each code into a vector of len(alphabet) zeros with a single 1\n",
"# at the index given by the code.\n",
"onehot_encoded = [\n",
"    [1 if position == code else 0 for position in range(len(alphabet))]\n",
"    for code in integer_encoded\n",
"]\n",
"print(onehot_encoded)\n",
"\n",
"# Step 3: invert the encoding - argmax finds the index of the 1 in each vector,\n",
"# which is then mapped back to its character.\n",
"for vector in onehot_encoded:\n",
"    inverted = int_to_char[argmax(vector)]\n",
"    print(inverted)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One Hot Encode with scikit-learn"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']\n",
"[0 0 2 0 1 1 2 0 2 1]\n",
"[[1. 0. 0.]\n",
" [1. 0. 0.]\n",
" [0. 0. 1.]\n",
" [1. 0. 0.]\n",
" [0. 1. 0.]\n",
" [0. 1. 0.]\n",
" [0. 0. 1.]\n",
" [1. 0. 0.]\n",
" [0. 0. 1.]\n",
" [0. 1. 0.]]\n",
"['cold']\n",
"['cold']\n",
"['warm']\n",
"['cold']\n",
"['hot']\n",
"['hot']\n",
"['warm']\n",
"['cold']\n",
"['warm']\n",
"['hot']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
" if diff:\n",
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
" if diff:\n",
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
" if diff:\n",
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
" if diff:\n",
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
" if diff:\n",
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
" if diff:\n",
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
" if diff:\n",
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
" if diff:\n",
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
" if diff:\n",
"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
" if diff:\n"
]
}
],
"source": [
"from numpy import array\n",
"from numpy import argmax\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"# Example categorical observations.\n",
"data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']\n",
"values = array(data)\n",
"print(values)\n",
"\n",
"# Integer encode: map each distinct label to an integer code.\n",
"label_encoder = LabelEncoder()\n",
"integer_encoded = label_encoder.fit_transform(values)\n",
"print(integer_encoded)\n",
"\n",
"# One hot encode, asking for a dense array instead of a sparse matrix.\n",
"# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old\n",
"# keyword, so try the new spelling first and fall back for older releases.\n",
"try:\n",
"    onehot_encoder = OneHotEncoder(sparse_output=False)\n",
"except TypeError:\n",
"    onehot_encoder = OneHotEncoder(sparse=False)\n",
"# The encoder is fed a 2-D column: one row per sample, a single feature column.\n",
"integer_encoded = integer_encoded.reshape(-1, 1)\n",
"onehot_encoded = onehot_encoder.fit_transform(integer_encoded)\n",
"print(onehot_encoded)\n",
"\n",
"# Invert every example: argmax recovers the integer code of each row, and the\n",
"# label encoder maps that code back to the original string label.\n",
"for row in onehot_encoded:\n",
"    inverted = label_encoder.inverse_transform([argmax(row)])\n",
"    print(inverted)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One Hot Encode with Keras"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 3 2 0 3 2 2 1 0 1]\n",
"[[0. 1. 0. 0.]\n",
" [0. 0. 0. 1.]\n",
" [0. 0. 1. 0.]\n",
" [1. 0. 0. 0.]\n",
" [0. 0. 0. 1.]\n",
" [0. 0. 1. 0.]\n",
" [0. 0. 1. 0.]\n",
" [0. 1. 0. 0.]\n",
" [1. 0. 0. 0.]\n",
" [0. 1. 0. 0.]]\n",
"1\n"
]
}
],
"source": [
"from numpy import array\n",
"from numpy import argmax\n",
"from keras.utils import to_categorical\n",
"\n",
"# Integer class labels to encode.\n",
"data = array([1, 3, 2, 0, 3, 2, 2, 1, 0, 1])\n",
"print(data)\n",
"\n",
"# One hot encode: each label becomes a row with a single 1 at the label's index.\n",
"encoded = to_categorical(data)\n",
"print(encoded)\n",
"\n",
"# Invert the encoding of the first example only: argmax returns the index of\n",
"# the 1 in that row, which is the original label.\n",
"first_row = encoded[0]\n",
"inverted = argmax(first_row)\n",
"print(inverted)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment